Changeset 54 for liacs/dbdm
- Timestamp:
- Dec 22, 2009, 7:44:50 AM (15 years ago)
- File:
-
- 1 copied
Legend:
- Unmodified
- Added
- Removed
-
liacs/dbdm/dbdm_4/parse-fasta.py
r53 r54 6 6 from MultiReplace import MultiReplace 7 7 8 fasta_translate = { 9 'r' : 'ga', # purine 10 'y' : 'tc', # pyrimide 11 'k' : 'gt', # keto 12 'm' : 'ac', # amino 13 's' : 'gc', # strong 14 'w' : 'at', # weak 15 'b' : 'gtc', 16 'd' : 'gat', 17 'h' : 'act', 18 'v' : 'gca', 19 } 20 21 fasta = MultiReplace(fasta_translate) 22 8 23 def parse_file(file): 9 handle = open( "data/AE005174v2-1.fas", "rU")24 handle = open(file, "rU") 10 25 for seq_record in SeqIO.parse(handle, "fasta",ambiguous_dna): 11 26 # How to translate damm thing into plain nucleic acid codes 12 27 # http://en.wikipedia.org/wiki/FASTA_format 13 stupid = seq_record.seq.__str__() 14 fasta_translate = { 15 'r' : 'ga', # purine 16 'y' : 'tc', # pyrimide 17 'k' : 'gt', # keto 18 'm' : 'ac', # amino 19 's' : 'gc', # strong 20 'w' : 'at', # weak 21 'b' : 'gtc', 22 'd' : 'gat', 23 'h' : 'act', 24 'v' : 'gca', 25 } 26 27 r = MultiReplace(fasta_translate) 28 stupid = r.replace(stupid) 28 retval = seq_record.seq.__str__() 29 29 30 pdict = {} 31 for n in range(1, len(stupid)): 32 protein = stupid[n] 33 if not pdict.has_key(protein): 34 pdict[protein] = 1 35 else: 36 pdict[protein] += 1 30 handle.close() 31 return(retval) 32 33 def stats(seq): 34 pdict = {} 35 for n in range(1, len(seq)): 36 protein = seq[n] 37 if not pdict.has_key(protein): 38 pdict[protein] = 1 39 else: 40 pdict[protein] += 1 37 41 38 print pdict 42 print pdict 43 44 45 def concat(head,tail,max_length=1000): 46 "Concat two strings together removing common parts" 47 l_head = len(head) 48 l_tail = len(tail) 49 50 # Start/Stop at the right time 51 start = 1 52 if (l_head < l_tail): 53 stop = l_head + 1 54 else: 55 stop = l_tail + 1 56 57 # Make sure not to run for-ever 58 if (stop > max_length): 59 stop = max_length 60 61 # Find largest common part 62 # XXX: Not very effient (on very large overlap sets 63 for i in reversed(range(start,stop)): 64 #print "tail[0:%i] '%s' == '%s'" % (i, head[-i:], tail[0:i]) 65 if head[-i:] == tail[0:i]: 66 return((i,(tail[0:i]),head + tail[i:])) 67 68 # None found return full part 69 return(-1,'',head + tail) 70 39 71 40 72 file1 = parse_file("data/AE005174v2-1.fas") 41 73 file2 = parse_file("data/AE005174v2-2.fas") 74 file1 = fasta.replace(file1) 75 file2 = fasta.replace(file2) 76 (retval, common, result) = concat(file2,file1) 77 print retval, common 78 stats(result) 79 80 out = open("full_contig.raw","w") 81 out.write(result) 82 out.close()
Note:
See TracChangeset
for help on using the changeset viewer.