- Timestamp: Dec 22, 2009, 9:29:24 AM (15 years ago)
- Location: liacs/dbdm/dbdm_4
- Files: 1 copied, 1 moved
Legend:
- Unmodified (no prefix)
- Added (prefixed with "+")
- Removed (prefixed with "-")
liacs/dbdm/dbdm_4/parse_fasta.py
--- parse_fasta.py (r58)
+++ parse_fasta.py (r59)
 import Bio.Data.CodonTable
 from MultiReplace import MultiReplace
+from reading_frames import reading_frames
 
 fasta_translate = {
...
 print file1
 stats(seq1)
-print file1
-stats(seq1)
+reading_frames(seq1)
+print file2
+stats(seq2)
+reading_frames(seq2)
+
 
 # Strictly speaking there is a gap of about 4 kbs (4000 bs) between seq1 and
...
 result = result + "n" * 4000;
 stats(result)
+reading_frames(result)
 
 # Write to file for later further processing
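For context, the effect of this change on the driver script is to report reading-frame statistics next to the existing per-nucleotide statistics, for each input sequence and for the gap-padded merge. The snippet below is a minimal stand-alone Python 3 sketch of that call order only; the two helpers and the toy sequences are simplified stand-ins for illustration, not the committed parse_fasta.py code.

#!/usr/bin/env python3
# Sketch of the call order r59 sets up: stats() and reading_frames() per
# sequence, then again on the merged contig padded with the ~4 kb gap.
# Both helpers are simplified stand-ins, not the committed implementations.
from collections import Counter

def stats(seq):
    """Per-nucleotide counts (the role stats() plays in parse_fasta.py)."""
    print(dict(Counter(seq)))

def reading_frames(seq):
    """Stand-in: codon counts in frame 0 only; reading_frames.py covers all six frames."""
    print(dict(Counter(seq[n:n + 3] for n in range(0, len(seq) - 2, 3))))

seq1 = "atgaaacccgggttttag"           # toy contig 1
seq2 = "atgccc" + "acgt" * 3 + "taa"  # toy contig 2

for name, seq in (("seq1", seq1), ("seq2", seq2)):
    print(name)
    stats(seq)
    reading_frames(seq)

# The genome is circular, so the ~4 kb gap between the contigs can be padded
# with 'n' at either end before computing the merged statistics.
merged = seq1 + seq2 + "n" * 4000
stats(merged)
reading_frames(merged)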
liacs/dbdm/dbdm_4/reading_frames.py
--- parse_fasta.py (r56, copy source)
+++ reading_frames.py (r59)
 #!/usr/bin/env python
 #
-# Parse 2 FASTA files and print statistics
+# Parse FASTA file and print statistics on reading frames
 # BSDLicence
 # Rick van der Zwet - 0433373 - <info@rickvaderzwet.nl>
-from Bio import SeqIO,Seq
-from Bio import Alphabet
-from Bio.Alphabet.IUPAC import ambiguous_dna,unambiguous_dna
-import Bio.Data.CodonTable
-from MultiReplace import MultiReplace
+import sys
 
-fasta_translate = {
-    'r' : 'ga', # purine
-    'y' : 'tc', # pyrimide
-    'k' : 'gt', # keto
-    'm' : 'ac', # amino
-    's' : 'gc', # strong
-    'w' : 'at', # weak
-    'b' : 'gtc',
-    'd' : 'gat',
-    'h' : 'act',
-    'v' : 'gca',
-}
-
-fasta = MultiReplace(fasta_translate)
+def _frame_stat(seq, start=0):
+    pdict = {}
+    for n in range(start,len(seq),2):
+        codon = seq[n:n+3]
+        if len(codon) < 3:
+            continue
+        if not pdict.has_key(codon):
+            pdict[codon] = 1
+        else:
+            pdict[codon] += 1
 
-def parse_file(file):
-    handle = open(file, "rU")
-    for seq_record in SeqIO.parse(handle, "fasta",ambiguous_dna):
-        # How to translate damm thing into plain nucleic acid codes
-        # http://en.wikipedia.org/wiki/FASTA_format
-        retval = seq_record.seq.__str__()
-
-    handle.close()
-    return(retval)
+    return(pdict)
 
-def stats(seq):
-    pdict = {}
-    for n in range(1, len(seq)):
-        protein = seq[n]
-        if not pdict.has_key(protein):
-            pdict[protein] = 1
-        else:
-            pdict[protein] += 1
-
-    print pdict
+def reading_frames(seq):
+    '''Parse from left to right and right to left, at position 1,2,3 in
+    in the so called nucleotide triplets
+    See: http://en.wikipedia.org/wiki/Genetic_code'''
+
+    # Populate all empty
+    final = {}
+    for a in 'acgtn':
+        for b in 'acgtn':
+            for c in 'acgtn':
+                final[a + b + c] = [0,0,0,0,0,0]
+
+    for start in [0,1,2]:
+        print "Normal; start %i" % (start)
+        retval = _frame_stat(seq,start)
+        for codon,v in retval.iteritems():
+            final[codon][start] += v
+        print "Reverse; start %i" % (start)
+        retval = _frame_stat(seq[::-1],start)
+        for codon,v in retval.iteritems():
+            final[codon][start+3] += v
+
+    print "CODON : N:0 , N:1 , N:2 , R:0 , R:1 , R:2 "
+
+    for codon in sorted(final.keys()):
+        print codon," : ", ",".join(["%6i" % x for x in final[codon]])
 
 
-def concat(head,tail,max_length=1000):
-    "Concat two strings together removing common parts"
-    l_head = len(head)
-    l_tail = len(tail)
 
-    # Start/Stop at the right time
-    start = 1
-    if (l_head < l_tail):
-        stop = l_head + 1
-    else:
-        stop = l_tail + 1
-
-    # Make sure not to run for-ever
-    if (stop > max_length):
-        stop = max_length
-
-    # Find largest common part
-    # XXX: Not very effient (on very large overlap sets
-    for i in reversed(range(start,stop)):
-        #print "tail[0:%i] '%s' == '%s'" % (i, head[-i:], tail[0:i])
-        if head[-i:] == tail[0:i]:
-            return((i,(tail[0:i]),head + tail[i:]))
-
-    # None found return full part
-    return(-1,'',head + tail)
-
-
-# Get data
-file1 = parse_file("data/AE005174v2-1.fas")
-file2 = parse_file("data/AE005174v2-2.fas")
-
-file1 = fasta.replace(file1)
-file2 = fasta.replace(file2)
-
-# Find overlap
-(retval, common, result) = concat(file2,file1)
-print retval, common
-
-# Strictly speaking there is a gap of about 4 kbs (4000 bs) between file1 and
-# file2, so lets' put that into the the statistics as well. Due to circular
-# nature, does not matter wether we add it in the beginning or in the end
-result = result + "n" * 4000;
-stats(result)
-
-# Write to file for later further processing
-out = open("full_contig.raw","w")
-out.write(result)
-out.close()
+if __name__ == "__main__":
+    # Load data
+    try:
+        handle = open(sys.argv[1],"rU")
+    except IndexError:
+        print "Usage %s <data_file>" % (sys.argv[0])
+        sys.exit(64)
+    except IOError:
+        print "Unable to open '%s'" % (sys.argv[1])
+        sys.exit(64)
+
+    seq = handle.read()
+    handle.close()
+    reading_frames(seq)
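The heart of the new module is the six-frame codon count: three forward frames plus, because the committed code reverses the string rather than taking a reverse complement, three frames over the simply reversed sequence. The Python 3 sketch below is an illustration of that idea with the standard library only, not the committed reading_frames.py. Note two deliberate differences: it advances by 3 per codon, whereas the committed _frame_stat steps its index by 2 while slicing 3-character codons, and it reports only codons that actually occur instead of pre-populating every a/c/g/t/n triplet. The script name in the usage comment is hypothetical.

#!/usr/bin/env python3
# Sketch (not the committed reading_frames.py): count codon frequencies in
# three forward frames and three frames of the reversed string, then print
# one row per codon.  Usage (hypothetical file name):
#   python3 reading_frames_sketch.py full_contig.raw
from collections import Counter
import sys

def frame_counts(seq, start):
    """Non-overlapping triplet counts starting at offset `start` (step 3, not 2)."""
    return Counter(seq[n:n + 3] for n in range(start, len(seq) - 2, 3))

def reading_frames(seq):
    rows = {}  # codon -> [N:0, N:1, N:2, R:0, R:1, R:2]
    for start in (0, 1, 2):
        for offset, s in ((0, seq), (3, seq[::-1])):  # forward, then reversed
            for codon, count in frame_counts(s, start).items():
                rows.setdefault(codon, [0] * 6)[offset + start] += count
    print("CODON :  N:0 ,  N:1 ,  N:2 ,  R:0 ,  R:1 ,  R:2")
    for codon in sorted(rows):
        print(codon, ":", ",".join("%6i" % x for x in rows[codon]))

if __name__ == "__main__":
    # Fall back to a toy sequence when no data file is given.
    seq = open(sys.argv[1]).read().strip().lower() if len(sys.argv) > 1 else "atgaaacccgggtttag"
    reading_frames(seq)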