1 | #!/usr/bin/env python
|
---|
2 | #
|
---|
3 | # Parse 2 FASTA files and print statistics
|
---|
4 | # BSDLicence
|
---|
5 | # Rick van der Zwet - 0433373 - <info@rickvaderzwet.nl>
|
---|
6 | from Bio import SeqIO,Seq
|
---|
7 | from Bio import Alphabet
|
---|
8 | from Bio.Alphabet.IUPAC import ambiguous_dna,unambiguous_dna
|
---|
9 | import Bio.Data.CodonTable
|
---|
10 | from MultiReplace import MultiReplace
|
---|
11 | from reading_frames import reading_frames
|
---|
12 |
|
---|
13 | # The mapping is kind of odd, as 'r' could mean either 'g' or 'a'
|
---|
14 | fasta_translate = {
|
---|
15 | 'r' : 'ga', # purine
|
---|
16 | 'y' : 'tc', # pyrimide
|
---|
17 | 'k' : 'gt', # keto
|
---|
18 | 'm' : 'ac', # amino
|
---|
19 | 's' : 'gc', # strong
|
---|
20 | 'w' : 'at', # weak
|
---|
21 | 'b' : 'gtc',
|
---|
22 | 'd' : 'gat',
|
---|
23 | 'h' : 'act',
|
---|
24 | 'v' : 'gca',
|
---|
25 | }
|
---|
26 |
|
---|
27 | fasta = MultiReplace(fasta_translate)
|
---|
28 |
|
---|
29 | def parse_file(file):
|
---|
30 | handle = open(file, "rU")
|
---|
31 | for seq_record in SeqIO.parse(handle, "fasta",ambiguous_dna):
|
---|
32 | # How to translate damm thing into plain nucleic acid codes
|
---|
33 | # http://en.wikipedia.org/wiki/FASTA_format
|
---|
34 | retval = seq_record.seq.__str__()
|
---|
35 |
|
---|
36 | handle.close()
|
---|
37 | return(retval)
|
---|
38 |
|
---|
39 | def stats(seq):
|
---|
40 | pdict = {}
|
---|
41 | for n in range(1, len(seq)):
|
---|
42 | protein = seq[n]
|
---|
43 | if not pdict.has_key(protein):
|
---|
44 | pdict[protein] = 1
|
---|
45 | else:
|
---|
46 | pdict[protein] += 1
|
---|
47 |
|
---|
48 | print pdict
|
---|
49 |
|
---|
50 |
|
---|
51 | def concat(head,tail,max_length=1000):
|
---|
52 | "Concat two strings together removing common parts"
|
---|
53 | l_head = len(head)
|
---|
54 | l_tail = len(tail)
|
---|
55 |
|
---|
56 | # Start/Stop at the right time
|
---|
57 | start = 1
|
---|
58 | if (l_head < l_tail):
|
---|
59 | stop = l_head + 1
|
---|
60 | else:
|
---|
61 | stop = l_tail + 1
|
---|
62 |
|
---|
63 | # Make sure not to run for-ever
|
---|
64 | if (stop > max_length):
|
---|
65 | stop = max_length
|
---|
66 |
|
---|
67 | # Find largest common part
|
---|
68 | # XXX: Not very effient (on very large overlap sets
|
---|
69 | for i in reversed(range(start,stop)):
|
---|
70 | #print "tail[0:%i] '%s' == '%s'" % (i, head[-i:], tail[0:i])
|
---|
71 | if head[-i:] == tail[0:i]:
|
---|
72 | return((i,(tail[0:i]),head + tail[i:]))
|
---|
73 |
|
---|
74 | # None found return full part
|
---|
75 | return(-1,'',head + tail)
|
---|
76 |
|
---|
77 | # File names
|
---|
78 | file1 = "data/AE005174v2-1.fas"
|
---|
79 | file2 = "data/AE005174v2-2.fas"
|
---|
80 |
|
---|
81 | # Get data
|
---|
82 | seq1 = parse_file(file1)
|
---|
83 | seq2 = parse_file(file2)
|
---|
84 |
|
---|
85 | # Wrong assumption, replace is not possible as the real value is not know yet
|
---|
86 | # seq1 = fasta.replace(seq1)
|
---|
87 | # seq2 = fasta.replace(seq2)
|
---|
88 |
|
---|
89 | # Find overlap
|
---|
90 | (retval, common, result) = concat(seq2,seq1)
|
---|
91 | print retval, common
|
---|
92 |
|
---|
93 | print file1
|
---|
94 | stats(seq1)
|
---|
95 | reading_frames(seq1)
|
---|
96 | print file2
|
---|
97 | stats(seq2)
|
---|
98 | reading_frames(seq2)
|
---|
99 |
|
---|
100 |
|
---|
101 | # Strictly speaking there is a gap of about 4 kbs (4000 bs) between seq1 and
|
---|
102 | # seq2, so lets' put that into the the statistics as well. Due to circular
|
---|
103 | # nature, does not matter wether we add it in the beginning or in the end
|
---|
104 | print "Total (inc 4 kbs gap (n))"
|
---|
105 | result = result + "n" * 4000;
|
---|
106 | stats(result)
|
---|
107 | reading_frames(result)
|
---|
108 |
|
---|
109 | # Write to file for later further processing
|
---|
110 | out = open("AE005174v2-1.raw","w")
|
---|
111 | out.write(seq1)
|
---|
112 | out.close()
|
---|
113 |
|
---|
114 | out = open("AE005174v2-2.raw","w")
|
---|
115 | out.write(seq2)
|
---|
116 | out.close()
|
---|
117 |
|
---|
118 | out = open("full_contig.raw","w")
|
---|
119 | out.write(result)
|
---|
120 | out.close()
|
---|