1 | #!/usr/bin/env python
|
---|
2 |
|
---|
3 | # http://en.wikipedia.org/wiki/Stop_codon
|
---|
4 | # http://en.wikipedia.org/wiki/Escherichia_coli
|
---|
5 | # http://en.wikipedia.org/wiki/Open_reading_frame
|
---|
6 | # http://nl.wikipedia.org/wiki/Genetische_code
|
---|
7 |
|
---|
8 | import sys
|
---|
9 | import csv
|
---|
10 | import string
|
---|
11 |
|
---|
12 | # http://ghmm.sourceforge.net/
|
---|
13 | import ghmm
|
---|
14 |
|
---|
15 | from MultiReplace import MultiReplace
|
---|
16 |
|
---|
17 | # The mapping is kind of odd, as 'r' could mean either 'g' or 'a', without any clear distintion
|
---|
18 | fasta_translate = {
|
---|
19 | 'r' : 'ga', # purine
|
---|
20 | 'y' : 'tc', # pyrimide
|
---|
21 | 'k' : 'gt', # keto
|
---|
22 | 'm' : 'ac', # amino
|
---|
23 | 's' : 'gc', # strong
|
---|
24 | 'w' : 'at', # weak
|
---|
25 | 'b' : 'gtc',
|
---|
26 | 'd' : 'gat',
|
---|
27 | 'h' : 'act',
|
---|
28 | 'v' : 'gca',
|
---|
29 | }
|
---|
30 |
|
---|
31 | dna_ascii_translate = {
|
---|
32 | '0' : '*',
|
---|
33 | '1' : '<',
|
---|
34 | '2' : '<',
|
---|
35 | '3' : '<',
|
---|
36 | '4' : '-',
|
---|
37 | '5' : '>',
|
---|
38 | '6' : '>',
|
---|
39 | '7' : '>',
|
---|
40 | }
|
---|
41 |
|
---|
42 | dna_ascii = MultiReplace(dna_ascii_translate)
|
---|
43 |
|
---|
44 | def pretty_print(test_seq, ans_seq, v, length=70, parts=10, seperator=''):
|
---|
45 | """ Pretty printing of output for verification purposes """
|
---|
46 | for i in range(0,len(v[0]),length):
|
---|
47 | seq = []
|
---|
48 | ans = []
|
---|
49 | result = []
|
---|
50 | for j in range(0,length,parts):
|
---|
51 | t = i + j
|
---|
52 | seq.append(test_seq[t:t+parts])
|
---|
53 | ans.append(ans_seq[t:t+parts])
|
---|
54 | result.append(dna_ascii.replace(''.join(map(str,v[0][t:t+parts]))))
|
---|
55 |
|
---|
56 | print seperator.join(seq)
|
---|
57 | print seperator.join(ans)
|
---|
58 | print seperator.join(result)
|
---|
59 | print ''
|
---|
60 | print "fairness of test_seq: ", v[1]
|
---|
61 |
|
---|
62 |
|
---|
63 | def ecoli_hmm():
|
---|
64 | """Try to find genes inside e sequence using a HMM"""
|
---|
65 | # Model 4 bases A C G T and unknown state N
|
---|
66 | sigma = ghmm.Alphabet(['a', 'c', 'g', 't', 'n' ])
|
---|
67 | print sigma
|
---|
68 |
|
---|
69 | # XXX: Proper values, based of statistics
|
---|
70 | # The transition matrix A is chosen such that it reflects the statistics
|
---|
71 | # Probalities from moving from one state to an other
|
---|
72 | # 0) Outer-gene : will try to get us into a gene
|
---|
73 | # 1) Start-codon : beginning of gene - part 1
|
---|
74 | # 2) Start-codon : beginning of gene - part 2
|
---|
75 | # 3) Start-codon : beginning of gene - part 3
|
---|
76 | # 4) Inside-gene : in the gene
|
---|
77 | # 5) Stop-codon : end of gene - part 1
|
---|
78 | # 6) Stop-codon : end of gene - part 2
|
---|
79 | # 7) Stop-codon : end of gene - part 3
|
---|
80 | A = [
|
---|
81 | [0.8, 0.2, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
|
---|
82 | [0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0],
|
---|
83 | [0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0],
|
---|
84 | [0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0],
|
---|
85 | [0.0, 0.0, 0.0, 0.0, 0.7, 0.3, 0.0, 0.0],
|
---|
86 | [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0],
|
---|
87 | [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0],
|
---|
88 | [1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
|
---|
89 | ]
|
---|
90 |
|
---|
91 | # XXX: Proper values, based of statistics
|
---|
92 | # The emission probabilities matrix is modeled after the statistics
|
---|
93 | # (['a', 'c', 'g', 't', 'n' ]
|
---|
94 | B = [
|
---|
95 | # e.g. state 0 -> emission probability
|
---|
96 | [0.2, 0.2, 0.2, 0.2, 0.2] ,
|
---|
97 | [0.9, 0.0, 0.1, 0.0, 0.0] ,
|
---|
98 | [0.0, 0.0, 0.0, 1.0, 0.0] ,
|
---|
99 | [0.0, 0.0, 1.0, 0.0, 0.0] ,
|
---|
100 | [0.2, 0.2, 0.2, 0.2, 0.2] ,
|
---|
101 | [0.0, 0.0, 0.0, 1.0, 0.0] ,
|
---|
102 | [0.7, 0.0, 0.3, 0.0, 0.0] ,
|
---|
103 | [0.7, 0.0, 0.3, 0.0, 0.0] ,
|
---|
104 | ]
|
---|
105 |
|
---|
106 | # Initial distribution favors outside
|
---|
107 | pi = [0.9] + [0.1/7] * 7
|
---|
108 |
|
---|
109 | m = ghmm.HMMFromMatrices(sigma,ghmm.DiscreteDistribution(sigma),A ,B, pi)
|
---|
110 | print "Initial HMM"
|
---|
111 | print m
|
---|
112 |
|
---|
113 | obs_seq = m.sampleSingle(20)
|
---|
114 | print "Observation sequence : ", obs_seq
|
---|
115 | obs = map(sigma.external, obs_seq)
|
---|
116 | print "Observations : ", ''.join(obs)
|
---|
117 |
|
---|
118 | answer = {}
|
---|
119 | handle = open('AE005174v2-2-gene.raw', 'rU')
|
---|
120 | answer['AE005174v2-2'] = handle.read()
|
---|
121 | handle.close()
|
---|
122 |
|
---|
123 | contig_seq = {}
|
---|
124 | handle = open('AE005174v2-2.raw', 'rU')
|
---|
125 | contig_seq['AE005174v2-2'] = handle.read()
|
---|
126 | handle.close()
|
---|
127 |
|
---|
128 | handle = open('AE005174v2-1.raw', 'rU')
|
---|
129 | contig_seq['AE005174v2-1'] = handle.read()
|
---|
130 | handle.close()
|
---|
131 |
|
---|
132 | test_seq = contig_seq['AE005174v2-2'][0:490]
|
---|
133 | ans_seq = answer['AE005174v2-2'][0:490]
|
---|
134 | test_eseq=ghmm.EmissionSequence(sigma,list(test_seq))
|
---|
135 | v = m.viterbi(test_eseq)
|
---|
136 | pretty_print(test_seq, ans_seq, v)
|
---|
137 |
|
---|
138 |
|
---|
139 | # Train sequence
|
---|
140 | print "Training baumWelch"
|
---|
141 | train_seq = ghmm.EmissionSequence(sigma,list(contig_seq['AE005174v2-1']))
|
---|
142 | v = m.baumWelch(train_seq)
|
---|
143 | print m
|
---|
144 |
|
---|
145 |
|
---|
146 | print "Results after training sequence..."
|
---|
147 | v = m.viterbi(test_eseq)
|
---|
148 | pretty_print(test_seq, ans_seq, v)
|
---|
149 |
|
---|
150 |
|
---|
151 | # XXX: Results
|
---|
152 |
|
---|
153 | if __name__ == "__main__":
|
---|
154 | ecoli_hmm()
|
---|