source: liacs/dbdm/dbdm_4/ecoli_hmm.py@ 64

Last change on this file since 64 was 64, checked in by Rick van der Zwet, 15 years ago

Sample GHMM, needs statistics and verifing still

  • Property svn:executable set to *
File size: 4.5 KB
Line 
1#!/usr/bin/env python
2
3# http://en.wikipedia.org/wiki/Stop_codon
4# http://en.wikipedia.org/wiki/Escherichia_coli
5# http://en.wikipedia.org/wiki/Open_reading_frame
6# http://nl.wikipedia.org/wiki/Genetische_code
7
8import sys
9import csv
10import string
11
12# http://ghmm.sourceforge.net/
13import ghmm
14
15from MultiReplace import MultiReplace
16
17# The mapping is kind of odd, as 'r' could mean either 'g' or 'a', without any clear distintion
18fasta_translate = {
19 'r' : 'ga', # purine
20 'y' : 'tc', # pyrimide
21 'k' : 'gt', # keto
22 'm' : 'ac', # amino
23 's' : 'gc', # strong
24 'w' : 'at', # weak
25 'b' : 'gtc',
26 'd' : 'gat',
27 'h' : 'act',
28 'v' : 'gca',
29 }
30
31dna_ascii_translate = {
32 '0' : '*',
33 '1' : '<',
34 '2' : '<',
35 '3' : '<',
36 '4' : '-',
37 '5' : '>',
38 '6' : '>',
39 '7' : '>',
40 }
41
42dna_ascii = MultiReplace(dna_ascii_translate)
43
44def pretty_print(test_seq, ans_seq, v, length=70, parts=10, seperator=''):
45 """ Pretty printing of output for verification purposes """
46 for i in range(0,len(v[0]),length):
47 seq = []
48 ans = []
49 result = []
50 for j in range(0,length,parts):
51 t = i + j
52 seq.append(test_seq[t:t+parts])
53 ans.append(ans_seq[t:t+parts])
54 result.append(dna_ascii.replace(''.join(map(str,v[0][t:t+parts]))))
55
56 print seperator.join(seq)
57 print seperator.join(ans)
58 print seperator.join(result)
59 print ''
60 print "fairness of test_seq: ", v[1]
61
62
63def ecoli_hmm():
64 """Try to find genes inside e sequence using a HMM"""
65 # Model 4 bases A C G T and unknown state N
66 sigma = ghmm.Alphabet(['a', 'c', 'g', 't', 'n' ])
67 print sigma
68
69 # XXX: Proper values, based of statistics
70 # The transition matrix A is chosen such that it reflects the statistics
71 # Probalities from moving from one state to an other
72 # 0) Outer-gene : will try to get us into a gene
73 # 1) Start-codon : beginning of gene - part 1
74 # 2) Start-codon : beginning of gene - part 2
75 # 3) Start-codon : beginning of gene - part 3
76 # 4) Inside-gene : in the gene
77 # 5) Stop-codon : end of gene - part 1
78 # 6) Stop-codon : end of gene - part 2
79 # 7) Stop-codon : end of gene - part 3
80 A = [
81 [0.8, 0.2, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
82 [0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0],
83 [0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0],
84 [0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0],
85 [0.0, 0.0, 0.0, 0.0, 0.7, 0.3, 0.0, 0.0],
86 [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0],
87 [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0],
88 [1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
89 ]
90
91 # XXX: Proper values, based of statistics
92 # The emission probabilities matrix is modeled after the statistics
93 # (['a', 'c', 'g', 't', 'n' ]
94 B = [
95 # e.g. state 0 -> emission probability
96 [0.2, 0.2, 0.2, 0.2, 0.2] ,
97 [0.9, 0.0, 0.1, 0.0, 0.0] ,
98 [0.0, 0.0, 0.0, 1.0, 0.0] ,
99 [0.0, 0.0, 1.0, 0.0, 0.0] ,
100 [0.2, 0.2, 0.2, 0.2, 0.2] ,
101 [0.0, 0.0, 0.0, 1.0, 0.0] ,
102 [0.7, 0.0, 0.3, 0.0, 0.0] ,
103 [0.7, 0.0, 0.3, 0.0, 0.0] ,
104 ]
105
106 # Initial distribution favors outside
107 pi = [0.9] + [0.1/7] * 7
108
109 m = ghmm.HMMFromMatrices(sigma,ghmm.DiscreteDistribution(sigma),A ,B, pi)
110 print "Initial HMM"
111 print m
112
113 obs_seq = m.sampleSingle(20)
114 print "Observation sequence : ", obs_seq
115 obs = map(sigma.external, obs_seq)
116 print "Observations : ", ''.join(obs)
117
118 answer = {}
119 handle = open('AE005174v2-2-gene.raw', 'rU')
120 answer['AE005174v2-2'] = handle.read()
121 handle.close()
122
123 contig_seq = {}
124 handle = open('AE005174v2-2.raw', 'rU')
125 contig_seq['AE005174v2-2'] = handle.read()
126 handle.close()
127
128 handle = open('AE005174v2-1.raw', 'rU')
129 contig_seq['AE005174v2-1'] = handle.read()
130 handle.close()
131
132 test_seq = contig_seq['AE005174v2-2'][0:490]
133 ans_seq = answer['AE005174v2-2'][0:490]
134 test_eseq=ghmm.EmissionSequence(sigma,list(test_seq))
135 v = m.viterbi(test_eseq)
136 pretty_print(test_seq, ans_seq, v)
137
138
139 # Train sequence
140 print "Training baumWelch"
141 train_seq = ghmm.EmissionSequence(sigma,list(contig_seq['AE005174v2-1']))
142 v = m.baumWelch(train_seq)
143 print m
144
145
146 print "Results after training sequence..."
147 v = m.viterbi(test_eseq)
148 pretty_print(test_seq, ans_seq, v)
149
150
151 # XXX: Results
152
153if __name__ == "__main__":
154 ecoli_hmm()
Note: See TracBrowser for help on using the repository browser.