source: liacs/dbdm/dbdm_4/ecoli_hmm.py@ 200

Last change on this file since 200 was 66, checked in by Rick van der Zwet, 15 years ago

Annotations and readablity fixes

  • Property svn:executable set to *
File size: 4.7 KB
RevLine 
[60]1#!/usr/bin/env python
2
3# http://en.wikipedia.org/wiki/Stop_codon
4# http://en.wikipedia.org/wiki/Escherichia_coli
5# http://en.wikipedia.org/wiki/Open_reading_frame
[61]6# http://nl.wikipedia.org/wiki/Genetische_code
[60]7
8import sys
[61]9import csv
10import string
[60]11
[63]12# http://ghmm.sourceforge.net/
13import ghmm
14
[64]15from MultiReplace import MultiReplace
16
[63]17# The mapping is kind of odd, as 'r' could mean either 'g' or 'a', without any clear distintion
18fasta_translate = {
19 'r' : 'ga', # purine
20 'y' : 'tc', # pyrimide
21 'k' : 'gt', # keto
22 'm' : 'ac', # amino
23 's' : 'gc', # strong
24 'w' : 'at', # weak
25 'b' : 'gtc',
26 'd' : 'gat',
27 'h' : 'act',
28 'v' : 'gca',
29 }
30
[66]31# Transform the state to something human readable
[64]32dna_ascii_translate = {
[66]33 '0' : '.',
[64]34 '1' : '<',
35 '2' : '<',
36 '3' : '<',
[66]37 '4' : '|',
[64]38 '5' : '>',
39 '6' : '>',
40 '7' : '>',
41 }
[63]42
[64]43dna_ascii = MultiReplace(dna_ascii_translate)
[60]44
[66]45def pretty_print(test_seq, ans_seq, v, length=100, parts=10, seperator=''):
[64]46 """ Pretty printing of output for verification purposes """
47 for i in range(0,len(v[0]),length):
48 seq = []
49 ans = []
50 result = []
51 for j in range(0,length,parts):
52 t = i + j
53 seq.append(test_seq[t:t+parts])
54 ans.append(ans_seq[t:t+parts])
55 result.append(dna_ascii.replace(''.join(map(str,v[0][t:t+parts]))))
[62]56
[66]57 print "ORG: " + seperator.join(seq)
58 print "CSV: " + seperator.join(ans)
59 print "HMM: " + seperator.join(result)
[64]60 print ''
61 print "fairness of test_seq: ", v[1]
[62]62
[64]63
[63]64def ecoli_hmm():
[60]65 """Try to find genes inside e sequence using a HMM"""
66 # Model 4 bases A C G T and unknown state N
67 sigma = ghmm.Alphabet(['a', 'c', 'g', 't', 'n' ])
68 print sigma
69
70 # XXX: Proper values, based of statistics
71 # The transition matrix A is chosen such that it reflects the statistics
[64]72 # Probalities from moving from one state to an other
73 # 0) Outer-gene : will try to get us into a gene
74 # 1) Start-codon : beginning of gene - part 1
75 # 2) Start-codon : beginning of gene - part 2
76 # 3) Start-codon : beginning of gene - part 3
77 # 4) Inside-gene : in the gene
78 # 5) Stop-codon : end of gene - part 1
79 # 6) Stop-codon : end of gene - part 2
80 # 7) Stop-codon : end of gene - part 3
81 A = [
[66]82 [0.8, 0.2, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], # 0
83 [0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0], # 1
84 [0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0], # 2
85 [0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0], # 3
86 [0.0, 0.0, 0.0, 0.0, 0.9, 0.1, 0.0, 0.0], # 4
87 [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0], # 5
88 [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0], # 6
89 [1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], # 7
[64]90 ]
[60]91
[64]92 # XXX: Proper values, based of statistics
93 # The emission probabilities matrix is modeled after the statistics
94 # (['a', 'c', 'g', 't', 'n' ]
95 B = [
96 # e.g. state 0 -> emission probability
[66]97 [0.2, 0.2, 0.2, 0.2, 0.2] , # 0
98 [0.9, 0.0, 0.1, 0.0, 0.0] , # 1
99 [0.0, 0.0, 0.0, 1.0, 0.0] , # 2
100 [0.0, 0.0, 1.0, 0.0, 0.0] , # 3
101 [0.2, 0.2, 0.2, 0.2, 0.2] , # 4
102 [0.0, 0.0, 0.0, 1.0, 0.0] , # 5
103 [0.7, 0.0, 0.3, 0.0, 0.0] , # 6
104 [0.7, 0.0, 0.3, 0.0, 0.0] , # 7
[64]105 ]
[60]106
107 # Initial distribution favors outside
[64]108 pi = [0.9] + [0.1/7] * 7
[60]109
110 m = ghmm.HMMFromMatrices(sigma,ghmm.DiscreteDistribution(sigma),A ,B, pi)
[64]111 print "Initial HMM"
[66]112 print m.verboseStr()
[60]113
114 obs_seq = m.sampleSingle(20)
115 print "Observation sequence : ", obs_seq
116 obs = map(sigma.external, obs_seq)
117 print "Observations : ", ''.join(obs)
118
[64]119 answer = {}
120 handle = open('AE005174v2-2-gene.raw', 'rU')
121 answer['AE005174v2-2'] = handle.read()
122 handle.close()
[61]123
[64]124 contig_seq = {}
125 handle = open('AE005174v2-2.raw', 'rU')
126 contig_seq['AE005174v2-2'] = handle.read()
127 handle.close()
[62]128
[64]129 handle = open('AE005174v2-1.raw', 'rU')
130 contig_seq['AE005174v2-1'] = handle.read()
131 handle.close()
[62]132
[66]133 test_size = 1000;
134 test_seq = contig_seq['AE005174v2-2'][0:test_size]
135 ans_seq = answer['AE005174v2-2'][0:test_size]
[64]136 test_eseq=ghmm.EmissionSequence(sigma,list(test_seq))
137 v = m.viterbi(test_eseq)
138 pretty_print(test_seq, ans_seq, v)
[62]139
[61]140
[64]141 # Train sequence
142 print "Training baumWelch"
[66]143 train_seq = ghmm.EmissionSequence(sigma,list(contig_seq['AE005174v2-1'][0:10000]))
[64]144 v = m.baumWelch(train_seq)
[66]145 print m.verboseStr()
[62]146
[61]147
[64]148 print "Results after training sequence..."
149 v = m.viterbi(test_eseq)
150 pretty_print(test_seq, ans_seq, v)
[60]151
152
153 # XXX: Results
154
155if __name__ == "__main__":
[63]156 ecoli_hmm()
Note: See TracBrowser for help on using the repository browser.