Context Navigation

ecoli_hmm.py@ 260

Last change on this file since 260 was 66, checked in by Rick van der Zwet, 15 years ago
Annotations and readablity fixes
Property svn:executable set to ``*
File size: 4.7 KB

Line
1	#!/usr/bin/env python
2
3	# http://en.wikipedia.org/wiki/Stop_codon
4	# http://en.wikipedia.org/wiki/Escherichia_coli
5	# http://en.wikipedia.org/wiki/Open_reading_frame
6	# http://nl.wikipedia.org/wiki/Genetische_code
7
8	import sys
9	import csv
10	import string
11
12	# http://ghmm.sourceforge.net/
13	import ghmm
14
15	from MultiReplace import MultiReplace
16
17	# The mapping is kind of odd, as 'r' could mean either 'g' or 'a', without any clear distintion
18	fasta_translate = {
19	'r' : 'ga', # purine
20	'y' : 'tc', # pyrimide
21	'k' : 'gt', # keto
22	'm' : 'ac', # amino
23	's' : 'gc', # strong
24	'w' : 'at', # weak
25	'b' : 'gtc',
26	'd' : 'gat',
27	'h' : 'act',
28	'v' : 'gca',
29	}
30
31	# Transform the state to something human readable
32	dna_ascii_translate = {
33	'0' : '.',
34	'1' : '<',
35	'2' : '<',
36	'3' : '<',
37	'4' : '\|',
38	'5' : '>',
39	'6' : '>',
40	'7' : '>',
41	}
42
43	dna_ascii = MultiReplace(dna_ascii_translate)
44
45	def pretty_print(test_seq, ans_seq, v, length=100, parts=10, seperator=''):
46	""" Pretty printing of output for verification purposes """
47	for i in range(0,len(v[0]),length):
48	seq = []
49	ans = []
50	result = []
51	for j in range(0,length,parts):
52	t = i + j
53	seq.append(test_seq[t:t+parts])
54	ans.append(ans_seq[t:t+parts])
55	result.append(dna_ascii.replace(''.join(map(str,v[0][t:t+parts]))))
56
57	print "ORG: " + seperator.join(seq)
58	print "CSV: " + seperator.join(ans)
59	print "HMM: " + seperator.join(result)
60	print ''
61	print "fairness of test_seq: ", v[1]
62
63
64	def ecoli_hmm():
65	"""Try to find genes inside e sequence using a HMM"""
66	# Model 4 bases A C G T and unknown state N
67	sigma = ghmm.Alphabet(['a', 'c', 'g', 't', 'n' ])
68	print sigma
69
70	# XXX: Proper values, based of statistics
71	# The transition matrix A is chosen such that it reflects the statistics
72	# Probalities from moving from one state to an other
73	# 0) Outer-gene : will try to get us into a gene
74	# 1) Start-codon : beginning of gene - part 1
75	# 2) Start-codon : beginning of gene - part 2
76	# 3) Start-codon : beginning of gene - part 3
77	# 4) Inside-gene : in the gene
78	# 5) Stop-codon : end of gene - part 1
79	# 6) Stop-codon : end of gene - part 2
80	# 7) Stop-codon : end of gene - part 3
81	A = [
82	[0.8, 0.2, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], # 0
83	[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0], # 1
84	[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0], # 2
85	[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0], # 3
86	[0.0, 0.0, 0.0, 0.0, 0.9, 0.1, 0.0, 0.0], # 4
87	[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0], # 5
88	[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0], # 6
89	[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], # 7
90	]
91
92	# XXX: Proper values, based of statistics
93	# The emission probabilities matrix is modeled after the statistics
94	# (['a', 'c', 'g', 't', 'n' ]
95	B = [
96	# e.g. state 0 -> emission probability
97	[0.2, 0.2, 0.2, 0.2, 0.2] , # 0
98	[0.9, 0.0, 0.1, 0.0, 0.0] , # 1
99	[0.0, 0.0, 0.0, 1.0, 0.0] , # 2
100	[0.0, 0.0, 1.0, 0.0, 0.0] , # 3
101	[0.2, 0.2, 0.2, 0.2, 0.2] , # 4
102	[0.0, 0.0, 0.0, 1.0, 0.0] , # 5
103	[0.7, 0.0, 0.3, 0.0, 0.0] , # 6
104	[0.7, 0.0, 0.3, 0.0, 0.0] , # 7
105	]
106
107	# Initial distribution favors outside
108	pi = [0.9] + [0.1/7] * 7
109
110	m = ghmm.HMMFromMatrices(sigma,ghmm.DiscreteDistribution(sigma),A ,B, pi)
111	print "Initial HMM"
112	print m.verboseStr()
113
114	obs_seq = m.sampleSingle(20)
115	print "Observation sequence : ", obs_seq
116	obs = map(sigma.external, obs_seq)
117	print "Observations : ", ''.join(obs)
118
119	answer = {}
120	handle = open('AE005174v2-2-gene.raw', 'rU')
121	answer['AE005174v2-2'] = handle.read()
122	handle.close()
123
124	contig_seq = {}
125	handle = open('AE005174v2-2.raw', 'rU')
126	contig_seq['AE005174v2-2'] = handle.read()
127	handle.close()
128
129	handle = open('AE005174v2-1.raw', 'rU')
130	contig_seq['AE005174v2-1'] = handle.read()
131	handle.close()
132
133	test_size = 1000;
134	test_seq = contig_seq['AE005174v2-2'][0:test_size]
135	ans_seq = answer['AE005174v2-2'][0:test_size]
136	test_eseq=ghmm.EmissionSequence(sigma,list(test_seq))
137	v = m.viterbi(test_eseq)
138	pretty_print(test_seq, ans_seq, v)
139
140
141	# Train sequence
142	print "Training baumWelch"
143	train_seq = ghmm.EmissionSequence(sigma,list(contig_seq['AE005174v2-1'][0:10000]))
144	v = m.baumWelch(train_seq)
145	print m.verboseStr()
146
147
148	print "Results after training sequence..."
149	v = m.viterbi(test_eseq)
150	pretty_print(test_seq, ans_seq, v)
151
152
153	# XXX: Results
154
155	if __name__ == "__main__":
156	ecoli_hmm()

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: liacs/dbdm/dbdm_4/ecoli_hmm.py@ 260

Download in other formats: