Context Navigation

source: liacs/dbdm/dbdm_4/ecoli_hmm.py@ 62

Last change on this file since 62 was 62, checked in by Rick van der Zwet, 15 years ago
First results on shifting tests
Property svn:executable set to ``*
File size: 6.1 KB

Line
1	#!/usr/bin/env python
2
3	# http://en.wikipedia.org/wiki/Stop_codon
4	# http://en.wikipedia.org/wiki/Escherichia_coli
5	# http://en.wikipedia.org/wiki/Open_reading_frame
6	# http://nl.wikipedia.org/wiki/Genetische_code
7
8	import ghmm
9	import sys
10	import csv
11	import string
12
13	dna_flip = string.maketrans('atgc','tacg')
14	def get_codon(seq, position, order):
15	codon = seq[position:position+3]
16	if not order:
17	# When living on the other side of the string make sure to flip
18	# around before comparison
19	codon = codon[::-1].translate(dna_flip)
20	return(codon)
21
22
23
24	def ecoli_hmm(seq):
25	"""Try to find genes inside e sequence using a HMM"""
26	# Model 4 bases A C G T and unknown state N
27	sigma = ghmm.Alphabet(['a', 'c', 'g', 't', 'n' ])
28	print sigma
29
30	# XXX: Proper values, based of statistics
31	# The transition matrix A is chosen such that it reflects the statistics
32	# Part one will try to get us into a gene, part two will tell us when to
33	# get out of it
34	A = [[0.6,0.4], [0.3, 0.7]]
35
36	# The emission probabilities matrix is modeled after the statistics with a
37	# total of 1
38	etogene = [ 0.1, 0.1, 0.1, 0.1, 0.6 ]
39	eingene = [ 0.2, 0.3, 0.3, 0.2, 0.1 ]
40
41	B = [etogene,eingene]
42
43	# Initial distribution favors outside
44	pi = [0.9, 0.1]
45
46	m = ghmm.HMMFromMatrices(sigma,ghmm.DiscreteDistribution(sigma),A ,B, pi)
47	print "Initial HMM", m
48
49	obs_seq = m.sampleSingle(20)
50	print "Observation sequence : ", obs_seq
51	obs = map(sigma.external, obs_seq)
52	print "Observations : ", ''.join(obs)
53
54	# Start codons
55	# atg, (small chance) gtg
56	# Stop codons
57	# taa, tga
58	# tag
59	# lend = Left End
60	# rend = Right End
61
62	# Training and testing
63	# A -- T, G -- C
64	start_codons = ['atg', 'gtg']
65	stop_codons = ['taa', 'tga', 'tag']
66	edl_file = 'data/edl_genes.csv'
67
68	# Counter limit of how many 'hard' errors are allowed before bailing out
69	stop_limit = 5
70
71	# Current shifting offset
72	base_shift = 0
73	stop_counter = 0
74
75	reader = csv.reader(open(edl_file,"rU"))
76	reader.next()
77	try:
78	for row in reader:
79	finished = False
80	while not finished:
81	(featureType, zNumber, contig, lend, rend, orientation,
82	segmentType, oIslandNumber, geneName, note, function, product,
83	translationNotes) = row
84	lend = int(lend)
85	rend = int(rend)
86	# Make a forward orientation mark as positive
87	if orientation == '>':
88	order = True
89	else:
90	order = False
91
92	# Living the world upside down to order is off
93	if order:
94	start_pos = base_shift + lend - 1
95	stop_pos = base_shift + rend - 3
96	else:
97	start_pos = base_shift + rend -3
98	stop_pos = base_shift + lend - 1
99
100	start_codon = get_codon(seq,start_pos,order)
101	stop_codon = get_codon(seq,stop_pos,order)
102	start_match = start_codon in start_codons
103	stop_match = stop_codon in stop_codons
104
105	print "%6s, %s, %s, %s, %s, %s," % (geneName,lend,rend, start_codon, stop_codon, orientation),
106	print "%s, %s" % (start_match, stop_match)
107
108	# Check for fucked up offsets, shifts
109	start_shift = None
110	stop_shift = None
111	# Technically speaking should the cope only the reading frames,
112	# but what the heck looking if 'free'
113	search_range = 10
114	if not start_match:
115	# Somewhere else in the reading frame?
116	for r in range(-search_range,search_range+1):
117	l = start_pos + r
118	t = get_codon(seq,l,order)
119	new_match = t in start_codons
120	if new_match:
121	start_shift = r
122	print "# Start codon reading frame match - shift %i: " % r,
123	print "%i, %s: %s" % (l,t,new_match)
124	if not stop_match:
125	# Somewhere else in the reading frame?
126	for r in range(-search_range,search_range+1):
127	l = stop_pos + r
128	t = get_codon(seq,l,order)
129	new_match = t in stop_codons
130	if new_match:
131	stop_shift = r
132	print "# Stop codon reading frame match - shift %i: " % r,
133	print "%i, %s: %s" % (l,t,new_match)
134
135
136	# Both wrong is something screwy in the data, check offset fix
137	if not start_match and not stop_match:
138	if stop_shift != None and start_shift == stop_shift:
139	print "# Matching shift %i" % start_shift
140	base_shift += start_shift
141	finished = False
142	continue
143	# Exercise left to the reader
144	if (stop_counter > stop_limit):
145	return
146	else:
147	stop_counter += 1
148	finished = True
149	else:
150	stop_counter = 0
151	finished = True
152
153	except csv.Error, e:
154	sys.exit('file %s, line %d: %s' % (filename, reader.line_num, e))
155
156
157
158	test_seq=ghmm.EmissionSequence(sigma,list(seq[0:(len(seq) / 3)]))
159	v = m.viterbi(test_seq)
160	#print "fairness of test_seq: ", v
161
162
163	# Validation
164	val_seq=ghmm.EmissionSequence(sigma,list(seq[10:2000]))
165	v = m.viterbi(val_seq)
166	#print "Test sequence: ", v
167
168	# XXX: Results
169
170
171
172	if __name__ == "__main__":
173	# Load data
174	try:
175	handle = open(sys.argv[1],"rU")
176	except IndexError:
177	print "Usage %s <data_file>" % (sys.argv[0])
178	sys.exit(64)
179	except IOError:
180	print "Unable to open '%s'" % (sys.argv[1])
181	sys.exit(64)
182
183	seq = handle.read()
184	handle.close()
185	ecoli_hmm(seq)

Note: See TracBrowser for help on using the repository browser.

Download in other formats: