Context Navigation

source: liacs/dbdm/dbdm_4/ecoli_hmm.py@ 63

Last change on this file since 63 was 63, checked in by Rick van der Zwet, 15 years ago
Finally got all the matching right
Property svn:executable set to ``*
File size: 7.3 KB

Line
1	#!/usr/bin/env python
2
3	# http://en.wikipedia.org/wiki/Stop_codon
4	# http://en.wikipedia.org/wiki/Escherichia_coli
5	# http://en.wikipedia.org/wiki/Open_reading_frame
6	# http://nl.wikipedia.org/wiki/Genetische_code
7
8	import sys
9	import csv
10	import string
11
12	# http://ghmm.sourceforge.net/
13	import ghmm
14
15	# The mapping is kind of odd, as 'r' could mean either 'g' or 'a', without any clear distintion
16	fasta_translate = {
17	'r' : 'ga', # purine
18	'y' : 'tc', # pyrimide
19	'k' : 'gt', # keto
20	'm' : 'ac', # amino
21	's' : 'gc', # strong
22	'w' : 'at', # weak
23	'b' : 'gtc',
24	'd' : 'gat',
25	'h' : 'act',
26	'v' : 'gca',
27	}
28
29	DEBUG = True
30
31	dna_flip = string.maketrans('atgc','tacg')
32	def get_codon(seq, position, order):
33	codon = seq[position:position+3]
34	if not order:
35	# When living on the other side of the string make sure to flip
36	# around before comparison
37	codon = codon[::-1].translate(dna_flip)
38	return(codon)
39
40
41
42	def ecoli_hmm():
43	"""Try to find genes inside e sequence using a HMM"""
44	# Model 4 bases A C G T and unknown state N
45	sigma = ghmm.Alphabet(['a', 'c', 'g', 't', 'n' ])
46	print sigma
47
48	# XXX: Proper values, based of statistics
49	# The transition matrix A is chosen such that it reflects the statistics
50	# Part one will try to get us into a gene, part two will tell us when to
51	# get out of it
52	A = [[0.6,0.4], [0.3, 0.7]]
53
54	# The emission probabilities matrix is modeled after the statistics with a
55	# total of 1
56	etogene = [ 0.1, 0.1, 0.1, 0.1, 0.6 ]
57	eingene = [ 0.2, 0.3, 0.3, 0.2, 0.1 ]
58
59	B = [etogene,eingene]
60
61	# Initial distribution favors outside
62	pi = [0.9, 0.1]
63
64	m = ghmm.HMMFromMatrices(sigma,ghmm.DiscreteDistribution(sigma),A ,B, pi)
65	print "Initial HMM", m
66
67	obs_seq = m.sampleSingle(20)
68	print "Observation sequence : ", obs_seq
69	obs = map(sigma.external, obs_seq)
70	print "Observations : ", ''.join(obs)
71
72	# Start codons
73	# atg, (small chance) gtg
74	# Stop codons
75	# taa, tga
76	# tag
77	# lend = Left End
78	# rend = Right End
79
80	# Training and testing
81	# A -- T, G -- C
82	start_codons = ['atg', 'gtg']
83	stop_codons = ['taa', 'tga', 'tag']
84	edl_file = 'data/edl_genes.csv'
85
86	# Counter limit of how many 'hard' errors are allowed before bailing out
87	stop_limit = 100
88
89	# Current shifting offset
90	base_shift = 0
91	stop_counter = 0
92
93	# Sequences of contig, XXX: Make it a FASTA / BioPython Parser
94	contig_seq = {}
95
96	reader = csv.reader(open(edl_file,"rU"))
97	reader.next()
98	try:
99	for row in reader:
100	finished = False
101	while not finished:
102	(featureType, zNumber, contig, lend, rend, orientation,
103	segmentType, oIslandNumber, geneName, note, function, product,
104	translationNotes) = row
105	lend = int(lend)
106	rend = int(rend)
107	if not contig_seq.has_key(contig):
108	# Load data
109	try:
110	handle = open(contig + ".raw","rU")
111	contig_seq[contig] = handle.read()
112	handle.close()
113	except IOError:
114	print "Unable to open '%s'" % (sys.argv[1])
115	sys.exit(64)
116	seq = contig_seq[contig]
117
118	# Make a forward orientation mark as positive
119	if orientation == '>':
120	order = True
121	else:
122	order = False
123
124	# Living the world upside down to order is off
125	if order:
126	start_pos = base_shift + lend - 1
127	stop_pos = base_shift + rend - 3
128	else:
129	start_pos = base_shift + rend -3
130	stop_pos = base_shift + lend - 1
131
132	start_codon = get_codon(seq,start_pos,order)
133	stop_codon = get_codon(seq,stop_pos,order)
134	start_match = start_codon in start_codons
135	stop_match = stop_codon in stop_codons
136
137	print "%6s, %s, %s, %s, %s, %s, %s, %s, %s, %s," % (geneName,lend,rend,start_pos, stop_pos, contig, start_codon, stop_codon, orientation, base_shift),
138	print "%s, %s" % (start_match, stop_match)
139	if not start_match or not stop_match:
140	print "### Function (comment):", function
141
142	# Check for fucked up offsets, shifts
143	start_shift = []
144	stop_shift = []
145	# Technically speaking should the cope only the reading frames,
146	# but what the heck looking if 'free', but make sure to include the original frames as well
147	search_range = abs(base_shift) + 3
148	if not start_match:
149	# Somewhere else in the reading frame?
150	matches = []
151	for r in range(-search_range,search_range+1):
152	l = start_pos + r
153	t = get_codon(seq,l,order)
154	new_match = t in start_codons
155	if new_match:
156	matches.append("%i:%s" % (r, t))
157	start_shift.append(r)
158	if start_shift:
159	print "# Start codon reading frame matches: ", ",".join(matches)
160	if not stop_match:
161	# Somewhere else in the reading frame?
162	matches = []
163	for r in range(-search_range,search_range+1):
164	l = stop_pos + r
165	t = get_codon(seq,l,order)
166	new_match = t in stop_codons
167	if new_match:
168	stop_shift.append(r)
169	matches.append("%i:%s" % (r, t))
170	if stop_shift:
171	print "# Stop codon reading frame matches: ", ",".join(matches)
172
173
174	# Both wrong is something screwy in the data, check offset fix
175	if not start_match and not stop_match:
176	common_shift = list(set(start_shift) & set(stop_shift))
177	if common_shift:
178	print "# Matching shifts: " + ",".join(map(str,common_shift))
179	# Get the value closest to 0 for shifting purposes
180	# Currently asume no negative shifts
181	base_shift += min(common_shift)
182	finished = False
183	continue
184	# Exercise left to the reader
185	if (stop_counter > stop_limit):
186	return
187	else:
188	stop_counter += 1
189	finished = True
190	else:
191	stop_counter = 0
192	finished = True
193
194	except csv.Error, e:
195	sys.exit('file %s, line %d: %s' % (filename, reader.line_num, e))
196
197
198
199	test_seq=ghmm.EmissionSequence(sigma,list(seq[0:(len(seq) / 3)]))
200	v = m.viterbi(test_seq)
201	#print "fairness of test_seq: ", v
202
203
204	# Validation
205	val_seq=ghmm.EmissionSequence(sigma,list(seq[10:2000]))
206	v = m.viterbi(val_seq)
207	#print "Test sequence: ", v
208
209	# XXX: Results
210
211
212
213	if __name__ == "__main__":
214	ecoli_hmm()

Note: See TracBrowser for help on using the repository browser.

Download in other formats: