source: liacs/dbdm/dbdm_4/ecoli_hmm.py@ 63

Last change on this file since 63 was 63, checked in by Rick van der Zwet, 15 years ago

Finally got all the matching right

  • Property svn:executable set to *
File size: 7.3 KB
Line 
1#!/usr/bin/env python
2
3# http://en.wikipedia.org/wiki/Stop_codon
4# http://en.wikipedia.org/wiki/Escherichia_coli
5# http://en.wikipedia.org/wiki/Open_reading_frame
6# http://nl.wikipedia.org/wiki/Genetische_code
7
8import sys
9import csv
10import string
11
12# http://ghmm.sourceforge.net/
13import ghmm
14
15# The mapping is kind of odd, as 'r' could mean either 'g' or 'a', without any clear distintion
16fasta_translate = {
17 'r' : 'ga', # purine
18 'y' : 'tc', # pyrimide
19 'k' : 'gt', # keto
20 'm' : 'ac', # amino
21 's' : 'gc', # strong
22 'w' : 'at', # weak
23 'b' : 'gtc',
24 'd' : 'gat',
25 'h' : 'act',
26 'v' : 'gca',
27 }
28
29DEBUG = True
30
31dna_flip = string.maketrans('atgc','tacg')
32def get_codon(seq, position, order):
33 codon = seq[position:position+3]
34 if not order:
35 # When living on the other side of the string make sure to flip
36 # around before comparison
37 codon = codon[::-1].translate(dna_flip)
38 return(codon)
39
40
41
42def ecoli_hmm():
43 """Try to find genes inside e sequence using a HMM"""
44 # Model 4 bases A C G T and unknown state N
45 sigma = ghmm.Alphabet(['a', 'c', 'g', 't', 'n' ])
46 print sigma
47
48 # XXX: Proper values, based of statistics
49 # The transition matrix A is chosen such that it reflects the statistics
50 # Part one will try to get us into a gene, part two will tell us when to
51 # get out of it
52 A = [[0.6,0.4], [0.3, 0.7]]
53
54 # The emission probabilities matrix is modeled after the statistics with a
55 # total of 1
56 etogene = [ 0.1, 0.1, 0.1, 0.1, 0.6 ]
57 eingene = [ 0.2, 0.3, 0.3, 0.2, 0.1 ]
58
59 B = [etogene,eingene]
60
61 # Initial distribution favors outside
62 pi = [0.9, 0.1]
63
64 m = ghmm.HMMFromMatrices(sigma,ghmm.DiscreteDistribution(sigma),A ,B, pi)
65 print "Initial HMM", m
66
67 obs_seq = m.sampleSingle(20)
68 print "Observation sequence : ", obs_seq
69 obs = map(sigma.external, obs_seq)
70 print "Observations : ", ''.join(obs)
71
72 # Start codons
73 # atg, (small chance) gtg
74 # Stop codons
75 # taa, tga
76 # tag
77 # lend = Left End
78 # rend = Right End
79
80 # Training and testing
81 # A -- T, G -- C
82 start_codons = ['atg', 'gtg']
83 stop_codons = ['taa', 'tga', 'tag']
84 edl_file = 'data/edl_genes.csv'
85
86 # Counter limit of how many 'hard' errors are allowed before bailing out
87 stop_limit = 100
88
89 # Current shifting offset
90 base_shift = 0
91 stop_counter = 0
92
93 # Sequences of contig, XXX: Make it a FASTA / BioPython Parser
94 contig_seq = {}
95
96 reader = csv.reader(open(edl_file,"rU"))
97 reader.next()
98 try:
99 for row in reader:
100 finished = False
101 while not finished:
102 (featureType, zNumber, contig, lend, rend, orientation,
103 segmentType, oIslandNumber, geneName, note, function, product,
104 translationNotes) = row
105 lend = int(lend)
106 rend = int(rend)
107 if not contig_seq.has_key(contig):
108 # Load data
109 try:
110 handle = open(contig + ".raw","rU")
111 contig_seq[contig] = handle.read()
112 handle.close()
113 except IOError:
114 print "Unable to open '%s'" % (sys.argv[1])
115 sys.exit(64)
116 seq = contig_seq[contig]
117
118 # Make a forward orientation mark as positive
119 if orientation == '>':
120 order = True
121 else:
122 order = False
123
124 # Living the world upside down to order is off
125 if order:
126 start_pos = base_shift + lend - 1
127 stop_pos = base_shift + rend - 3
128 else:
129 start_pos = base_shift + rend -3
130 stop_pos = base_shift + lend - 1
131
132 start_codon = get_codon(seq,start_pos,order)
133 stop_codon = get_codon(seq,stop_pos,order)
134 start_match = start_codon in start_codons
135 stop_match = stop_codon in stop_codons
136
137 print "%6s, %s, %s, %s, %s, %s, %s, %s, %s, %s," % (geneName,lend,rend,start_pos, stop_pos, contig, start_codon, stop_codon, orientation, base_shift),
138 print "%s, %s" % (start_match, stop_match)
139 if not start_match or not stop_match:
140 print "### Function (comment):", function
141
142 # Check for fucked up offsets, shifts
143 start_shift = []
144 stop_shift = []
145 # Technically speaking should the cope only the reading frames,
146 # but what the heck looking if 'free', but make sure to include the original frames as well
147 search_range = abs(base_shift) + 3
148 if not start_match:
149 # Somewhere else in the reading frame?
150 matches = []
151 for r in range(-search_range,search_range+1):
152 l = start_pos + r
153 t = get_codon(seq,l,order)
154 new_match = t in start_codons
155 if new_match:
156 matches.append("%i:%s" % (r, t))
157 start_shift.append(r)
158 if start_shift:
159 print "# Start codon reading frame matches: ", ",".join(matches)
160 if not stop_match:
161 # Somewhere else in the reading frame?
162 matches = []
163 for r in range(-search_range,search_range+1):
164 l = stop_pos + r
165 t = get_codon(seq,l,order)
166 new_match = t in stop_codons
167 if new_match:
168 stop_shift.append(r)
169 matches.append("%i:%s" % (r, t))
170 if stop_shift:
171 print "# Stop codon reading frame matches: ", ",".join(matches)
172
173
174 # Both wrong is something screwy in the data, check offset fix
175 if not start_match and not stop_match:
176 common_shift = list(set(start_shift) & set(stop_shift))
177 if common_shift:
178 print "# Matching shifts: " + ",".join(map(str,common_shift))
179 # Get the value closest to 0 for shifting purposes
180 # Currently asume no negative shifts
181 base_shift += min(common_shift)
182 finished = False
183 continue
184 # Exercise left to the reader
185 if (stop_counter > stop_limit):
186 return
187 else:
188 stop_counter += 1
189 finished = True
190 else:
191 stop_counter = 0
192 finished = True
193
194 except csv.Error, e:
195 sys.exit('file %s, line %d: %s' % (filename, reader.line_num, e))
196
197
198
199 test_seq=ghmm.EmissionSequence(sigma,list(seq[0:(len(seq) / 3)]))
200 v = m.viterbi(test_seq)
201 #print "fairness of test_seq: ", v
202
203
204 # Validation
205 val_seq=ghmm.EmissionSequence(sigma,list(seq[10:2000]))
206 v = m.viterbi(val_seq)
207 #print "Test sequence: ", v
208
209 # XXX: Results
210
211
212
213if __name__ == "__main__":
214 ecoli_hmm()
Note: See TracBrowser for help on using the repository browser.