source: liacs/dbdm/dbdm_4/ecoli_hmm.py@ 62

Last change on this file since 62 was 62, checked in by Rick van der Zwet, 15 years ago

First results on shifting tests

  • Property svn:executable set to *
File size: 6.1 KB
Line 
1#!/usr/bin/env python
2
3# http://en.wikipedia.org/wiki/Stop_codon
4# http://en.wikipedia.org/wiki/Escherichia_coli
5# http://en.wikipedia.org/wiki/Open_reading_frame
6# http://nl.wikipedia.org/wiki/Genetische_code
7
8import ghmm
9import sys
10import csv
11import string
12
13dna_flip = string.maketrans('atgc','tacg')
14def get_codon(seq, position, order):
15 codon = seq[position:position+3]
16 if not order:
17 # When living on the other side of the string make sure to flip
18 # around before comparison
19 codon = codon[::-1].translate(dna_flip)
20 return(codon)
21
22
23
24def ecoli_hmm(seq):
25 """Try to find genes inside e sequence using a HMM"""
26 # Model 4 bases A C G T and unknown state N
27 sigma = ghmm.Alphabet(['a', 'c', 'g', 't', 'n' ])
28 print sigma
29
30 # XXX: Proper values, based of statistics
31 # The transition matrix A is chosen such that it reflects the statistics
32 # Part one will try to get us into a gene, part two will tell us when to
33 # get out of it
34 A = [[0.6,0.4], [0.3, 0.7]]
35
36 # The emission probabilities matrix is modeled after the statistics with a
37 # total of 1
38 etogene = [ 0.1, 0.1, 0.1, 0.1, 0.6 ]
39 eingene = [ 0.2, 0.3, 0.3, 0.2, 0.1 ]
40
41 B = [etogene,eingene]
42
43 # Initial distribution favors outside
44 pi = [0.9, 0.1]
45
46 m = ghmm.HMMFromMatrices(sigma,ghmm.DiscreteDistribution(sigma),A ,B, pi)
47 print "Initial HMM", m
48
49 obs_seq = m.sampleSingle(20)
50 print "Observation sequence : ", obs_seq
51 obs = map(sigma.external, obs_seq)
52 print "Observations : ", ''.join(obs)
53
54 # Start codons
55 # atg, (small chance) gtg
56 # Stop codons
57 # taa, tga
58 # tag
59 # lend = Left End
60 # rend = Right End
61
62 # Training and testing
63 # A -- T, G -- C
64 start_codons = ['atg', 'gtg']
65 stop_codons = ['taa', 'tga', 'tag']
66 edl_file = 'data/edl_genes.csv'
67
68 # Counter limit of how many 'hard' errors are allowed before bailing out
69 stop_limit = 5
70
71 # Current shifting offset
72 base_shift = 0
73 stop_counter = 0
74
75 reader = csv.reader(open(edl_file,"rU"))
76 reader.next()
77 try:
78 for row in reader:
79 finished = False
80 while not finished:
81 (featureType, zNumber, contig, lend, rend, orientation,
82 segmentType, oIslandNumber, geneName, note, function, product,
83 translationNotes) = row
84 lend = int(lend)
85 rend = int(rend)
86 # Make a forward orientation mark as positive
87 if orientation == '>':
88 order = True
89 else:
90 order = False
91
92 # Living the world upside down to order is off
93 if order:
94 start_pos = base_shift + lend - 1
95 stop_pos = base_shift + rend - 3
96 else:
97 start_pos = base_shift + rend -3
98 stop_pos = base_shift + lend - 1
99
100 start_codon = get_codon(seq,start_pos,order)
101 stop_codon = get_codon(seq,stop_pos,order)
102 start_match = start_codon in start_codons
103 stop_match = stop_codon in stop_codons
104
105 print "%6s, %s, %s, %s, %s, %s," % (geneName,lend,rend, start_codon, stop_codon, orientation),
106 print "%s, %s" % (start_match, stop_match)
107
108 # Check for fucked up offsets, shifts
109 start_shift = None
110 stop_shift = None
111 # Technically speaking should the cope only the reading frames,
112 # but what the heck looking if 'free'
113 search_range = 10
114 if not start_match:
115 # Somewhere else in the reading frame?
116 for r in range(-search_range,search_range+1):
117 l = start_pos + r
118 t = get_codon(seq,l,order)
119 new_match = t in start_codons
120 if new_match:
121 start_shift = r
122 print "# Start codon reading frame match - shift %i: " % r,
123 print "%i, %s: %s" % (l,t,new_match)
124 if not stop_match:
125 # Somewhere else in the reading frame?
126 for r in range(-search_range,search_range+1):
127 l = stop_pos + r
128 t = get_codon(seq,l,order)
129 new_match = t in stop_codons
130 if new_match:
131 stop_shift = r
132 print "# Stop codon reading frame match - shift %i: " % r,
133 print "%i, %s: %s" % (l,t,new_match)
134
135
136 # Both wrong is something screwy in the data, check offset fix
137 if not start_match and not stop_match:
138 if stop_shift != None and start_shift == stop_shift:
139 print "# Matching shift %i" % start_shift
140 base_shift += start_shift
141 finished = False
142 continue
143 # Exercise left to the reader
144 if (stop_counter > stop_limit):
145 return
146 else:
147 stop_counter += 1
148 finished = True
149 else:
150 stop_counter = 0
151 finished = True
152
153 except csv.Error, e:
154 sys.exit('file %s, line %d: %s' % (filename, reader.line_num, e))
155
156
157
158 test_seq=ghmm.EmissionSequence(sigma,list(seq[0:(len(seq) / 3)]))
159 v = m.viterbi(test_seq)
160 #print "fairness of test_seq: ", v
161
162
163 # Validation
164 val_seq=ghmm.EmissionSequence(sigma,list(seq[10:2000]))
165 v = m.viterbi(val_seq)
166 #print "Test sequence: ", v
167
168 # XXX: Results
169
170
171
172if __name__ == "__main__":
173 # Load data
174 try:
175 handle = open(sys.argv[1],"rU")
176 except IndexError:
177 print "Usage %s <data_file>" % (sys.argv[0])
178 sys.exit(64)
179 except IOError:
180 print "Unable to open '%s'" % (sys.argv[1])
181 sys.exit(64)
182
183 seq = handle.read()
184 handle.close()
185 ecoli_hmm(seq)
Note: See TracBrowser for help on using the repository browser.