1 | #!/usr/bin/env python
|
---|
2 |
|
---|
3 | # http://en.wikipedia.org/wiki/Stop_codon
|
---|
4 | # http://en.wikipedia.org/wiki/Escherichia_coli
|
---|
5 | # http://en.wikipedia.org/wiki/Open_reading_frame
|
---|
6 | # http://nl.wikipedia.org/wiki/Genetische_code
|
---|
7 |
|
---|
8 | import ghmm
|
---|
9 | import sys
|
---|
10 | import csv
|
---|
11 | import string
|
---|
12 |
|
---|
13 | dna_flip = string.maketrans('atgc','tacg')
|
---|
14 | def get_codon(seq, position, order):
|
---|
15 | codon = seq[position:position+3]
|
---|
16 | if not order:
|
---|
17 | # When living on the other side of the string make sure to flip
|
---|
18 | # around before comparison
|
---|
19 | codon = codon[::-1].translate(dna_flip)
|
---|
20 | return(codon)
|
---|
21 |
|
---|
22 |
|
---|
23 |
|
---|
24 | def ecoli_hmm(seq):
|
---|
25 | """Try to find genes inside e sequence using a HMM"""
|
---|
26 | # Model 4 bases A C G T and unknown state N
|
---|
27 | sigma = ghmm.Alphabet(['a', 'c', 'g', 't', 'n' ])
|
---|
28 | print sigma
|
---|
29 |
|
---|
30 | # XXX: Proper values, based of statistics
|
---|
31 | # The transition matrix A is chosen such that it reflects the statistics
|
---|
32 | # Part one will try to get us into a gene, part two will tell us when to
|
---|
33 | # get out of it
|
---|
34 | A = [[0.6,0.4], [0.3, 0.7]]
|
---|
35 |
|
---|
36 | # The emission probabilities matrix is modeled after the statistics with a
|
---|
37 | # total of 1
|
---|
38 | etogene = [ 0.1, 0.1, 0.1, 0.1, 0.6 ]
|
---|
39 | eingene = [ 0.2, 0.3, 0.3, 0.2, 0.1 ]
|
---|
40 |
|
---|
41 | B = [etogene,eingene]
|
---|
42 |
|
---|
43 | # Initial distribution favors outside
|
---|
44 | pi = [0.9, 0.1]
|
---|
45 |
|
---|
46 | m = ghmm.HMMFromMatrices(sigma,ghmm.DiscreteDistribution(sigma),A ,B, pi)
|
---|
47 | print "Initial HMM", m
|
---|
48 |
|
---|
49 | obs_seq = m.sampleSingle(20)
|
---|
50 | print "Observation sequence : ", obs_seq
|
---|
51 | obs = map(sigma.external, obs_seq)
|
---|
52 | print "Observations : ", ''.join(obs)
|
---|
53 |
|
---|
54 | # Start codons
|
---|
55 | # atg, (small chance) gtg
|
---|
56 | # Stop codons
|
---|
57 | # taa, tga
|
---|
58 | # tag
|
---|
59 | # lend = Left End
|
---|
60 | # rend = Right End
|
---|
61 |
|
---|
62 | # Training and testing
|
---|
63 | # A -- T, G -- C
|
---|
64 | start_codons = ['atg', 'gtg']
|
---|
65 | stop_codons = ['taa', 'tga', 'tag']
|
---|
66 | edl_file = 'data/edl_genes.csv'
|
---|
67 |
|
---|
68 | # Counter limit of how many 'hard' errors are allowed before bailing out
|
---|
69 | stop_limit = 5
|
---|
70 |
|
---|
71 | # Current shifting offset
|
---|
72 | base_shift = 0
|
---|
73 | stop_counter = 0
|
---|
74 |
|
---|
75 | reader = csv.reader(open(edl_file,"rU"))
|
---|
76 | reader.next()
|
---|
77 | try:
|
---|
78 | for row in reader:
|
---|
79 | finished = False
|
---|
80 | while not finished:
|
---|
81 | (featureType, zNumber, contig, lend, rend, orientation,
|
---|
82 | segmentType, oIslandNumber, geneName, note, function, product,
|
---|
83 | translationNotes) = row
|
---|
84 | lend = int(lend)
|
---|
85 | rend = int(rend)
|
---|
86 | # Make a forward orientation mark as positive
|
---|
87 | if orientation == '>':
|
---|
88 | order = True
|
---|
89 | else:
|
---|
90 | order = False
|
---|
91 |
|
---|
92 | # Living the world upside down to order is off
|
---|
93 | if order:
|
---|
94 | start_pos = base_shift + lend - 1
|
---|
95 | stop_pos = base_shift + rend - 3
|
---|
96 | else:
|
---|
97 | start_pos = base_shift + rend -3
|
---|
98 | stop_pos = base_shift + lend - 1
|
---|
99 |
|
---|
100 | start_codon = get_codon(seq,start_pos,order)
|
---|
101 | stop_codon = get_codon(seq,stop_pos,order)
|
---|
102 | start_match = start_codon in start_codons
|
---|
103 | stop_match = stop_codon in stop_codons
|
---|
104 |
|
---|
105 | print "%6s, %s, %s, %s, %s, %s," % (geneName,lend,rend, start_codon, stop_codon, orientation),
|
---|
106 | print "%s, %s" % (start_match, stop_match)
|
---|
107 |
|
---|
108 | # Check for fucked up offsets, shifts
|
---|
109 | start_shift = None
|
---|
110 | stop_shift = None
|
---|
111 | # Technically speaking should the cope only the reading frames,
|
---|
112 | # but what the heck looking if 'free'
|
---|
113 | search_range = 10
|
---|
114 | if not start_match:
|
---|
115 | # Somewhere else in the reading frame?
|
---|
116 | for r in range(-search_range,search_range+1):
|
---|
117 | l = start_pos + r
|
---|
118 | t = get_codon(seq,l,order)
|
---|
119 | new_match = t in start_codons
|
---|
120 | if new_match:
|
---|
121 | start_shift = r
|
---|
122 | print "# Start codon reading frame match - shift %i: " % r,
|
---|
123 | print "%i, %s: %s" % (l,t,new_match)
|
---|
124 | if not stop_match:
|
---|
125 | # Somewhere else in the reading frame?
|
---|
126 | for r in range(-search_range,search_range+1):
|
---|
127 | l = stop_pos + r
|
---|
128 | t = get_codon(seq,l,order)
|
---|
129 | new_match = t in stop_codons
|
---|
130 | if new_match:
|
---|
131 | stop_shift = r
|
---|
132 | print "# Stop codon reading frame match - shift %i: " % r,
|
---|
133 | print "%i, %s: %s" % (l,t,new_match)
|
---|
134 |
|
---|
135 |
|
---|
136 | # Both wrong is something screwy in the data, check offset fix
|
---|
137 | if not start_match and not stop_match:
|
---|
138 | if stop_shift != None and start_shift == stop_shift:
|
---|
139 | print "# Matching shift %i" % start_shift
|
---|
140 | base_shift += start_shift
|
---|
141 | finished = False
|
---|
142 | continue
|
---|
143 | # Exercise left to the reader
|
---|
144 | if (stop_counter > stop_limit):
|
---|
145 | return
|
---|
146 | else:
|
---|
147 | stop_counter += 1
|
---|
148 | finished = True
|
---|
149 | else:
|
---|
150 | stop_counter = 0
|
---|
151 | finished = True
|
---|
152 |
|
---|
153 | except csv.Error, e:
|
---|
154 | sys.exit('file %s, line %d: %s' % (filename, reader.line_num, e))
|
---|
155 |
|
---|
156 |
|
---|
157 |
|
---|
158 | test_seq=ghmm.EmissionSequence(sigma,list(seq[0:(len(seq) / 3)]))
|
---|
159 | v = m.viterbi(test_seq)
|
---|
160 | #print "fairness of test_seq: ", v
|
---|
161 |
|
---|
162 |
|
---|
163 | # Validation
|
---|
164 | val_seq=ghmm.EmissionSequence(sigma,list(seq[10:2000]))
|
---|
165 | v = m.viterbi(val_seq)
|
---|
166 | #print "Test sequence: ", v
|
---|
167 |
|
---|
168 | # XXX: Results
|
---|
169 |
|
---|
170 |
|
---|
171 |
|
---|
172 | if __name__ == "__main__":
|
---|
173 | # Load data
|
---|
174 | try:
|
---|
175 | handle = open(sys.argv[1],"rU")
|
---|
176 | except IndexError:
|
---|
177 | print "Usage %s <data_file>" % (sys.argv[0])
|
---|
178 | sys.exit(64)
|
---|
179 | except IOError:
|
---|
180 | print "Unable to open '%s'" % (sys.argv[1])
|
---|
181 | sys.exit(64)
|
---|
182 |
|
---|
183 | seq = handle.read()
|
---|
184 | handle.close()
|
---|
185 | ecoli_hmm(seq)
|
---|