source: liacs/dbdm/dbdm_4/ecoli_hmm.py@ 61

Last change on this file since 61 was 61, checked in by Rick van der Zwet, 15 years ago

Start hacking to try to understand the so 'simple' csv file :-)

  • Property svn:executable set to *
File size: 3.4 KB
Line 
1#!/usr/bin/env python
2
3# http://en.wikipedia.org/wiki/Stop_codon
4# http://en.wikipedia.org/wiki/Escherichia_coli
5# http://en.wikipedia.org/wiki/Open_reading_frame
6# http://nl.wikipedia.org/wiki/Genetische_code
7
8import ghmm
9import sys
10import csv
11import string
12
13
14def ecoli_hmm(seq):
15 """Try to find genes inside e sequence using a HMM"""
16 # Model 4 bases A C G T and unknown state N
17 sigma = ghmm.Alphabet(['a', 'c', 'g', 't', 'n' ])
18 print sigma
19
20 # XXX: Proper values, based of statistics
21 # The transition matrix A is chosen such that it reflects the statistics
22 # Part one will try to get us into a gene, part two will tell us when to
23 # get out of it
24 A = [[0.6,0.4], [0.3, 0.7]]
25
26 # The emission probabilities matrix is modeled after the statistics with a
27 # total of 1
28 etogene = [ 0.1, 0.1, 0.1, 0.1, 0.6 ]
29 eingene = [ 0.2, 0.3, 0.3, 0.2, 0.1 ]
30
31 B = [etogene,eingene]
32
33 # Initial distribution favors outside
34 pi = [0.9, 0.1]
35
36 m = ghmm.HMMFromMatrices(sigma,ghmm.DiscreteDistribution(sigma),A ,B, pi)
37 print "Initial HMM", m
38
39 obs_seq = m.sampleSingle(20)
40 print "Observation sequence : ", obs_seq
41 obs = map(sigma.external, obs_seq)
42 print "Observations : ", ''.join(obs)
43
44 # Start codons
45 # atg, (small chance) gtg
46 # Stop codons
47 # taa, tga
48 # tag
49 # lend = Left End
50 # rend = Right End
51
52 # Training
53 # A -- T, G -- C
54 start_condons = ['atg', 'gtg']
55 stop_condons = ['taa', 'tga', 'tag']
56 dna_flip = string.maketrans('atgc','tacg')
57 edl_file = 'data/edl_genes.csv'
58 reader = csv.reader(open(edl_file,"rU"))
59 reader.next()
60 try:
61 for row in reader:
62 (featureType, zNumber, contig, lend, rend, orientation,
63 segmentType, oIslandNumber, geneName, note, function, product,
64 translationNotes) = row
65 lend = int(lend) - 1
66 rend = int(rend)
67 # Make a forward orientation as positive
68 if orientation == '>':
69 order = True
70 else:
71 order = False
72
73 if order:
74 start_codon = seq[lend:lend+3]
75 stop_codon = seq[rend-3:rend]
76 else:
77 start_codon = seq[rend-3:rend].translate(dna_flip)[::-1]
78 stop_codon = seq[lend:lend+3].translate(dna_flip)[::-1]
79
80 print "%6s, %s, %s, %s, %s, %s," % (geneName,lend,rend, start_codon, stop_codon, orientation),
81
82 print "%s, %s" % (start_codon in start_condons, stop_codon in stop_condons)
83 if not start_codon in start_condons or not stop_codon in stop_condons:
84 return
85 except csv.Error, e:
86 sys.exit('file %s, line %d: %s' % (filename, reader.line_num, e))
87
88
89
90 test_seq=ghmm.EmissionSequence(sigma,list(seq[0:(len(seq) / 3)]))
91 v = m.viterbi(test_seq)
92 #print "fairness of test_seq: ", v
93
94
95 # Validation
96 val_seq=ghmm.EmissionSequence(sigma,list(seq[10:2000]))
97 v = m.viterbi(val_seq)
98 #print "Test sequence: ", v
99
100 # XXX: Results
101
102
103
104if __name__ == "__main__":
105 # Load data
106 try:
107 handle = open(sys.argv[1],"rU")
108 except IndexError:
109 print "Usage %s <data_file>" % (sys.argv[0])
110 sys.exit(64)
111 except IOError:
112 print "Unable to open '%s'" % (sys.argv[1])
113 sys.exit(64)
114
115 seq = handle.read()
116 handle.close()
117 ecoli_hmm(seq)
Note: See TracBrowser for help on using the repository browser.