#!/usr/bin/env python # http://en.wikipedia.org/wiki/Stop_codon # http://en.wikipedia.org/wiki/Escherichia_coli # http://en.wikipedia.org/wiki/Open_reading_frame import ghmm import sys def ecoli_hmm(seq): """Try to find genes inside e sequence using a HMM""" # Model 4 bases A C G T and unknown state N sigma = ghmm.Alphabet(['a', 'c', 'g', 't', 'n' ]) print sigma # XXX: Proper values, based of statistics # The transition matrix A is chosen such that it reflects the statistics # Part one will try to get us into a gene, part two will tell us when to # get out of it A = [[0.6,0.4], [0.3, 0.7]] # The emission probabilities matrix is modeled after the statistics with a # total of 1 etogene = [ 0.2, 0.3, 0.3, 0.1, 0.2 ] eingene = [ 0.2, 0.3, 0.3, 0.2, 0.1 ] B = [etogene,eingene] # Initial distribution favors outside pi = [0.9, 0.1] m = ghmm.HMMFromMatrices(sigma,ghmm.DiscreteDistribution(sigma),A ,B, pi) print "Initial HMM", m obs_seq = m.sampleSingle(20) print "Observation sequence : ", obs_seq obs = map(sigma.external, obs_seq) print "Observations : ", ''.join(obs) # XXX: Training # XXX: Validation # XXX: Results if __name__ == "__main__": # Load data try: handle = open(sys.argv[1],"rU") except IndexError: print "Usage %s " % (sys.argv[0]) sys.exit(64) except IOError: print "Unable to open '%s'" % (sys.argv[1]) sys.exit(64) seq = handle.read() handle.close() ecoli_hmm(seq)