#!/usr/bin/env python # http://en.wikipedia.org/wiki/Stop_codon # http://en.wikipedia.org/wiki/Escherichia_coli # http://en.wikipedia.org/wiki/Open_reading_frame # http://nl.wikipedia.org/wiki/Genetische_code import sys import csv import string # http://ghmm.sourceforge.net/ import ghmm from MultiReplace import MultiReplace # The mapping is kind of odd, as 'r' could mean either 'g' or 'a', without any clear distintion fasta_translate = { 'r' : 'ga', # purine 'y' : 'tc', # pyrimide 'k' : 'gt', # keto 'm' : 'ac', # amino 's' : 'gc', # strong 'w' : 'at', # weak 'b' : 'gtc', 'd' : 'gat', 'h' : 'act', 'v' : 'gca', } # Transform the state to something human readable dna_ascii_translate = { '0' : '.', '1' : '<', '2' : '<', '3' : '<', '4' : '|', '5' : '>', '6' : '>', '7' : '>', } dna_ascii = MultiReplace(dna_ascii_translate) def pretty_print(test_seq, ans_seq, v, length=100, parts=10, seperator=''): """ Pretty printing of output for verification purposes """ for i in range(0,len(v[0]),length): seq = [] ans = [] result = [] for j in range(0,length,parts): t = i + j seq.append(test_seq[t:t+parts]) ans.append(ans_seq[t:t+parts]) result.append(dna_ascii.replace(''.join(map(str,v[0][t:t+parts])))) print "ORG: " + seperator.join(seq) print "CSV: " + seperator.join(ans) print "HMM: " + seperator.join(result) print '' print "fairness of test_seq: ", v[1] def ecoli_hmm(): """Try to find genes inside e sequence using a HMM""" # Model 4 bases A C G T and unknown state N sigma = ghmm.Alphabet(['a', 'c', 'g', 't', 'n' ]) print sigma # XXX: Proper values, based of statistics # The transition matrix A is chosen such that it reflects the statistics # Probalities from moving from one state to an other # 0) Outer-gene : will try to get us into a gene # 1) Start-codon : beginning of gene - part 1 # 2) Start-codon : beginning of gene - part 2 # 3) Start-codon : beginning of gene - part 3 # 4) Inside-gene : in the gene # 5) Stop-codon : end of gene - part 1 # 6) Stop-codon : end of gene - part 2 # 7) Stop-codon : end of gene - part 3 A = [ [0.8, 0.2, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], # 0 [0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0], # 1 [0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0], # 2 [0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0], # 3 [0.0, 0.0, 0.0, 0.0, 0.9, 0.1, 0.0, 0.0], # 4 [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0], # 5 [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0], # 6 [1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], # 7 ] # XXX: Proper values, based of statistics # The emission probabilities matrix is modeled after the statistics # (['a', 'c', 'g', 't', 'n' ] B = [ # e.g. state 0 -> emission probability [0.2, 0.2, 0.2, 0.2, 0.2] , # 0 [0.9, 0.0, 0.1, 0.0, 0.0] , # 1 [0.0, 0.0, 0.0, 1.0, 0.0] , # 2 [0.0, 0.0, 1.0, 0.0, 0.0] , # 3 [0.2, 0.2, 0.2, 0.2, 0.2] , # 4 [0.0, 0.0, 0.0, 1.0, 0.0] , # 5 [0.7, 0.0, 0.3, 0.0, 0.0] , # 6 [0.7, 0.0, 0.3, 0.0, 0.0] , # 7 ] # Initial distribution favors outside pi = [0.9] + [0.1/7] * 7 m = ghmm.HMMFromMatrices(sigma,ghmm.DiscreteDistribution(sigma),A ,B, pi) print "Initial HMM" print m.verboseStr() obs_seq = m.sampleSingle(20) print "Observation sequence : ", obs_seq obs = map(sigma.external, obs_seq) print "Observations : ", ''.join(obs) answer = {} handle = open('AE005174v2-2-gene.raw', 'rU') answer['AE005174v2-2'] = handle.read() handle.close() contig_seq = {} handle = open('AE005174v2-2.raw', 'rU') contig_seq['AE005174v2-2'] = handle.read() handle.close() handle = open('AE005174v2-1.raw', 'rU') contig_seq['AE005174v2-1'] = handle.read() handle.close() test_size = 1000; test_seq = contig_seq['AE005174v2-2'][0:test_size] ans_seq = answer['AE005174v2-2'][0:test_size] test_eseq=ghmm.EmissionSequence(sigma,list(test_seq)) v = m.viterbi(test_eseq) pretty_print(test_seq, ans_seq, v) # Train sequence print "Training baumWelch" train_seq = ghmm.EmissionSequence(sigma,list(contig_seq['AE005174v2-1'][0:10000])) v = m.baumWelch(train_seq) print m.verboseStr() print "Results after training sequence..." v = m.viterbi(test_eseq) pretty_print(test_seq, ans_seq, v) # XXX: Results if __name__ == "__main__": ecoli_hmm()