#!/usr/bin/env python # http://en.wikipedia.org/wiki/Stop_codon # http://en.wikipedia.org/wiki/Escherichia_coli # http://en.wikipedia.org/wiki/Open_reading_frame # http://nl.wikipedia.org/wiki/Genetische_code import sys import csv import string # http://ghmm.sourceforge.net/ import ghmm from MultiReplace import MultiReplace # The mapping is kind of odd, as 'r' could mean either 'g' or 'a', without any clear distintion fasta_translate = { 'r' : 'ga', # purine 'y' : 'tc', # pyrimide 'k' : 'gt', # keto 'm' : 'ac', # amino 's' : 'gc', # strong 'w' : 'at', # weak 'b' : 'gtc', 'd' : 'gat', 'h' : 'act', 'v' : 'gca', } dna_ascii_translate = { '0' : '*', '1' : '<', '2' : '<', '3' : '<', '4' : '-', '5' : '>', '6' : '>', '7' : '>', } dna_ascii = MultiReplace(dna_ascii_translate) def pretty_print(test_seq, ans_seq, v, length=70, parts=10, seperator=''): """ Pretty printing of output for verification purposes """ for i in range(0,len(v[0]),length): seq = [] ans = [] result = [] for j in range(0,length,parts): t = i + j seq.append(test_seq[t:t+parts]) ans.append(ans_seq[t:t+parts]) result.append(dna_ascii.replace(''.join(map(str,v[0][t:t+parts])))) print seperator.join(seq) print seperator.join(ans) print seperator.join(result) print '' print "fairness of test_seq: ", v[1] def ecoli_hmm(): """Try to find genes inside e sequence using a HMM""" # Model 4 bases A C G T and unknown state N sigma = ghmm.Alphabet(['a', 'c', 'g', 't', 'n' ]) print sigma # XXX: Proper values, based of statistics # The transition matrix A is chosen such that it reflects the statistics # Probalities from moving from one state to an other # 0) Outer-gene : will try to get us into a gene # 1) Start-codon : beginning of gene - part 1 # 2) Start-codon : beginning of gene - part 2 # 3) Start-codon : beginning of gene - part 3 # 4) Inside-gene : in the gene # 5) Stop-codon : end of gene - part 1 # 6) Stop-codon : end of gene - part 2 # 7) Stop-codon : end of gene - part 3 A = [ [0.8, 0.2, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.7, 0.3, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0], [1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], ] # XXX: Proper values, based of statistics # The emission probabilities matrix is modeled after the statistics # (['a', 'c', 'g', 't', 'n' ] B = [ # e.g. state 0 -> emission probability [0.2, 0.2, 0.2, 0.2, 0.2] , [0.9, 0.0, 0.1, 0.0, 0.0] , [0.0, 0.0, 0.0, 1.0, 0.0] , [0.0, 0.0, 1.0, 0.0, 0.0] , [0.2, 0.2, 0.2, 0.2, 0.2] , [0.0, 0.0, 0.0, 1.0, 0.0] , [0.7, 0.0, 0.3, 0.0, 0.0] , [0.7, 0.0, 0.3, 0.0, 0.0] , ] # Initial distribution favors outside pi = [0.9] + [0.1/7] * 7 m = ghmm.HMMFromMatrices(sigma,ghmm.DiscreteDistribution(sigma),A ,B, pi) print "Initial HMM" print m obs_seq = m.sampleSingle(20) print "Observation sequence : ", obs_seq obs = map(sigma.external, obs_seq) print "Observations : ", ''.join(obs) answer = {} handle = open('AE005174v2-2-gene.raw', 'rU') answer['AE005174v2-2'] = handle.read() handle.close() contig_seq = {} handle = open('AE005174v2-2.raw', 'rU') contig_seq['AE005174v2-2'] = handle.read() handle.close() handle = open('AE005174v2-1.raw', 'rU') contig_seq['AE005174v2-1'] = handle.read() handle.close() test_seq = contig_seq['AE005174v2-2'][0:490] ans_seq = answer['AE005174v2-2'][0:490] test_eseq=ghmm.EmissionSequence(sigma,list(test_seq)) v = m.viterbi(test_eseq) pretty_print(test_seq, ans_seq, v) # Train sequence print "Training baumWelch" train_seq = ghmm.EmissionSequence(sigma,list(contig_seq['AE005174v2-1'])) v = m.baumWelch(train_seq) print m print "Results after training sequence..." v = m.viterbi(test_eseq) pretty_print(test_seq, ans_seq, v) # XXX: Results if __name__ == "__main__": ecoli_hmm()