#!/usr/bin/env python

# http://en.wikipedia.org/wiki/Stop_codon
# http://en.wikipedia.org/wiki/Escherichia_coli
# http://en.wikipedia.org/wiki/Open_reading_frame
# http://nl.wikipedia.org/wiki/Genetische_code

import sys
import csv
import string

# http://ghmm.sourceforge.net/
import ghmm

from MultiReplace  import MultiReplace

# The mapping is kind of odd, as 'r' could mean either 'g' or 'a', without any clear distintion
fasta_translate = { 
    'r' : 'ga', # purine
    'y' : 'tc', # pyrimide
    'k' : 'gt', # keto
    'm' : 'ac', # amino 
    's' : 'gc', # strong
    'w' : 'at', # weak
    'b' : 'gtc',
    'd' : 'gat',
    'h' : 'act',
    'v' : 'gca',
    }

dna_ascii_translate = {
    '0' : '*',
    '1' : '<',
    '2' : '<',
    '3' : '<',
    '4' : '-',
    '5' : '>',
    '6' : '>',
    '7' : '>',
    }

dna_ascii = MultiReplace(dna_ascii_translate)

def pretty_print(test_seq, ans_seq, v, length=70, parts=10, seperator=''):
    """ Pretty printing of output for verification purposes """
    for i in range(0,len(v[0]),length):
        seq = []
        ans = []
        result = []
        for j in range(0,length,parts):
            t = i + j
            seq.append(test_seq[t:t+parts])
            ans.append(ans_seq[t:t+parts])
            result.append(dna_ascii.replace(''.join(map(str,v[0][t:t+parts]))))

        print seperator.join(seq)
        print seperator.join(ans)
        print seperator.join(result)
        print ''
    print "fairness of test_seq: ", v[1]


def ecoli_hmm():
    """Try to find genes inside e sequence using a HMM"""
    # Model 4 bases A C G T and unknown state N
    sigma = ghmm.Alphabet(['a', 'c', 'g', 't', 'n' ])
    print sigma
   
    # XXX: Proper values, based of statistics
    # The transition matrix A is chosen such that it reflects the statistics
    # Probalities from moving from one state to an other
    # 0) Outer-gene  : will try to get us into a gene
    # 1) Start-codon : beginning of gene - part 1
    # 2) Start-codon : beginning of gene - part 2
    # 3) Start-codon : beginning of gene - part 3
    # 4) Inside-gene : in the gene
    # 5) Stop-codon  : end of gene - part 1
    # 6) Stop-codon  : end of gene - part 2
    # 7) Stop-codon  : end of gene - part 3
    A = [
            [0.8, 0.2, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], 
            [0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0], 
            [0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0], 
            [0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0], 
            [0.0, 0.0, 0.0, 0.0, 0.7, 0.3, 0.0, 0.0], 
            [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0], 
            [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0], 
            [1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], 
        ]

    # XXX: Proper values, based of statistics
    # The emission probabilities matrix is modeled after the statistics
    # (['a', 'c', 'g', 't', 'n' ]
    B = [
            # e.g. state 0 -> emission probability
            [0.2, 0.2, 0.2, 0.2, 0.2] , 
            [0.9, 0.0, 0.1, 0.0, 0.0] , 
            [0.0, 0.0, 0.0, 1.0, 0.0] , 
            [0.0, 0.0, 1.0, 0.0, 0.0] , 
            [0.2, 0.2, 0.2, 0.2, 0.2] , 
            [0.0, 0.0, 0.0, 1.0, 0.0] , 
            [0.7, 0.0, 0.3, 0.0, 0.0] , 
            [0.7, 0.0, 0.3, 0.0, 0.0] , 
        ]
    
    # Initial distribution favors outside
    pi  = [0.9] + [0.1/7] * 7

    m = ghmm.HMMFromMatrices(sigma,ghmm.DiscreteDistribution(sigma),A ,B, pi)
    print "Initial HMM"
    print m

    obs_seq = m.sampleSingle(20)
    print "Observation sequence : ", obs_seq
    obs = map(sigma.external, obs_seq)
    print "Observations         : ", ''.join(obs)

    answer = {}
    handle = open('AE005174v2-2-gene.raw', 'rU')
    answer['AE005174v2-2'] = handle.read()
    handle.close()

    contig_seq = {}
    handle = open('AE005174v2-2.raw', 'rU')
    contig_seq['AE005174v2-2'] = handle.read()
    handle.close()

    handle = open('AE005174v2-1.raw', 'rU')
    contig_seq['AE005174v2-1'] = handle.read()
    handle.close()

    test_seq = contig_seq['AE005174v2-2'][0:490]
    ans_seq = answer['AE005174v2-2'][0:490]
    test_eseq=ghmm.EmissionSequence(sigma,list(test_seq))
    v = m.viterbi(test_eseq)
    pretty_print(test_seq, ans_seq, v)


    # Train sequence
    print "Training baumWelch"
    train_seq = ghmm.EmissionSequence(sigma,list(contig_seq['AE005174v2-1']))
    v = m.baumWelch(train_seq)
    print m
    

    print "Results after training sequence..."
    v = m.viterbi(test_eseq)
    pretty_print(test_seq, ans_seq, v)
    

    # XXX: Results

if __name__ == "__main__":
    ecoli_hmm()