#!/usr/bin/env python

# http://en.wikipedia.org/wiki/Stop_codon
# http://en.wikipedia.org/wiki/Escherichia_coli
# http://en.wikipedia.org/wiki/Open_reading_frame
# http://nl.wikipedia.org/wiki/Genetische_code

import sys
import csv
import string

# http://ghmm.sourceforge.net/
import ghmm

# The mapping is kind of odd, as 'r' could mean either 'g' or 'a', without any clear distintion
fasta_translate = { 
    'r' : 'ga', # purine
    'y' : 'tc', # pyrimide
    'k' : 'gt', # keto
    'm' : 'ac', # amino 
    's' : 'gc', # strong
    'w' : 'at', # weak
    'b' : 'gtc',
    'd' : 'gat',
    'h' : 'act',
    'v' : 'gca',
    }

DEBUG = True

dna_flip = string.maketrans('atgc','tacg')
def get_codon(seq, position, order):
    codon = seq[position:position+3]
    if not order:
        # When living on the other side of the string make sure to flip 
        # around before comparison
        codon = codon[::-1].translate(dna_flip)
    return(codon)


def ecoli_hmm():
    """Try to find genes inside e sequence using a HMM"""
    # Model 4 bases A C G T and unknown state N
    sigma = ghmm.Alphabet(['a', 'c', 'g', 't', 'n' ])
    print sigma
   
    # XXX: Proper values, based of statistics
    # The transition matrix A is chosen such that it reflects the statistics
    # Part one will try to get us into a gene, part two will tell us when to
    # get out of it
    A = [[0.6,0.4], [0.3, 0.7]]

    # The emission probabilities matrix is modeled after the statistics with a
    # total of 1
    etogene = [ 0.1, 0.1, 0.1, 0.1, 0.6 ]
    eingene = [ 0.2, 0.3, 0.3, 0.2, 0.1 ]

    B = [etogene,eingene]
    
    # Initial distribution favors outside
    pi  = [0.9, 0.1]

    m = ghmm.HMMFromMatrices(sigma,ghmm.DiscreteDistribution(sigma),A ,B, pi)
    print "Initial HMM", m

    obs_seq = m.sampleSingle(20)
    print "Observation sequence : ", obs_seq
    obs = map(sigma.external, obs_seq)
    print "Observations         : ", ''.join(obs)

    # Start codons
    # atg, (small chance) gtg
    # Stop codons
    # taa, tga
    # tag
    # lend = Left End
    # rend = Right End

    # Training and testing
    # A -- T, G -- C
    start_codons = ['atg', 'gtg']
    stop_codons = ['taa', 'tga', 'tag']
    edl_file = 'data/edl_genes.csv'

    # Counter limit of how many 'hard' errors are allowed before bailing out
    stop_limit = 100

    # Current shifting offset
    base_shift = 0
    stop_counter = 0
    
    # Sequences of contig, XXX: Make it a FASTA / BioPython Parser
    contig_seq = {}

    reader = csv.reader(open(edl_file,"rU"))
    reader.next()
    try:
        for row in reader:
            finished = False
            while not finished:
                (featureType, zNumber, contig, lend, rend, orientation, 
                segmentType, oIslandNumber, geneName, note, function, product, 
                translationNotes) = row
                lend = int(lend)
                rend = int(rend)
                if not contig_seq.has_key(contig):
                    # Load data
                    try:
                        handle = open(contig + ".raw","rU")
                        contig_seq[contig] = handle.read()
                        handle.close()
                    except IOError:
                        print "Unable to open '%s'" % (sys.argv[1])
                        sys.exit(64)
                seq = contig_seq[contig]
                
                # Make a forward orientation mark as positive
                if orientation == '>':
                    order = True
                else:
                    order = False

                # Living the world upside down to order is off
                if order:
                    start_pos = base_shift + lend - 1
                    stop_pos = base_shift + rend - 3
                else:
                    start_pos = base_shift + rend -3
                    stop_pos = base_shift + lend - 1
                    
                start_codon = get_codon(seq,start_pos,order)
                stop_codon = get_codon(seq,stop_pos,order)
                start_match = start_codon in start_codons
                stop_match = stop_codon in stop_codons

                print "%6s, %s, %s, %s, %s, %s, %s, %s, %s, %s," % (geneName,lend,rend,start_pos, stop_pos, contig, start_codon, stop_codon, orientation, base_shift),
                print "%s, %s" % (start_match, stop_match)
                if not start_match or not stop_match:
                    print "### Function (comment):", function
    
                # Check for fucked up offsets, shifts
                start_shift = []
                stop_shift = [] 
                # Technically speaking should the cope only the reading frames,
                # but what the heck looking if 'free', but make sure to include the original frames as well
                search_range = abs(base_shift) + 3
                if not start_match:
                    # Somewhere else in the reading frame?
                    matches = []
                    for r in range(-search_range,search_range+1):
                        l = start_pos + r
                        t = get_codon(seq,l,order)
                        new_match = t in start_codons
                        if new_match:
                            matches.append("%i:%s" % (r, t))
                            start_shift.append(r)
                    if start_shift:
                        print "# Start codon reading frame matches: ", ",".join(matches)
                if not stop_match:
                    # Somewhere else in the reading frame?
                    matches = []
                    for r in range(-search_range,search_range+1):
                        l = stop_pos + r
                        t = get_codon(seq,l,order)
                        new_match = t in stop_codons
                        if new_match:
                            stop_shift.append(r)
                            matches.append("%i:%s" % (r, t))
                    if stop_shift:
                        print "# Stop codon reading frame matches: ", ",".join(matches)


                # Both wrong is something screwy in the data, check offset fix
                if not start_match and not stop_match:
                    common_shift = list(set(start_shift) & set(stop_shift))
                    if common_shift:
                        print "# Matching shifts: " + ",".join(map(str,common_shift))
                        # Get the value closest to 0 for shifting purposes
                        # Currently asume no negative shifts
                        base_shift += min(common_shift)
                        finished = False
                        continue
                    # Exercise left to the reader
                    if (stop_counter > stop_limit):
                        return
                    else:
                        stop_counter += 1
                        finished = True
                else:
                    stop_counter = 0
                    finished = True

    except csv.Error, e:
        sys.exit('file %s, line %d: %s' % (filename, reader.line_num, e))


    test_seq=ghmm.EmissionSequence(sigma,list(seq[0:(len(seq) / 3)]))
    v = m.viterbi(test_seq)
    #print "fairness of test_seq: ", v

    
    # Validation
    val_seq=ghmm.EmissionSequence(sigma,list(seq[10:2000]))
    v = m.viterbi(val_seq)
    #print "Test sequence: ", v

    # XXX: Results


if __name__ == "__main__":
    ecoli_hmm()