#!/usr/bin/env python
#
# Rick van der Zwet
import sys
import csv
import string

from MultiReplace  import MultiReplace

DEBUG = False

# gtg is small chance
start_codons = ['atg', 'gtg']
stop_codons = ['taa', 'tga', 'tag']

dna_flip = string.maketrans('atgc','tacg')
def get_codon(seq, position, order):
    codon = seq[position:position+3]
    if not order:
        # When living on the other side of the string make sure to flip 
        # around before comparison
        codon = codon[::-1].translate(dna_flip)
    return(codon)

def gene_detect():

    # Training and testing
    # A -- T, G -- C
    # lend = Left End
    # rend = Right End
    edl_file = 'data/edl_genes.csv'

    # Counter limit of how many 'hard' errors are allowed before bailing out
    stop_limit = 100

    # Current shifting offset
    base_shift = 0
    stop_counter = 0
    
    # Sequences of contig, XXX: Make it a FASTA / BioPython Parser
    contig_seq = {}
    abstract_contig = {}

    try:
        reader = csv.reader(open(edl_file,"rU"))
        reader.next()
        for row in reader:
            finished = False
            while not finished:
                (featureType, zNumber, contig, lend, rend, orientation, 
                segmentType, oIslandNumber, geneName, note, function, product, 
                translationNotes) = row
                lend = int(lend)
                rend = int(rend)
                if not contig_seq.has_key(contig):
                    # Load data
                    try:
                        handle = open(contig + ".raw","rU")
                        contig_seq[contig] = handle.read()
                        abstract_contig[contig] = list('.' * len(contig_seq[contig]))
                        handle.close()
                    except IOError:
                        print "Unable to open '%s'" % (sys.argv[1])
                        sys.exit(64)
                seq = contig_seq[contig]
                
                # Make a forward orientation mark as positive
                if orientation == '>':
                    order = True
                else:
                    order = False

                # Living the world upside down to order is off
                if order:
                    start_pos = base_shift + lend - 1
                    stop_pos = base_shift + rend - 3
                else:
                    start_pos = base_shift + rend -3
                    stop_pos = base_shift + lend - 1
                    
                start_codon = get_codon(seq,start_pos,order)
                stop_codon = get_codon(seq,stop_pos,order)
                start_match = start_codon in start_codons
                stop_match = stop_codon in stop_codons

                if DEBUG: print "%6s, %s, %s, %s, %s, %s, %s, %s, %s, %s," % \
                    (geneName,lend,rend,start_pos, stop_pos, contig,
                    start_codon, stop_codon, orientation, base_shift),
                if DEBUG: print "%s, %s" % (start_match, stop_match)

                if start_match and stop_match:
                    # Used for training DNA sequences and overlaps
                    print "Gene found %s, %s" % (start_pos, stop_pos)
                    if start_pos < stop_pos:
                        abstract_contig[contig][start_pos:start_pos+3] = ['<'] * 3
                        abstract_contig[contig][start_pos+3:stop_pos] = ['|'] * (stop_pos - (start_pos + 3))
                        abstract_contig[contig][stop_pos:stop_pos+3] = ['>'] * 3
                    break

                if not start_match or not stop_match:
                    if DEBUG: print "### Function (comment):", function
    
                # Check for fucked up offsets, shifts
                start_shift = []
                stop_shift = [] 
                # Technically speaking should the cope only the reading frames,
                # but what the heck looking if 'free', but make sure to include
                # the original frames as well
                search_range = abs(base_shift) + 3
                if not start_match:
                    # Somewhere else in the reading frame?
                    matches = []
                    for r in range(-search_range,search_range+1):
                        l = start_pos + r
                        t = get_codon(seq,l,order)
                        new_match = t in start_codons
                        if new_match:
                            matches.append("%i:%s" % (r, t))
                            start_shift.append(r)
                    if start_shift:
                        if DEBUG: print "# Start codon reading frame matches: ", ",".join(matches)
                if not stop_match:
                    # Somewhere else in the reading frame?
                    matches = []
                    for r in range(-search_range,search_range+1):
                        l = stop_pos + r
                        t = get_codon(seq,l,order)
                        new_match = t in stop_codons
                        if new_match:
                            stop_shift.append(r)
                            matches.append("%i:%s" % (r, t))
                    if stop_shift:
                        if DEBUG: print "# Stop codon reading frame matches: ", ",".join(matches)


                # Both wrong is something screwy in the data, check offset fix
                if not start_match and not stop_match:
                    common_shift = list(set(start_shift) & set(stop_shift))
                    if common_shift:
                        if DEBUG: print "# Matching shifts: " + ",".join(map(str,common_shift))
                        # Get the value closest to 0 for shifting purposes
                        # Currently asume no negative shifts
                        base_shift += min(common_shift)
                        finished = False
                        continue
                    # Exercise left to the reader
                    if (stop_counter > stop_limit):
                        return
                    else:
                        stop_counter += 1
                        finished = True
                else:
                    stop_counter = 0
                    finished = True

    except csv.Error, e:
        sys.exit('file %s, line %d: %s' % (filename, reader.line_num, e))

    for contig in abstract_contig.keys():
        handle = open(contig + "-gene.raw", "w")
        handle.write(''.join(abstract_contig[contig]))
        handle.close()

if __name__ == "__main__":
    gene_detect()