#!/usr/bin/env python
#
# Parse 2 FASTA files and print statistics
# BSDLicence
# Rick van der Zwet - 0433373 - <info@rickvaderzwet.nl>
from Bio import SeqIO,Seq
from Bio import Alphabet
from Bio.Alphabet.IUPAC import ambiguous_dna,unambiguous_dna
import Bio.Data.CodonTable
from MultiReplace  import MultiReplace
from reading_frames import reading_frames

# The mapping is kind of odd, as 'r' could mean either 'g' or 'a', but in our
# case we map them  all to unknown

fasta_translate = { 
    'r' : 'n', # purine
    'y' : 'n', # pyrimide
    'k' : 'n', # keto
    'm' : 'n', # amino 
    's' : 'n', # strong
    'w' : 'n', # weak
    'b' : 'n',
    'd' : 'n',
    'h' : 'n',
    'v' : 'n',
    }
    
fasta = MultiReplace(fasta_translate)

def parse_file(file):
    handle = open(file, "rU")
    for seq_record in SeqIO.parse(handle, "fasta",ambiguous_dna):
        # How to translate damm thing into plain nucleic acid codes
        # http://en.wikipedia.org/wiki/FASTA_format
        retval = seq_record.seq.__str__()
    
    handle.close()
    return(retval)

def stats(seq):
    pdict = {}
    for n in range(1, len(seq)):
        protein = seq[n]
        if not pdict.has_key(protein):
            pdict[protein] = 1
        else:
            pdict[protein] += 1
    
    print pdict


def concat(head,tail,max_length=1000):
    "Concat two strings together removing common parts"
    l_head = len(head)
    l_tail = len(tail)

    # Start/Stop at the right time
    start = 1
    if (l_head < l_tail):
        stop = l_head + 1
    else:
        stop = l_tail + 1

    # Make sure not to run for-ever
    if (stop > max_length):
        stop = max_length

    # Find largest common part
    # XXX: Not very effient (on very large overlap sets
    for i in reversed(range(start,stop)):
        #print "tail[0:%i] '%s' == '%s'" % (i, head[-i:], tail[0:i])
        if head[-i:] == tail[0:i]:
            return((i,(tail[0:i]),head + tail[i:]))

    # None found return full part
    return(-1,'',head + tail)

# File names
file1 = "data/AE005174v2-1.fas"
file2 = "data/AE005174v2-2.fas"

# Get data
seq1 = parse_file(file1)
seq2 = parse_file(file2)

# Simplify answers
seq1 = fasta.replace(seq1)
seq2 = fasta.replace(seq2)

# Find overlap
(retval, common, result) = concat(seq2,seq1)
print retval, common

print file1
stats(seq1)
reading_frames(seq1)
print file2
stats(seq2)
reading_frames(seq2)


# Strictly speaking there is a gap of about 4 kbs (4000 bs) between seq1 and
# seq2, so lets' put that into the the statistics as well. Due to circular
# nature, does not matter wether we add it in the beginning or in the end
print "Total (inc 4 kbs gap (n))"
result = result + "n" * 4000;
stats(result)
reading_frames(result)

# Write to file for later further processing
out = open("AE005174v2-1.raw","w")
out.write(seq1)
out.close()

out = open("AE005174v2-2.raw","w")
out.write(seq2)
out.close()

out = open("full_contig.raw","w")
out.write(result)
out.close()