#!/usr/bin/env python # # Parse 2 FASTA files and print statistics # BSDLicence # Rick van der Zwet - 0433373 - from Bio import SeqIO,Seq from Bio import Alphabet from Bio.Alphabet.IUPAC import ambiguous_dna,unambiguous_dna import Bio.Data.CodonTable from MultiReplace import MultiReplace from reading_frames import reading_frames # The mapping is kind of odd, as 'r' could mean either 'g' or 'a', but in our # case we map them all to unknown fasta_translate = { 'r' : 'n', # purine 'y' : 'n', # pyrimide 'k' : 'n', # keto 'm' : 'n', # amino 's' : 'n', # strong 'w' : 'n', # weak 'b' : 'n', 'd' : 'n', 'h' : 'n', 'v' : 'n', } fasta = MultiReplace(fasta_translate) def parse_file(file): handle = open(file, "rU") for seq_record in SeqIO.parse(handle, "fasta",ambiguous_dna): # How to translate damm thing into plain nucleic acid codes # http://en.wikipedia.org/wiki/FASTA_format retval = seq_record.seq.__str__() handle.close() return(retval) def stats(seq): pdict = {} for n in range(1, len(seq)): protein = seq[n] if not pdict.has_key(protein): pdict[protein] = 1 else: pdict[protein] += 1 print pdict def concat(head,tail,max_length=1000): "Concat two strings together removing common parts" l_head = len(head) l_tail = len(tail) # Start/Stop at the right time start = 1 if (l_head < l_tail): stop = l_head + 1 else: stop = l_tail + 1 # Make sure not to run for-ever if (stop > max_length): stop = max_length # Find largest common part # XXX: Not very effient (on very large overlap sets for i in reversed(range(start,stop)): #print "tail[0:%i] '%s' == '%s'" % (i, head[-i:], tail[0:i]) if head[-i:] == tail[0:i]: return((i,(tail[0:i]),head + tail[i:])) # None found return full part return(-1,'',head + tail) # File names file1 = "data/AE005174v2-1.fas" file2 = "data/AE005174v2-2.fas" # Get data seq1 = parse_file(file1) seq2 = parse_file(file2) # Simplify answers seq1 = fasta.replace(seq1) seq2 = fasta.replace(seq2) # Find overlap (retval, common, result) = concat(seq2,seq1) print retval, common print file1 stats(seq1) reading_frames(seq1) print file2 stats(seq2) reading_frames(seq2) # Strictly speaking there is a gap of about 4 kbs (4000 bs) between seq1 and # seq2, so lets' put that into the the statistics as well. Due to circular # nature, does not matter wether we add it in the beginning or in the end print "Total (inc 4 kbs gap (n))" result = result + "n" * 4000; stats(result) reading_frames(result) # Write to file for later further processing out = open("AE005174v2-1.raw","w") out.write(seq1) out.close() out = open("AE005174v2-2.raw","w") out.write(seq2) out.close() out = open("full_contig.raw","w") out.write(result) out.close()