#!/usr/bin/env python
"""Print per-record symbol counts for the AE005174v2 FASTA files.

Each record is read with Biopython, its IUPAC ambiguity codes are expanded
through ``MultiReplace`` using ``FASTA_TRANSLATE``, and the number of
occurrences of every resulting character is printed.
"""
from collections import Counter

from Bio import SeqIO,Seq
from Bio import Alphabet
from Bio.Alphabet.IUPAC import ambiguous_dna,unambiguous_dna
import Bio.Data.CodonTable
from MultiReplace import MultiReplace

# Expansion table for ambiguity codes, see
# http://en.wikipedia.org/wiki/FASTA_format
# Each code maps to the string of plain bases listed for it below.
# NOTE(review): keys are lowercase only -- confirm the input files use
# lowercase symbols, otherwise nothing will be expanded.
FASTA_TRANSLATE = {
    'r' : 'ga', # purine
    'y' : 'tc', # pyrimidine
    'k' : 'gt', # keto
    'm' : 'ac', # amino
    's' : 'gc', # strong
    'w' : 'at', # weak
    'b' : 'gtc',
    'd' : 'gat',
    'h' : 'act',
    'v' : 'gca',
}


def parse_file(file):
    """Print a symbol-frequency dict for every record in a FASTA file.

    Args:
        file: Path of the FASTA file to read. (The name shadows the
            Python 2 builtin; it is kept so existing callers that pass it
            by keyword keep working.)

    Returns:
        None. One dict per record, mapping each character of the expanded
        sequence to its number of occurrences, is printed to stdout.
    """
    # Open the path that was actually passed in (the previous version always
    # read "data/AE005174v2-1.fas"), and close the handle deterministically.
    # NOTE(review): the "U" flag is kept from the original for Python 2
    # universal-newline handling; it was removed in Python 3.11, so switch
    # to plain "r" if this script is moved to a modern interpreter.
    with open(file, "rU") as handle:
        for seq_record in SeqIO.parse(handle, "fasta", ambiguous_dna):
            sequence = str(seq_record.seq)

            # Replace every ambiguity code by the plain bases from the table.
            replacer = MultiReplace(FASTA_TRANSLATE)
            expanded = replacer.replace(sequence)

            # Count every character, including the first one (the old
            # index loop started at 1 and silently skipped position 0).
            # Converted to a plain dict so the printed form stays a dict.
            pdict = dict(Counter(expanded))
            print(pdict)


file1 = parse_file("data/AE005174v2-1.fas")
file2 = parse_file("data/AE005174v2-2.fas")