#!/usr/bin/env python # http://en.wikipedia.org/wiki/Stop_codon # http://en.wikipedia.org/wiki/Escherichia_coli # http://en.wikipedia.org/wiki/Open_reading_frame # http://nl.wikipedia.org/wiki/Genetische_code import sys import csv import string # http://ghmm.sourceforge.net/ import ghmm # The mapping is kind of odd, as 'r' could mean either 'g' or 'a', without any clear distintion fasta_translate = { 'r' : 'ga', # purine 'y' : 'tc', # pyrimide 'k' : 'gt', # keto 'm' : 'ac', # amino 's' : 'gc', # strong 'w' : 'at', # weak 'b' : 'gtc', 'd' : 'gat', 'h' : 'act', 'v' : 'gca', } DEBUG = True dna_flip = string.maketrans('atgc','tacg') def get_codon(seq, position, order): codon = seq[position:position+3] if not order: # When living on the other side of the string make sure to flip # around before comparison codon = codon[::-1].translate(dna_flip) return(codon) def ecoli_hmm(): """Try to find genes inside e sequence using a HMM""" # Model 4 bases A C G T and unknown state N sigma = ghmm.Alphabet(['a', 'c', 'g', 't', 'n' ]) print sigma # XXX: Proper values, based of statistics # The transition matrix A is chosen such that it reflects the statistics # Part one will try to get us into a gene, part two will tell us when to # get out of it A = [[0.6,0.4], [0.3, 0.7]] # The emission probabilities matrix is modeled after the statistics with a # total of 1 etogene = [ 0.1, 0.1, 0.1, 0.1, 0.6 ] eingene = [ 0.2, 0.3, 0.3, 0.2, 0.1 ] B = [etogene,eingene] # Initial distribution favors outside pi = [0.9, 0.1] m = ghmm.HMMFromMatrices(sigma,ghmm.DiscreteDistribution(sigma),A ,B, pi) print "Initial HMM", m obs_seq = m.sampleSingle(20) print "Observation sequence : ", obs_seq obs = map(sigma.external, obs_seq) print "Observations : ", ''.join(obs) # Start codons # atg, (small chance) gtg # Stop codons # taa, tga # tag # lend = Left End # rend = Right End # Training and testing # A -- T, G -- C start_codons = ['atg', 'gtg'] stop_codons = ['taa', 'tga', 'tag'] edl_file = 'data/edl_genes.csv' # Counter limit of how many 'hard' errors are allowed before bailing out stop_limit = 100 # Current shifting offset base_shift = 0 stop_counter = 0 # Sequences of contig, XXX: Make it a FASTA / BioPython Parser contig_seq = {} reader = csv.reader(open(edl_file,"rU")) reader.next() try: for row in reader: finished = False while not finished: (featureType, zNumber, contig, lend, rend, orientation, segmentType, oIslandNumber, geneName, note, function, product, translationNotes) = row lend = int(lend) rend = int(rend) if not contig_seq.has_key(contig): # Load data try: handle = open(contig + ".raw","rU") contig_seq[contig] = handle.read() handle.close() except IOError: print "Unable to open '%s'" % (sys.argv[1]) sys.exit(64) seq = contig_seq[contig] # Make a forward orientation mark as positive if orientation == '>': order = True else: order = False # Living the world upside down to order is off if order: start_pos = base_shift + lend - 1 stop_pos = base_shift + rend - 3 else: start_pos = base_shift + rend -3 stop_pos = base_shift + lend - 1 start_codon = get_codon(seq,start_pos,order) stop_codon = get_codon(seq,stop_pos,order) start_match = start_codon in start_codons stop_match = stop_codon in stop_codons print "%6s, %s, %s, %s, %s, %s, %s, %s, %s, %s," % (geneName,lend,rend,start_pos, stop_pos, contig, start_codon, stop_codon, orientation, base_shift), print "%s, %s" % (start_match, stop_match) if not start_match or not stop_match: print "### Function (comment):", function # Check for fucked up offsets, shifts start_shift = [] stop_shift = [] # Technically speaking should the cope only the reading frames, # but what the heck looking if 'free', but make sure to include the original frames as well search_range = abs(base_shift) + 3 if not start_match: # Somewhere else in the reading frame? matches = [] for r in range(-search_range,search_range+1): l = start_pos + r t = get_codon(seq,l,order) new_match = t in start_codons if new_match: matches.append("%i:%s" % (r, t)) start_shift.append(r) if start_shift: print "# Start codon reading frame matches: ", ",".join(matches) if not stop_match: # Somewhere else in the reading frame? matches = [] for r in range(-search_range,search_range+1): l = stop_pos + r t = get_codon(seq,l,order) new_match = t in stop_codons if new_match: stop_shift.append(r) matches.append("%i:%s" % (r, t)) if stop_shift: print "# Stop codon reading frame matches: ", ",".join(matches) # Both wrong is something screwy in the data, check offset fix if not start_match and not stop_match: common_shift = list(set(start_shift) & set(stop_shift)) if common_shift: print "# Matching shifts: " + ",".join(map(str,common_shift)) # Get the value closest to 0 for shifting purposes # Currently asume no negative shifts base_shift += min(common_shift) finished = False continue # Exercise left to the reader if (stop_counter > stop_limit): return else: stop_counter += 1 finished = True else: stop_counter = 0 finished = True except csv.Error, e: sys.exit('file %s, line %d: %s' % (filename, reader.line_num, e)) test_seq=ghmm.EmissionSequence(sigma,list(seq[0:(len(seq) / 3)])) v = m.viterbi(test_seq) #print "fairness of test_seq: ", v # Validation val_seq=ghmm.EmissionSequence(sigma,list(seq[10:2000])) v = m.viterbi(val_seq) #print "Test sequence: ", v # XXX: Results if __name__ == "__main__": ecoli_hmm()