#!/usr/bin/env python # # Rick van der Zwet import sys import csv import string from MultiReplace import MultiReplace DEBUG = False # gtg is small chance start_codons = ['atg', 'gtg'] stop_codons = ['taa', 'tga', 'tag'] dna_flip = string.maketrans('atgc','tacg') def get_codon(seq, position, order): codon = seq[position:position+3] if not order: # When living on the other side of the string make sure to flip # around before comparison codon = codon[::-1].translate(dna_flip) return(codon) def gene_detect(): # Training and testing # A -- T, G -- C # lend = Left End # rend = Right End edl_file = 'data/edl_genes.csv' # Counter limit of how many 'hard' errors are allowed before bailing out stop_limit = 100 # Current shifting offset base_shift = 0 stop_counter = 0 # Sequences of contig, XXX: Make it a FASTA / BioPython Parser contig_seq = {} abstract_contig = {} try: reader = csv.reader(open(edl_file,"rU")) reader.next() for row in reader: finished = False while not finished: (featureType, zNumber, contig, lend, rend, orientation, segmentType, oIslandNumber, geneName, note, function, product, translationNotes) = row lend = int(lend) rend = int(rend) if not contig_seq.has_key(contig): # Load data try: handle = open(contig + ".raw","rU") contig_seq[contig] = handle.read() abstract_contig[contig] = list('.' * len(contig_seq[contig])) handle.close() except IOError: print "Unable to open '%s'" % (sys.argv[1]) sys.exit(64) seq = contig_seq[contig] # Make a forward orientation mark as positive if orientation == '>': order = True else: order = False # Living the world upside down to order is off if order: start_pos = base_shift + lend - 1 stop_pos = base_shift + rend - 3 else: start_pos = base_shift + rend -3 stop_pos = base_shift + lend - 1 start_codon = get_codon(seq,start_pos,order) stop_codon = get_codon(seq,stop_pos,order) start_match = start_codon in start_codons stop_match = stop_codon in stop_codons if DEBUG: print "%6s, %s, %s, %s, %s, %s, %s, %s, %s, %s," % \ (geneName,lend,rend,start_pos, stop_pos, contig, start_codon, stop_codon, orientation, base_shift), if DEBUG: print "%s, %s" % (start_match, stop_match) if start_match and stop_match: # Used for training DNA sequences and overlaps print "Gene found %s, %s" % (start_pos, stop_pos) if start_pos < stop_pos: abstract_contig[contig][start_pos:start_pos+3] = ['<'] * 3 abstract_contig[contig][start_pos+3:stop_pos] = ['|'] * (stop_pos - (start_pos + 3)) abstract_contig[contig][stop_pos:stop_pos+3] = ['>'] * 3 break if not start_match or not stop_match: if DEBUG: print "### Function (comment):", function # Check for fucked up offsets, shifts start_shift = [] stop_shift = [] # Technically speaking should the cope only the reading frames, # but what the heck looking if 'free', but make sure to include # the original frames as well search_range = abs(base_shift) + 3 if not start_match: # Somewhere else in the reading frame? matches = [] for r in range(-search_range,search_range+1): l = start_pos + r t = get_codon(seq,l,order) new_match = t in start_codons if new_match: matches.append("%i:%s" % (r, t)) start_shift.append(r) if start_shift: if DEBUG: print "# Start codon reading frame matches: ", ",".join(matches) if not stop_match: # Somewhere else in the reading frame? matches = [] for r in range(-search_range,search_range+1): l = stop_pos + r t = get_codon(seq,l,order) new_match = t in stop_codons if new_match: stop_shift.append(r) matches.append("%i:%s" % (r, t)) if stop_shift: if DEBUG: print "# Stop codon reading frame matches: ", ",".join(matches) # Both wrong is something screwy in the data, check offset fix if not start_match and not stop_match: common_shift = list(set(start_shift) & set(stop_shift)) if common_shift: if DEBUG: print "# Matching shifts: " + ",".join(map(str,common_shift)) # Get the value closest to 0 for shifting purposes # Currently asume no negative shifts base_shift += min(common_shift) finished = False continue # Exercise left to the reader if (stop_counter > stop_limit): return else: stop_counter += 1 finished = True else: stop_counter = 0 finished = True except csv.Error, e: sys.exit('file %s, line %d: %s' % (filename, reader.line_num, e)) for contig in abstract_contig.keys(): handle = open(contig + "-gene.raw", "w") handle.write(''.join(abstract_contig[contig])) handle.close() if __name__ == "__main__": gene_detect()