[64] | 1 | #!/usr/bin/env python
|
---|
| 2 | #
|
---|
| 3 | # Rick van der Zwet
|
---|
| 4 | import sys
|
---|
| 5 | import csv
|
---|
| 6 | import string
|
---|
| 7 |
|
---|
| 8 | from MultiReplace import MultiReplace
|
---|
| 9 |
|
---|
| 10 | DEBUG = False
|
---|
| 11 |
|
---|
| 12 | # gtg is small chance
|
---|
| 13 | start_codons = ['atg', 'gtg']
|
---|
| 14 | stop_codons = ['taa', 'tga', 'tag']
|
---|
| 15 |
|
---|
| 16 | dna_flip = string.maketrans('atgc','tacg')
|
---|
| 17 | def get_codon(seq, position, order):
|
---|
| 18 | codon = seq[position:position+3]
|
---|
| 19 | if not order:
|
---|
| 20 | # When living on the other side of the string make sure to flip
|
---|
| 21 | # around before comparison
|
---|
| 22 | codon = codon[::-1].translate(dna_flip)
|
---|
| 23 | return(codon)
|
---|
| 24 |
|
---|
| 25 | def gene_detect():
|
---|
| 26 |
|
---|
| 27 | # Training and testing
|
---|
| 28 | # A -- T, G -- C
|
---|
| 29 | # lend = Left End
|
---|
| 30 | # rend = Right End
|
---|
| 31 | edl_file = 'data/edl_genes.csv'
|
---|
| 32 |
|
---|
| 33 | # Counter limit of how many 'hard' errors are allowed before bailing out
|
---|
| 34 | stop_limit = 100
|
---|
| 35 |
|
---|
| 36 | # Current shifting offset
|
---|
| 37 | base_shift = 0
|
---|
| 38 | stop_counter = 0
|
---|
| 39 |
|
---|
| 40 | # Sequences of contig, XXX: Make it a FASTA / BioPython Parser
|
---|
| 41 | contig_seq = {}
|
---|
| 42 | abstract_contig = {}
|
---|
| 43 |
|
---|
| 44 | try:
|
---|
| 45 | reader = csv.reader(open(edl_file,"rU"))
|
---|
| 46 | reader.next()
|
---|
| 47 | for row in reader:
|
---|
| 48 | finished = False
|
---|
| 49 | while not finished:
|
---|
| 50 | (featureType, zNumber, contig, lend, rend, orientation,
|
---|
| 51 | segmentType, oIslandNumber, geneName, note, function, product,
|
---|
| 52 | translationNotes) = row
|
---|
| 53 | lend = int(lend)
|
---|
| 54 | rend = int(rend)
|
---|
| 55 | if not contig_seq.has_key(contig):
|
---|
| 56 | # Load data
|
---|
| 57 | try:
|
---|
| 58 | handle = open(contig + ".raw","rU")
|
---|
| 59 | contig_seq[contig] = handle.read()
|
---|
| 60 | abstract_contig[contig] = list('*' * len(contig_seq[contig]))
|
---|
| 61 | handle.close()
|
---|
| 62 | except IOError:
|
---|
| 63 | print "Unable to open '%s'" % (sys.argv[1])
|
---|
| 64 | sys.exit(64)
|
---|
| 65 | seq = contig_seq[contig]
|
---|
| 66 |
|
---|
| 67 | # Make a forward orientation mark as positive
|
---|
| 68 | if orientation == '>':
|
---|
| 69 | order = True
|
---|
| 70 | else:
|
---|
| 71 | order = False
|
---|
| 72 |
|
---|
| 73 | # Living the world upside down to order is off
|
---|
| 74 | if order:
|
---|
| 75 | start_pos = base_shift + lend - 1
|
---|
| 76 | stop_pos = base_shift + rend - 3
|
---|
| 77 | else:
|
---|
| 78 | start_pos = base_shift + rend -3
|
---|
| 79 | stop_pos = base_shift + lend - 1
|
---|
| 80 |
|
---|
| 81 | start_codon = get_codon(seq,start_pos,order)
|
---|
| 82 | stop_codon = get_codon(seq,stop_pos,order)
|
---|
| 83 | start_match = start_codon in start_codons
|
---|
| 84 | stop_match = stop_codon in stop_codons
|
---|
| 85 |
|
---|
| 86 | if DEBUG: print "%6s, %s, %s, %s, %s, %s, %s, %s, %s, %s," % \
|
---|
| 87 | (geneName,lend,rend,start_pos, stop_pos, contig,
|
---|
| 88 | start_codon, stop_codon, orientation, base_shift),
|
---|
| 89 | if DEBUG: print "%s, %s" % (start_match, stop_match)
|
---|
| 90 |
|
---|
| 91 | if start_match and stop_match:
|
---|
| 92 | # Used for training DNA sequences and overlaps
|
---|
| 93 | print "Gene found %s, %s" % (start_pos, stop_pos)
|
---|
| 94 | if start_pos < stop_pos:
|
---|
| 95 | abstract_contig[contig][start_pos:start_pos+3] = ['<'] * 3
|
---|
| 96 | abstract_contig[contig][start_pos+3:stop_pos] = ['-'] * (stop_pos - (start_pos + 3))
|
---|
| 97 | abstract_contig[contig][stop_pos:stop_pos+3] = ['>'] * 3
|
---|
| 98 | break
|
---|
| 99 |
|
---|
| 100 | if not start_match or not stop_match:
|
---|
| 101 | if DEBUG: print "### Function (comment):", function
|
---|
| 102 |
|
---|
| 103 | # Check for fucked up offsets, shifts
|
---|
| 104 | start_shift = []
|
---|
| 105 | stop_shift = []
|
---|
| 106 | # Technically speaking should the cope only the reading frames,
|
---|
| 107 | # but what the heck looking if 'free', but make sure to include
|
---|
| 108 | # the original frames as well
|
---|
| 109 | search_range = abs(base_shift) + 3
|
---|
| 110 | if not start_match:
|
---|
| 111 | # Somewhere else in the reading frame?
|
---|
| 112 | matches = []
|
---|
| 113 | for r in range(-search_range,search_range+1):
|
---|
| 114 | l = start_pos + r
|
---|
| 115 | t = get_codon(seq,l,order)
|
---|
| 116 | new_match = t in start_codons
|
---|
| 117 | if new_match:
|
---|
| 118 | matches.append("%i:%s" % (r, t))
|
---|
| 119 | start_shift.append(r)
|
---|
| 120 | if start_shift:
|
---|
| 121 | if DEBUG: print "# Start codon reading frame matches: ", ",".join(matches)
|
---|
| 122 | if not stop_match:
|
---|
| 123 | # Somewhere else in the reading frame?
|
---|
| 124 | matches = []
|
---|
| 125 | for r in range(-search_range,search_range+1):
|
---|
| 126 | l = stop_pos + r
|
---|
| 127 | t = get_codon(seq,l,order)
|
---|
| 128 | new_match = t in stop_codons
|
---|
| 129 | if new_match:
|
---|
| 130 | stop_shift.append(r)
|
---|
| 131 | matches.append("%i:%s" % (r, t))
|
---|
| 132 | if stop_shift:
|
---|
| 133 | if DEBUG: print "# Stop codon reading frame matches: ", ",".join(matches)
|
---|
| 134 |
|
---|
| 135 |
|
---|
| 136 | # Both wrong is something screwy in the data, check offset fix
|
---|
| 137 | if not start_match and not stop_match:
|
---|
| 138 | common_shift = list(set(start_shift) & set(stop_shift))
|
---|
| 139 | if common_shift:
|
---|
| 140 | if DEBUG: print "# Matching shifts: " + ",".join(map(str,common_shift))
|
---|
| 141 | # Get the value closest to 0 for shifting purposes
|
---|
| 142 | # Currently asume no negative shifts
|
---|
| 143 | base_shift += min(common_shift)
|
---|
| 144 | finished = False
|
---|
| 145 | continue
|
---|
| 146 | # Exercise left to the reader
|
---|
| 147 | if (stop_counter > stop_limit):
|
---|
| 148 | return
|
---|
| 149 | else:
|
---|
| 150 | stop_counter += 1
|
---|
| 151 | finished = True
|
---|
| 152 | else:
|
---|
| 153 | stop_counter = 0
|
---|
| 154 | finished = True
|
---|
| 155 |
|
---|
| 156 | except csv.Error, e:
|
---|
| 157 | sys.exit('file %s, line %d: %s' % (filename, reader.line_num, e))
|
---|
| 158 |
|
---|
| 159 | for contig in abstract_contig.keys():
|
---|
| 160 | handle = open(contig + "-gene.raw", "w")
|
---|
| 161 | handle.write(''.join(abstract_contig[contig]))
|
---|
| 162 | handle.close()
|
---|
| 163 |
|
---|
| 164 | if __name__ == "__main__":
|
---|
| 165 | gene_detect()
|
---|