Changeset 63 for liacs/dbdm/dbdm_4


Ignore:
Timestamp:
Dec 27, 2009, 1:42:15 PM (15 years ago)
Author:
Rick van der Zwet
Message:

Finally got all the matching right

Location:
liacs/dbdm/dbdm_4
Files:
3 edited

Legend:

Unmodified
Added
Removed
  • liacs/dbdm/dbdm_4/ecoli_hmm.py

    r62 r63  
    66# http://nl.wikipedia.org/wiki/Genetische_code
    77
    8 import ghmm
    98import sys
    109import csv
    1110import string
     11
     12# http://ghmm.sourceforge.net/
     13import ghmm
     14
     15# The mapping is kind of odd, as 'r' could mean either 'g' or 'a', without any clear distintion
     16fasta_translate = {
     17    'r' : 'ga', # purine
     18    'y' : 'tc', # pyrimide
     19    'k' : 'gt', # keto
     20    'm' : 'ac', # amino
     21    's' : 'gc', # strong
     22    'w' : 'at', # weak
     23    'b' : 'gtc',
     24    'd' : 'gat',
     25    'h' : 'act',
     26    'v' : 'gca',
     27    }
     28
     29DEBUG = True
    1230
    1331dna_flip = string.maketrans('atgc','tacg')
     
    2240
    2341
    24 def ecoli_hmm(seq):
     42def ecoli_hmm():
    2543    """Try to find genes inside e sequence using a HMM"""
    2644    # Model 4 bases A C G T and unknown state N
     
    6785
    6886    # Counter limit of how many 'hard' errors are allowed before bailing out
    69     stop_limit = 5
     87    stop_limit = 100
    7088
    7189    # Current shifting offset
    7290    base_shift = 0
    7391    stop_counter = 0
     92   
     93    # Sequences of contig, XXX: Make it a FASTA / BioPython Parser
     94    contig_seq = {}
    7495
    7596    reader = csv.reader(open(edl_file,"rU"))
     
    84105                lend = int(lend)
    85106                rend = int(rend)
     107                if not contig_seq.has_key(contig):
     108                    # Load data
     109                    try:
     110                        handle = open(contig + ".raw","rU")
     111                        contig_seq[contig] = handle.read()
     112                        handle.close()
     113                    except IOError:
     114                        print "Unable to open '%s'" % (sys.argv[1])
     115                        sys.exit(64)
     116                seq = contig_seq[contig]
     117               
    86118                # Make a forward orientation mark as positive
    87119                if orientation == '>':
     
    103135                stop_match = stop_codon in stop_codons
    104136
    105                 print "%6s, %s, %s, %s, %s, %s," % (geneName,lend,rend, start_codon, stop_codon, orientation),
     137                print "%6s, %s, %s, %s, %s, %s, %s, %s, %s, %s," % (geneName,lend,rend,start_pos, stop_pos, contig, start_codon, stop_codon, orientation, base_shift),
    106138                print "%s, %s" % (start_match, stop_match)
     139                if not start_match or not stop_match:
     140                    print "### Function (comment):", function
    107141   
    108142                # Check for fucked up offsets, shifts
    109                 start_shift = None
    110                 stop_shift = None
     143                start_shift = []
     144                stop_shift = []
    111145                # Technically speaking should the cope only the reading frames,
    112                 # but what the heck looking if 'free'
    113                 search_range = 10
     146                # but what the heck looking if 'free', but make sure to include the original frames as well
     147                search_range = abs(base_shift) + 3
    114148                if not start_match:
    115149                    # Somewhere else in the reading frame?
     150                    matches = []
    116151                    for r in range(-search_range,search_range+1):
    117152                        l = start_pos + r
     
    119154                        new_match = t in start_codons
    120155                        if new_match:
    121                             start_shift = r
    122                             print "# Start codon reading frame match - shift %i: " % r,
    123                             print "%i, %s: %s" % (l,t,new_match)
     156                            matches.append("%i:%s" % (r, t))
     157                            start_shift.append(r)
     158                    if start_shift:
     159                        print "# Start codon reading frame matches: ", ",".join(matches)
    124160                if not stop_match:
    125161                    # Somewhere else in the reading frame?
     162                    matches = []
    126163                    for r in range(-search_range,search_range+1):
    127164                        l = stop_pos + r
     
    129166                        new_match = t in stop_codons
    130167                        if new_match:
    131                             stop_shift = r
    132                             print "# Stop codon reading frame match - shift %i: " % r,
    133                             print "%i, %s: %s" % (l,t,new_match)
     168                            stop_shift.append(r)
     169                            matches.append("%i:%s" % (r, t))
     170                    if stop_shift:
     171                        print "# Stop codon reading frame matches: ", ",".join(matches)
    134172
    135173
    136174                # Both wrong is something screwy in the data, check offset fix
    137175                if not start_match and not stop_match:
    138                     if stop_shift != None and start_shift == stop_shift:
    139                         print "# Matching shift %i" % start_shift
    140                         base_shift += start_shift
     176                    common_shift = list(set(start_shift) & set(stop_shift))
     177                    if common_shift:
     178                        print "# Matching shifts: " + ",".join(map(str,common_shift))
     179                        # Get the value closest to 0 for shifting purposes
     180                        # Currently asume no negative shifts
     181                        base_shift += min(common_shift)
    141182                        finished = False
    142183                        continue
     
    171212
    172213if __name__ == "__main__":
    173     # Load data
    174     try:
    175         handle = open(sys.argv[1],"rU")
    176     except IndexError:
    177         print "Usage %s <data_file>" % (sys.argv[0])
    178         sys.exit(64)
    179     except IOError:
    180         print "Unable to open '%s'" % (sys.argv[1])
    181         sys.exit(64)
    182    
    183     seq = handle.read()
    184     handle.close()
    185     ecoli_hmm(seq)
     214    ecoli_hmm()
  • liacs/dbdm/dbdm_4/parse_fasta.py

    r59 r63  
    1111from reading_frames import reading_frames
    1212
     13# The mapping is kind of odd, as 'r' could mean either 'g' or 'a'
    1314fasta_translate = {
    1415    'r' : 'ga', # purine
     
    8081# Get data
    8182seq1 = parse_file(file1)
    82 seq2 = parse_file(file1)
     83seq2 = parse_file(file2)
    8384
    84 seq1 = fasta.replace(seq1)
    85 seq2 = fasta.replace(seq2)
     85# Wrong assumption, replace is not possible as the real value is not know yet
     86# seq1 = fasta.replace(seq1)
     87# seq2 = fasta.replace(seq2)
    8688
    8789# Find overlap
     
    106108
    107109# Write to file for later further processing
     110out = open("AE005174v2-1.raw","w")
     111out.write(seq1)
     112out.close()
     113
     114out = open("AE005174v2-2.raw","w")
     115out.write(seq2)
     116out.close()
     117
    108118out = open("full_contig.raw","w")
    109119out.write(result)
  • liacs/dbdm/dbdm_4/reading_frames.py

    r59 r63  
    2424    See: http://en.wikipedia.org/wiki/Genetic_code'''
    2525
    26     # Populate all empty
    2726    final = {}
    28     for a in 'acgtn':
    29         for b in 'acgtn':
    30             for c in 'acgtn':
    31                 final[a + b + c] = [0,0,0,0,0,0]
    3227
    3328    for start in [0,1,2]:
     
    3530        retval = _frame_stat(seq,start)
    3631        for codon,v in retval.iteritems():
     32            if not final.has_key(codon):
     33                final[codon] = [0,0,0,0,0,0]
    3734            final[codon][start] += v
    3835        print "Reverse; start %i" % (start)
    3936        retval = _frame_stat(seq[::-1],start)
    4037        for codon,v in retval.iteritems():
     38            if not final.has_key(codon):
     39                final[codon] = [0,0,0,0,0,0]
    4140            final[codon][start+3] += v
    4241
Note: See TracChangeset for help on using the changeset viewer.