Changeset 64


Ignore:
Timestamp:
Dec 27, 2009, 7:57:56 PM (15 years ago)
Author:
Rick van der Zwet
Message:

Sample GHMM, needs statistics and verifing still

Location:
liacs/dbdm/dbdm_4
Files:
2 added
2 edited

Legend:

Unmodified
Added
Removed
  • liacs/dbdm/dbdm_4/ecoli_hmm.py

    r63 r64  
    1212# http://ghmm.sourceforge.net/
    1313import ghmm
     14
     15from MultiReplace  import MultiReplace
    1416
    1517# The mapping is kind of odd, as 'r' could mean either 'g' or 'a', without any clear distintion
     
    2729    }
    2830
    29 DEBUG = True
     31dna_ascii_translate = {
     32    '0' : '*',
     33    '1' : '<',
     34    '2' : '<',
     35    '3' : '<',
     36    '4' : '-',
     37    '5' : '>',
     38    '6' : '>',
     39    '7' : '>',
     40    }
    3041
    31 dna_flip = string.maketrans('atgc','tacg')
    32 def get_codon(seq, position, order):
    33     codon = seq[position:position+3]
    34     if not order:
    35         # When living on the other side of the string make sure to flip
    36         # around before comparison
    37         codon = codon[::-1].translate(dna_flip)
    38     return(codon)
     42dna_ascii = MultiReplace(dna_ascii_translate)
    3943
     44def pretty_print(test_seq, ans_seq, v, length=70, parts=10, seperator=''):
     45    """ Pretty printing of output for verification purposes """
     46    for i in range(0,len(v[0]),length):
     47        seq = []
     48        ans = []
     49        result = []
     50        for j in range(0,length,parts):
     51            t = i + j
     52            seq.append(test_seq[t:t+parts])
     53            ans.append(ans_seq[t:t+parts])
     54            result.append(dna_ascii.replace(''.join(map(str,v[0][t:t+parts]))))
     55
     56        print seperator.join(seq)
     57        print seperator.join(ans)
     58        print seperator.join(result)
     59        print ''
     60    print "fairness of test_seq: ", v[1]
    4061
    4162
     
    4869    # XXX: Proper values, based of statistics
    4970    # The transition matrix A is chosen such that it reflects the statistics
    50     # Part one will try to get us into a gene, part two will tell us when to
    51     # get out of it
    52     A = [[0.6,0.4], [0.3, 0.7]]
     71    # Probalities from moving from one state to an other
     72    # 0) Outer-gene  : will try to get us into a gene
     73    # 1) Start-codon : beginning of gene - part 1
     74    # 2) Start-codon : beginning of gene - part 2
     75    # 3) Start-codon : beginning of gene - part 3
     76    # 4) Inside-gene : in the gene
     77    # 5) Stop-codon  : end of gene - part 1
     78    # 6) Stop-codon  : end of gene - part 2
     79    # 7) Stop-codon  : end of gene - part 3
     80    A = [
     81            [0.8, 0.2, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
     82            [0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0],
     83            [0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0],
     84            [0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0],
     85            [0.0, 0.0, 0.0, 0.0, 0.7, 0.3, 0.0, 0.0],
     86            [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0],
     87            [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0],
     88            [1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
     89        ]
    5390
    54     # The emission probabilities matrix is modeled after the statistics with a
    55     # total of 1
    56     etogene = [ 0.1, 0.1, 0.1, 0.1, 0.6 ]
    57     eingene = [ 0.2, 0.3, 0.3, 0.2, 0.1 ]
    58 
    59     B = [etogene,eingene]
     91    # XXX: Proper values, based of statistics
     92    # The emission probabilities matrix is modeled after the statistics
     93    # (['a', 'c', 'g', 't', 'n' ]
     94    B = [
     95            # e.g. state 0 -> emission probability
     96            [0.2, 0.2, 0.2, 0.2, 0.2] ,
     97            [0.9, 0.0, 0.1, 0.0, 0.0] ,
     98            [0.0, 0.0, 0.0, 1.0, 0.0] ,
     99            [0.0, 0.0, 1.0, 0.0, 0.0] ,
     100            [0.2, 0.2, 0.2, 0.2, 0.2] ,
     101            [0.0, 0.0, 0.0, 1.0, 0.0] ,
     102            [0.7, 0.0, 0.3, 0.0, 0.0] ,
     103            [0.7, 0.0, 0.3, 0.0, 0.0] ,
     104        ]
    60105   
    61106    # Initial distribution favors outside
    62     pi  = [0.9, 0.1]
     107    pi  = [0.9] + [0.1/7] * 7
    63108
    64109    m = ghmm.HMMFromMatrices(sigma,ghmm.DiscreteDistribution(sigma),A ,B, pi)
    65     print "Initial HMM", m
     110    print "Initial HMM"
     111    print m
    66112
    67113    obs_seq = m.sampleSingle(20)
     
    70116    print "Observations         : ", ''.join(obs)
    71117
    72     # Start codons
    73     # atg, (small chance) gtg
    74     # Stop codons
    75     # taa, tga
    76     # tag
    77     # lend = Left End
    78     # rend = Right End
     118    answer = {}
     119    handle = open('AE005174v2-2-gene.raw', 'rU')
     120    answer['AE005174v2-2'] = handle.read()
     121    handle.close()
    79122
    80     # Training and testing
    81     # A -- T, G -- C
    82     start_codons = ['atg', 'gtg']
    83     stop_codons = ['taa', 'tga', 'tag']
    84     edl_file = 'data/edl_genes.csv'
     123    contig_seq = {}
     124    handle = open('AE005174v2-2.raw', 'rU')
     125    contig_seq['AE005174v2-2'] = handle.read()
     126    handle.close()
    85127
    86     # Counter limit of how many 'hard' errors are allowed before bailing out
    87     stop_limit = 100
     128    handle = open('AE005174v2-1.raw', 'rU')
     129    contig_seq['AE005174v2-1'] = handle.read()
     130    handle.close()
    88131
    89     # Current shifting offset
    90     base_shift = 0
    91     stop_counter = 0
    92    
    93     # Sequences of contig, XXX: Make it a FASTA / BioPython Parser
    94     contig_seq = {}
    95 
    96     reader = csv.reader(open(edl_file,"rU"))
    97     reader.next()
    98     try:
    99         for row in reader:
    100             finished = False
    101             while not finished:
    102                 (featureType, zNumber, contig, lend, rend, orientation,
    103                 segmentType, oIslandNumber, geneName, note, function, product,
    104                 translationNotes) = row
    105                 lend = int(lend)
    106                 rend = int(rend)
    107                 if not contig_seq.has_key(contig):
    108                     # Load data
    109                     try:
    110                         handle = open(contig + ".raw","rU")
    111                         contig_seq[contig] = handle.read()
    112                         handle.close()
    113                     except IOError:
    114                         print "Unable to open '%s'" % (sys.argv[1])
    115                         sys.exit(64)
    116                 seq = contig_seq[contig]
    117                
    118                 # Make a forward orientation mark as positive
    119                 if orientation == '>':
    120                     order = True
    121                 else:
    122                     order = False
    123 
    124                 # Living the world upside down to order is off
    125                 if order:
    126                     start_pos = base_shift + lend - 1
    127                     stop_pos = base_shift + rend - 3
    128                 else:
    129                     start_pos = base_shift + rend -3
    130                     stop_pos = base_shift + lend - 1
    131                    
    132                 start_codon = get_codon(seq,start_pos,order)
    133                 stop_codon = get_codon(seq,stop_pos,order)
    134                 start_match = start_codon in start_codons
    135                 stop_match = stop_codon in stop_codons
    136 
    137                 print "%6s, %s, %s, %s, %s, %s, %s, %s, %s, %s," % (geneName,lend,rend,start_pos, stop_pos, contig, start_codon, stop_codon, orientation, base_shift),
    138                 print "%s, %s" % (start_match, stop_match)
    139                 if not start_match or not stop_match:
    140                     print "### Function (comment):", function
    141    
    142                 # Check for fucked up offsets, shifts
    143                 start_shift = []
    144                 stop_shift = []
    145                 # Technically speaking should the cope only the reading frames,
    146                 # but what the heck looking if 'free', but make sure to include the original frames as well
    147                 search_range = abs(base_shift) + 3
    148                 if not start_match:
    149                     # Somewhere else in the reading frame?
    150                     matches = []
    151                     for r in range(-search_range,search_range+1):
    152                         l = start_pos + r
    153                         t = get_codon(seq,l,order)
    154                         new_match = t in start_codons
    155                         if new_match:
    156                             matches.append("%i:%s" % (r, t))
    157                             start_shift.append(r)
    158                     if start_shift:
    159                         print "# Start codon reading frame matches: ", ",".join(matches)
    160                 if not stop_match:
    161                     # Somewhere else in the reading frame?
    162                     matches = []
    163                     for r in range(-search_range,search_range+1):
    164                         l = stop_pos + r
    165                         t = get_codon(seq,l,order)
    166                         new_match = t in stop_codons
    167                         if new_match:
    168                             stop_shift.append(r)
    169                             matches.append("%i:%s" % (r, t))
    170                     if stop_shift:
    171                         print "# Stop codon reading frame matches: ", ",".join(matches)
     132    test_seq = contig_seq['AE005174v2-2'][0:490]
     133    ans_seq = answer['AE005174v2-2'][0:490]
     134    test_eseq=ghmm.EmissionSequence(sigma,list(test_seq))
     135    v = m.viterbi(test_eseq)
     136    pretty_print(test_seq, ans_seq, v)
    172137
    173138
    174                 # Both wrong is something screwy in the data, check offset fix
    175                 if not start_match and not stop_match:
    176                     common_shift = list(set(start_shift) & set(stop_shift))
    177                     if common_shift:
    178                         print "# Matching shifts: " + ",".join(map(str,common_shift))
    179                         # Get the value closest to 0 for shifting purposes
    180                         # Currently asume no negative shifts
    181                         base_shift += min(common_shift)
    182                         finished = False
    183                         continue
    184                     # Exercise left to the reader
    185                     if (stop_counter > stop_limit):
    186                         return
    187                     else:
    188                         stop_counter += 1
    189                         finished = True
    190                 else:
    191                     stop_counter = 0
    192                     finished = True
     139    # Train sequence
     140    print "Training baumWelch"
     141    train_seq = ghmm.EmissionSequence(sigma,list(contig_seq['AE005174v2-1']))
     142    v = m.baumWelch(train_seq)
     143    print m
     144   
    193145
    194     except csv.Error, e:
    195         sys.exit('file %s, line %d: %s' % (filename, reader.line_num, e))
    196 
    197 
    198        
    199     test_seq=ghmm.EmissionSequence(sigma,list(seq[0:(len(seq) / 3)]))
    200     v = m.viterbi(test_seq)
    201     #print "fairness of test_seq: ", v
    202 
     146    print "Results after training sequence..."
     147    v = m.viterbi(test_eseq)
     148    pretty_print(test_seq, ans_seq, v)
    203149   
    204     # Validation
    205     val_seq=ghmm.EmissionSequence(sigma,list(seq[10:2000]))
    206     v = m.viterbi(val_seq)
    207     #print "Test sequence: ", v
    208150
    209151    # XXX: Results
    210152
    211 
    212 
    213153if __name__ == "__main__":
    214154    ecoli_hmm()
  • liacs/dbdm/dbdm_4/parse_fasta.py

    r63 r64  
    1111from reading_frames import reading_frames
    1212
    13 # The mapping is kind of odd, as 'r' could mean either 'g' or 'a'
     13# The mapping is kind of odd, as 'r' could mean either 'g' or 'a', but in our
     14# case we map them  all to unknown
     15
    1416fasta_translate = {
    15     'r' : 'ga', # purine
    16     'y' : 'tc', # pyrimide
    17     'k' : 'gt', # keto
    18     'm' : 'ac', # amino
    19     's' : 'gc', # strong
    20     'w' : 'at', # weak
    21     'b' : 'gtc',
    22     'd' : 'gat',
    23     'h' : 'act',
    24     'v' : 'gca',
     17    'r' : 'n', # purine
     18    'y' : 'n', # pyrimide
     19    'k' : 'n', # keto
     20    'm' : 'n', # amino
     21    's' : 'n', # strong
     22    'w' : 'n', # weak
     23    'b' : 'n',
     24    'd' : 'n',
     25    'h' : 'n',
     26    'v' : 'n',
    2527    }
    2628   
     
    8385seq2 = parse_file(file2)
    8486
    85 # Wrong assumption, replace is not possible as the real value is not know yet
    86 # seq1 = fasta.replace(seq1)
    87 # seq2 = fasta.replace(seq2)
     87# Simplify answers
     88seq1 = fasta.replace(seq1)
     89seq2 = fasta.replace(seq2)
    8890
    8991# Find overlap
Note: See TracChangeset for help on using the changeset viewer.