Ignore:
Timestamp:
Dec 23, 2009, 1:01:23 PM (15 years ago)
Author:
Rick van der Zwet
Message:

Start hacking to try to understand the so 'simple' csv file :-)

File:
1 edited

Legend:

Unmodified
Added
Removed
  • liacs/dbdm/dbdm_4/ecoli_hmm.py

    r60 r61  
    44# http://en.wikipedia.org/wiki/Escherichia_coli
    55# http://en.wikipedia.org/wiki/Open_reading_frame
     6# http://nl.wikipedia.org/wiki/Genetische_code
    67
    78import ghmm
    89import sys
     10import csv
     11import string
    912
    1013
     
    2326    # The emission probabilities matrix is modeled after the statistics with a
    2427    # total of 1
    25     etogene = [ 0.2, 0.3, 0.3, 0.1, 0.2 ]
     28    etogene = [ 0.1, 0.1, 0.1, 0.1, 0.6 ]
    2629    eingene = [ 0.2, 0.3, 0.3, 0.2, 0.1 ]
    2730
     
    3942    print "Observations         : ", ''.join(obs)
    4043
    41     # XXX: Training
     44    # Start codons
     45    # atg, (small chance) gtg
     46    # Stop codons
     47    # taa, tga
     48    # tag
     49    # lend = Left End
     50    # rend = Right End
     51
     52    # Training
     53    # A -- T, G -- C
     54    start_condons = ['atg', 'gtg']
     55    stop_condons = ['taa', 'tga', 'tag']
     56    dna_flip = string.maketrans('atgc','tacg')
     57    edl_file = 'data/edl_genes.csv'
     58    reader = csv.reader(open(edl_file,"rU"))
     59    reader.next()
     60    try:
     61        for row in reader:
     62            (featureType, zNumber, contig, lend, rend, orientation,
     63            segmentType, oIslandNumber, geneName, note, function, product,
     64            translationNotes) = row
     65            lend = int(lend) - 1
     66            rend = int(rend)
     67            # Make a forward orientation as positive
     68            if orientation == '>':
     69                order = True
     70            else:
     71                order = False
     72
     73            if order:
     74                start_codon = seq[lend:lend+3]
     75                stop_codon = seq[rend-3:rend]
     76            else:
     77                start_codon = seq[rend-3:rend].translate(dna_flip)[::-1]
     78                stop_codon = seq[lend:lend+3].translate(dna_flip)[::-1]
     79
     80            print "%6s, %s, %s, %s, %s, %s," % (geneName,lend,rend, start_codon, stop_codon, orientation),
     81
     82            print "%s, %s" % (start_codon in start_condons, stop_codon in stop_condons)
     83            if not start_codon in start_condons or not stop_codon in stop_condons:
     84                return
     85    except csv.Error, e:
     86        sys.exit('file %s, line %d: %s' % (filename, reader.line_num, e))
     87
     88
     89       
     90    test_seq=ghmm.EmissionSequence(sigma,list(seq[0:(len(seq) / 3)]))
     91    v = m.viterbi(test_seq)
     92    #print "fairness of test_seq: ", v
     93
    4294   
    43     # XXX: Validation
     95    # Validation
     96    val_seq=ghmm.EmissionSequence(sigma,list(seq[10:2000]))
     97    v = m.viterbi(val_seq)
     98    #print "Test sequence: ", v
    4499
    45100    # XXX: Results
Note: See TracChangeset for help on using the changeset viewer.