Changeset 59 for liacs


Ignore:
Timestamp:
Dec 22, 2009, 9:29:24 AM (15 years ago)
Author:
Rick van der Zwet
Message:

Do the reading_frames as well

Location:
liacs/dbdm/dbdm_4
Files:
1 copied
1 moved

Legend:

Unmodified
Added
Removed
  • liacs/dbdm/dbdm_4/parse_fasta.py

    r58 r59  
    99import Bio.Data.CodonTable
    1010from MultiReplace  import MultiReplace
     11from reading_frames import reading_frames
    1112
    1213fasta_translate = {
     
    9091print file1
    9192stats(seq1)
    92 print file1
    93 stats(seq1)
     93reading_frames(seq1)
     94print file2
     95stats(seq2)
     96reading_frames(seq2)
     97
    9498
    9599# Strictly speaking there is a gap of about 4 kbs (4000 bs) between seq1 and
     
    99103result = result + "n" * 4000;
    100104stats(result)
     105reading_frames(result)
    101106
    102107# Write to file for later further processing
  • liacs/dbdm/dbdm_4/reading_frames.py

    r56 r59  
    11#!/usr/bin/env python
    22#
    3 # Parse 2 FASTA files and print statistics
     3# Parse FASTA file and print statistics on reading frames
    44# BSDLicence
    55# Rick van der Zwet - 0433373 - <info@rickvaderzwet.nl>
    6 from Bio import SeqIO,Seq
    7 from Bio import Alphabet
    8 from Bio.Alphabet.IUPAC import ambiguous_dna,unambiguous_dna
    9 import Bio.Data.CodonTable
    10 from MultiReplace  import MultiReplace
     6import sys
    117
    12 fasta_translate = {
    13     'r' : 'ga', # purine
    14     'y' : 'tc', # pyrimide
    15     'k' : 'gt', # keto
    16     'm' : 'ac', # amino
    17     's' : 'gc', # strong
    18     'w' : 'at', # weak
    19     'b' : 'gtc',
    20     'd' : 'gat',
    21     'h' : 'act',
    22     'v' : 'gca',
    23     }
    24    
    25 fasta = MultiReplace(fasta_translate)
     8def _frame_stat(seq, start=0):
     9    pdict = {}
     10    for n in range(start,len(seq),2):
     11        codon = seq[n:n+3]
     12        if len(codon) < 3:
     13            continue
     14        if not pdict.has_key(codon):
     15            pdict[codon] = 1
     16        else:
     17            pdict[codon] += 1
    2618
    27 def parse_file(file):
    28     handle = open(file, "rU")
    29     for seq_record in SeqIO.parse(handle, "fasta",ambiguous_dna):
    30         # How to translate damm thing into plain nucleic acid codes
    31         # http://en.wikipedia.org/wiki/FASTA_format
    32         retval = seq_record.seq.__str__()
    33    
    34     handle.close()
    35     return(retval)
     19    return(pdict)
    3620
    37 def stats(seq):
    38     pdict = {}
    39     for n in range(1, len(seq)):
    40         protein = seq[n]
    41         if not pdict.has_key(protein):
    42             pdict[protein] = 1
    43         else:
    44             pdict[protein] += 1
    45    
    46     print pdict
     21def reading_frames(seq):
     22    '''Parse from left to right and right to left, at position 1,2,3 in
     23    in the so called nucleotide triplets
     24    See: http://en.wikipedia.org/wiki/Genetic_code'''
     25
     26    # Populate all empty
     27    final = {}
     28    for a in 'acgtn':
     29        for b in 'acgtn':
     30            for c in 'acgtn':
     31                final[a + b + c] = [0,0,0,0,0,0]
     32
     33    for start in [0,1,2]:
     34        print "Normal; start %i" % (start)
     35        retval = _frame_stat(seq,start)
     36        for codon,v in retval.iteritems():
     37            final[codon][start] += v
     38        print "Reverse; start %i" % (start)
     39        retval = _frame_stat(seq[::-1],start)
     40        for codon,v in retval.iteritems():
     41            final[codon][start+3] += v
     42
     43    print "CODON :  N:0   , N:1  , N:2  ,  R:0 , R:1  , R:2  "
     44
     45    for codon in sorted(final.keys()):
     46        print codon,"  : ", ",".join(["%6i" % x for x in final[codon]])
    4747
    4848
    49 def concat(head,tail,max_length=1000):
    50     "Concat two strings together removing common parts"
    51     l_head = len(head)
    52     l_tail = len(tail)
    5349
    54     # Start/Stop at the right time
    55     start = 1
    56     if (l_head < l_tail):
    57         stop = l_head + 1
    58     else:
    59         stop = l_tail + 1
    60 
    61     # Make sure not to run for-ever
    62     if (stop > max_length):
    63         stop = max_length
    64 
    65     # Find largest common part
    66     # XXX: Not very effient (on very large overlap sets
    67     for i in reversed(range(start,stop)):
    68         #print "tail[0:%i] '%s' == '%s'" % (i, head[-i:], tail[0:i])
    69         if head[-i:] == tail[0:i]:
    70             return((i,(tail[0:i]),head + tail[i:]))
    71 
    72     # None found return full part
    73     return(-1,'',head + tail)
    74 
    75 
    76 # Get data
    77 file1 = parse_file("data/AE005174v2-1.fas")
    78 file2 = parse_file("data/AE005174v2-2.fas")
    79 
    80 file1 = fasta.replace(file1)
    81 file2 = fasta.replace(file2)
    82 
    83 # Find overlap
    84 (retval, common, result) = concat(file2,file1)
    85 print retval, common
    86 
    87 # Strictly speaking there is a gap of about 4 kbs (4000 bs) between file1 and
    88 # file2, so lets' put that into the the statistics as well. Due to circular
    89 # nature, does not matter wether we add it in the beginning or in the end
    90 result = result + "n" * 4000;
    91 stats(result)
    92 
    93 # Write to file for later further processing
    94 out = open("full_contig.raw","w")
    95 out.write(result)
    96 out.close()
     50if __name__ == "__main__":
     51    # Load data
     52    try:
     53        handle = open(sys.argv[1],"rU")
     54    except IndexError:
     55        print "Usage %s <data_file>" % (sys.argv[0])
     56        sys.exit(64)
     57    except IOError:
     58        print "Unable to open '%s'" % (sys.argv[1])
     59        sys.exit(64)
     60   
     61    seq = handle.read()
     62    handle.close()
     63    reading_frames(seq)
Note: See TracChangeset for help on using the changeset viewer.