Changeset 54 for liacs/dbdm/dbdm_4


Ignore:
Timestamp:
Dec 22, 2009, 7:44:50 AM (15 years ago)
Author:
Rick van der Zwet
Message:

Pre-process the fasta file to it's desired output

File:
1 copied

Legend:

Unmodified
Added
Removed
  • liacs/dbdm/dbdm_4/parse-fasta.py

    r53 r54  
    66from MultiReplace  import MultiReplace
    77
     8fasta_translate = {
     9    'r' : 'ga', # purine
     10    'y' : 'tc', # pyrimide
     11    'k' : 'gt', # keto
     12    'm' : 'ac', # amino
     13    's' : 'gc', # strong
     14    'w' : 'at', # weak
     15    'b' : 'gtc',
     16    'd' : 'gat',
     17    'h' : 'act',
     18    'v' : 'gca',
     19    }
     20   
     21fasta = MultiReplace(fasta_translate)
     22
    823def parse_file(file):
    9     handle = open("data/AE005174v2-1.fas", "rU")
     24    handle = open(file, "rU")
    1025    for seq_record in SeqIO.parse(handle, "fasta",ambiguous_dna):
    1126        # How to translate damm thing into plain nucleic acid codes
    1227        # http://en.wikipedia.org/wiki/FASTA_format
    13         stupid = seq_record.seq.__str__()
    14         fasta_translate = {
    15             'r' : 'ga', # purine
    16             'y' : 'tc', # pyrimide
    17             'k' : 'gt', # keto
    18             'm' : 'ac', # amino
    19             's' : 'gc', # strong
    20             'w' : 'at', # weak
    21             'b' : 'gtc',
    22             'd' : 'gat',
    23             'h' : 'act',
    24             'v' : 'gca',
    25             }
    26            
    27         r = MultiReplace(fasta_translate)
    28         stupid = r.replace(stupid)
     28        retval = seq_record.seq.__str__()
    2929   
    30         pdict = {}
    31         for n in range(1, len(stupid)):
    32             protein = stupid[n]
    33             if not pdict.has_key(protein):
    34                 pdict[protein] = 1
    35             else:
    36                 pdict[protein] += 1
     30    handle.close()
     31    return(retval)
     32
     33def stats(seq):
     34    pdict = {}
     35    for n in range(1, len(seq)):
     36        protein = seq[n]
     37        if not pdict.has_key(protein):
     38            pdict[protein] = 1
     39        else:
     40            pdict[protein] += 1
    3741   
    38         print pdict
     42    print pdict
     43
     44
     45def concat(head,tail,max_length=1000):
     46    "Concat two strings together removing common parts"
     47    l_head = len(head)
     48    l_tail = len(tail)
     49
     50    # Start/Stop at the right time
     51    start = 1
     52    if (l_head < l_tail):
     53        stop = l_head + 1
     54    else:
     55        stop = l_tail + 1
     56
     57    # Make sure not to run for-ever
     58    if (stop > max_length):
     59        stop = max_length
     60
     61    # Find largest common part
     62    # XXX: Not very effient (on very large overlap sets
     63    for i in reversed(range(start,stop)):
     64        #print "tail[0:%i] '%s' == '%s'" % (i, head[-i:], tail[0:i])
     65        if head[-i:] == tail[0:i]:
     66            return((i,(tail[0:i]),head + tail[i:]))
     67
     68    # None found return full part
     69    return(-1,'',head + tail)
     70
    3971
    4072file1 = parse_file("data/AE005174v2-1.fas")
    4173file2 = parse_file("data/AE005174v2-2.fas")
     74file1 = fasta.replace(file1)
     75file2 = fasta.replace(file2)
     76(retval, common, result) = concat(file2,file1)
     77print retval, common
     78stats(result)
     79
     80out = open("full_contig.raw","w")
     81out.write(result)
     82out.close()
Note: See TracChangeset for help on using the changeset viewer.