Context Navigation

← Previous Change
Next Change →

Changeset 54 for liacs

Timestamp:

Dec 22, 2009, 7:44:50 AM (15 years ago)

Author:

Rick van der Zwet

Message:

Pre-process the FASTA file into its desired output

File:

: 1 copied

liacs/dbdm/dbdm_4/parse-fasta.py (copied) (copied from liacs/dbdm/dbdm_4/fasta-hmm.py ) (1 diff)

Legend:

: Unmodified
: Added
: Removed

liacs/dbdm/dbdm_4/parse-fasta.py

-              r53
+              r54
 from MultiReplace  import MultiReplace
# FASTA ambiguity codes (see http://en.wikipedia.org/wiki/FASTA_format),
# each mapped to the string of plain bases it is replaced with.
fasta_translate = dict(
    r='ga',   # purine
    y='tc',   # pyrimidine
    k='gt',   # keto
    m='ac',   # amino
    s='gc',   # strong
    w='at',   # weak
    b='gtc',
    d='gat',
    h='act',
    v='gca',
)
# Replacer built from the table above; its replace() method is applied to
# the parsed sequences further down in this script.
# NOTE(review): presumably substitutes every key by its value -- confirm
# against the MultiReplace module.
fasta = MultiReplace(fasta_translate)
 def parse_file(file):
     """Return the sequence stored in a FASTA file as a plain string.

     file -- path of the FASTA file to read.

     When the file holds several records, the sequence of the last record
     is returned (earlier ones are overwritten while iterating).

     Raises ValueError when the file contains no FASTA record at all,
     instead of failing on an unbound local variable.
     """
     retval = None
     # Open only the requested file; a second, hard-coded open() used to
     # leak a file handle here.
     handle = open(file, "rU")
     try:
         for seq_record in SeqIO.parse(handle, "fasta", ambiguous_dna):
             retval = str(seq_record.seq)
     finally:
         # Release the handle even when parsing fails half-way.
         handle.close()
     if retval is None:
         raise ValueError("no FASTA record found in %r" % (file,))
     return retval
def stats(seq):
    """Tally how often each character occurs in seq.

    seq -- the sequence (string) to analyse; may be empty.

    The tally is printed once as a dictionary mapping character to count
    and is also returned, so callers can use it programmatically.
    """
    pdict = {}
    # Walk over every character, including the first one (index 0).
    for symbol in seq:
        pdict[symbol] = pdict.get(symbol, 0) + 1
    # Single-argument form prints the same under the Python 2 print statement.
    print(pdict)
    return pdict
def concat(head, tail, max_length=1000):
    """Join two strings, merging the part where head's end overlaps tail's start.

    head       -- string that comes first.
    tail       -- string that comes second.
    max_length -- longest overlap (in characters) that is looked for; this
                  bounds the amount of work on very large inputs.

    Returns a tuple (length, common, joined):
      * on success: the overlap length, the overlapping text and head
        followed by the non-overlapping remainder of tail;
      * when no overlap is found: (-1, '', head + tail).
    """
    # The overlap can never exceed either string, nor the caller's limit.
    # max_length itself is a valid overlap length and is tested as well.
    limit = min(len(head), len(tail), max_length)
    # Try the largest candidate first so the longest overlap wins.
    # XXX: not very efficient on very large overlap sets.
    for size in range(limit, 0, -1):
        if head[-size:] == tail[:size]:
            return (size, tail[:size], head + tail[size:])
    # No common part found: plain concatenation.
    return (-1, '', head + tail)
 # Read the two input files.
 file1 = parse_file("data/AE005174v2-1.fas")
 file2 = parse_file("data/AE005174v2-2.fas")
 # Run both sequences through the ambiguity-code replacer.
 file1 = fasta.replace(file1)
 file2 = fasta.replace(file2)
 # file2 is the head and file1 the tail of the joined result.
 (retval, common, result) = concat(file2, file1)
 # Same output text as the statement form "print retval, common".
 print("%s %s" % (retval, common))
 stats(result)
 out = open("full_contig.raw", "w")
 try:
     out.write(result)
 finally:
     # Close the output file even when the write fails.
     out.close()

Note: See TracChangeset for help on using the changeset viewer.

Context Navigation

Changeset 54 for liacs

Legend:

liacs/dbdm/dbdm_4/parse-fasta.py

Download in other formats: