Context Navigation

← Previous Change
Next Change →

Changeset 59 for liacs

Timestamp:

Dec 22, 2009, 9:29:24 AM (15 years ago)

Author:

Rick van der Zwet

Message:

Do the reading_frames as well

Location:

liacs/dbdm/dbdm_4

Files:

: 1 copied
: 1 moved

parse_fasta.py (copied) (copied from liacs/dbdm/dbdm_4/parse-fasta.py ) (3 diffs)
reading_frames.py (moved) (moved from liacs/dbdm/dbdm_4/parse-fasta.py ) (1 diff)

Legend:

: Unmodified
: Added
: Removed

liacs/dbdm/dbdm_4/parse_fasta.py

-              r58
+              r59
 import Bio.Data.CodonTable
 from MultiReplace  import MultiReplace
+from reading_frames import reading_frames
 fasta_translate = {
 …
 print file1
 stats(seq1)
+print file1
+stats(seq1)
+reading_frames(seq1)
+print file2
+stats(seq2)
+reading_frames(seq2)
 # Strictly speaking there is a gap of about 4 kbs (4000 bs) between seq1 and
 …
 result = result + "n" * 4000;
 stats(result)
+reading_frames(result)
 # Write to file for later further processing

liacs/dbdm/dbdm_4/reading_frames.py

-              r56
+              r59
 #!/usr/bin/env python
+#
 # Parse 2 FASTA files and print statistics
+# Parse FASTA file and print statistics on reading frames
 # BSD Licence
 # Rick van der Zwet - 0433373 - <info@rickvaderzwet.nl>
+from Bio import SeqIO,Seq
+from Bio import Alphabet
+from Bio.Alphabet.IUPAC import ambiguous_dna,unambiguous_dna
+import Bio.Data.CodonTable
+from MultiReplace  import MultiReplace
+import sys
+fasta_translate = {  # FASTA ambiguity code -> the nucleotide letters it is replaced with
+    'r' : 'ga', # purine
+    'y' : 'tc', # pyrimidine
+    'k' : 'gt', # keto
+    'm' : 'ac', # amino
+    's' : 'gc', # strong
+    'w' : 'at', # weak
+    'b' : 'gtc', # anything but a
+    'd' : 'gat', # anything but c
+    'h' : 'act', # anything but g
+    'v' : 'gca', # anything but t
+    }
+fasta = MultiReplace(fasta_translate)  # replacer built from the table above; MultiReplace is defined elsewhere
def _frame_stat(seq, start=0):
    '''Count the codons (nucleotide triplets) of a single reading frame.

    seq   -- sequence to scan, anything that can be sliced into strings
    start -- offset of the first codon; 0, 1 and 2 select the three frames

    Returns a dict mapping every complete codon to its number of
    occurrences. A trailing fragment shorter than three symbols is ignored.
    '''
    pdict = {}
    # Codons do not overlap, so advance a whole triplet at a time
    for n in range(start, len(seq), 3):
        codon = seq[n:n + 3]
        if len(codon) < 3:
            # Incomplete triplet at the end of the sequence
            continue
        pdict[codon] = pdict.get(codon, 0) + 1
    return pdict
def parse_file(file):
    '''Read a FASTA file and return the sequence of its last record.

    file -- path of the FASTA file to read

    All records are parsed with the ambiguous DNA alphabet; when the file
    holds more than one record only the last sequence is returned, as a
    plain string.
    Raises ValueError when the file does not contain any record.
    '''
    retval = None
    handle = open(file, "rU")
    try:
        for seq_record in SeqIO.parse(handle, "fasta", ambiguous_dna):
            # The sequence can still contain ambiguity codes, see
            # http://en.wikipedia.org/wiki/FASTA_format
            retval = str(seq_record.seq)
    finally:
        # Release the handle even when parsing fails half-way
        handle.close()
    if retval is None:
        raise ValueError("No FASTA record found in '%s'" % (file))
    return retval
def stats(seq):
    '''Print how often every symbol occurs in seq.

    seq -- sequence (string) to analyse

    The result is printed as a dict mapping symbol to number of
    occurrences; nothing is returned.
    '''
    pdict = {}
    # Walk the sequence itself so the first symbol (index 0) is counted too
    for protein in seq:
        pdict[protein] = pdict.get(protein, 0) + 1
    print(pdict)
def reading_frames(seq):
    '''Print the codon counts of the six reading frames of seq.

    The sequence is parsed from left to right and from right to left (the
    reversed string), each time starting at position 1, 2 and 3 of the so
    called nucleotide triplets.
    See: http://en.wikipedia.org/wiki/Genetic_code

    seq -- nucleotide sequence (string)

    One row is printed per codon: columns N:0..N:2 hold the counts of the
    normal direction, R:0..R:2 the counts of the reversed sequence.
    '''
    # Populate all codons over 'acgtn' with empty counters, so they are
    # listed even when they never occur
    final = {}
    for a in 'acgtn':
        for b in 'acgtn':
            for c in 'acgtn':
                final[a + b + c] = [0, 0, 0, 0, 0, 0]

    # The reversed sequence does not depend on the frame, build it once
    reversed_seq = seq[::-1]
    for start in [0, 1, 2]:
        print("Normal; start %i" % (start))
        retval = _frame_stat(seq, start)
        for codon, v in retval.items():
            # Codons holding symbols outside 'acgtn' get a row of their own
            # instead of raising a KeyError
            final.setdefault(codon, [0, 0, 0, 0, 0, 0])[start] += v
        print("Reverse; start %i" % (start))
        retval = _frame_stat(reversed_seq, start)
        for codon, v in retval.items():
            final.setdefault(codon, [0, 0, 0, 0, 0, 0])[start + 3] += v

    print("CODON :  N:0   , N:1  , N:2  ,  R:0 , R:1  , R:2  ")
    for codon in sorted(final):
        counts = ",".join(["%6i" % x for x in final[codon]])
        print("%s   :  %s" % (codon, counts))
def concat(head, tail, max_length=1000):
    '''Concat two strings together removing common parts.

    head       -- string whose end may overlap with the start of tail
    tail       -- string to append to head
    max_length -- exclusive upper bound on the overlap length that is
                  tried, to keep the search bounded

    Returns a tuple (length, common, joined): the length of the largest
    overlap found, the overlapping text itself and head + tail with the
    overlap included only once. When there is no overlap at all
    (-1, '', head + tail) is returned.
    '''
    # An overlap can never be longer than the shortest string, and make
    # sure not to run for-ever on large inputs
    stop = min(min(len(head), len(tail)) + 1, max_length)

    # Find largest common part, trying the longest candidate first
    # XXX: Not very efficient (on very large overlap sets)
    for i in range(stop - 1, 0, -1):
        if head[-i:] == tail[:i]:
            return (i, tail[:i], head + tail[i:])

    # None found return full part
    return (-1, '', head + tail)
# Get data
file1 = parse_file("data/AE005174v2-1.fas")
file2 = parse_file("data/AE005174v2-2.fas")

# Apply the fasta_translate replacements to both sequences
file1 = fasta.replace(file1)
file2 = fasta.replace(file2)

# Find overlap
(retval, common, result) = concat(file2, file1)
print("%s %s" % (retval, common))

# Strictly speaking there is a gap of about 4 kbs (4000 bs) between file1 and
# file2, so let's put that into the statistics as well. Due to the circular
# nature, it does not matter whether we add it at the beginning or at the end
result = result + "n" * 4000
stats(result)

# Write to file for later further processing; close the file even when the
# write fails
out = open("full_contig.raw", "w")
try:
    out.write(result)
finally:
    out.close()
if __name__ == "__main__":
    # Load data: the first command line argument names the file to analyse
    try:
        handle = open(sys.argv[1], "rU")
    except IndexError:
        # No argument given at all
        print("Usage %s <data_file>" % (sys.argv[0]))
        sys.exit(64)
    except IOError:
        print("Unable to open '%s'" % (sys.argv[1]))
        sys.exit(64)

    # Make sure the handle is closed, even when reading fails
    try:
        seq = handle.read()
    finally:
        handle.close()

    # The raw file content is analysed as-is
    reading_frames(seq)

Note: See TracChangeset for help on using the changeset viewer.

Context Navigation

Changeset 59 for liacs

Legend:

liacs/dbdm/dbdm_4/parse_fasta.py

liacs/dbdm/dbdm_4/reading_frames.py

Download in other formats: