source: liacs/dbdm/dbdm_4/parse-fasta.py@ 55

Last change on this file since 55 was 55, checked in by Rick van der Zwet, 15 years ago

Make it 'production' ready

  • Property svn:executable set to *
File size: 2.4 KB
RevLine 
[41]1#!/usr/bin/env python
[55]2#
3# Parse 2 FASTA files and print statistics
4# BSDLicence
5# Rick van der Zwet - 0433373 - <info@rickvaderzwet.nl>
[41]6from Bio import SeqIO,Seq
7from Bio import Alphabet
8from Bio.Alphabet.IUPAC import ambiguous_dna,unambiguous_dna
9import Bio.Data.CodonTable
[53]10from MultiReplace import MultiReplace
[41]11
[54]12fasta_translate = {
13 'r' : 'ga', # purine
14 'y' : 'tc', # pyrimide
15 'k' : 'gt', # keto
16 'm' : 'ac', # amino
17 's' : 'gc', # strong
18 'w' : 'at', # weak
19 'b' : 'gtc',
20 'd' : 'gat',
21 'h' : 'act',
22 'v' : 'gca',
23 }
24
25fasta = MultiReplace(fasta_translate)
26
[53]27def parse_file(file):
[54]28 handle = open(file, "rU")
[53]29 for seq_record in SeqIO.parse(handle, "fasta",ambiguous_dna):
30 # How to translate damm thing into plain nucleic acid codes
31 # http://en.wikipedia.org/wiki/FASTA_format
[54]32 retval = seq_record.seq.__str__()
[53]33
[54]34 handle.close()
35 return(retval)
36
37def stats(seq):
38 pdict = {}
39 for n in range(1, len(seq)):
40 protein = seq[n]
41 if not pdict.has_key(protein):
42 pdict[protein] = 1
43 else:
44 pdict[protein] += 1
[53]45
[54]46 print pdict
[41]47
[54]48
49def concat(head,tail,max_length=1000):
50 "Concat two strings together removing common parts"
51 l_head = len(head)
52 l_tail = len(tail)
53
54 # Start/Stop at the right time
55 start = 1
56 if (l_head < l_tail):
57 stop = l_head + 1
58 else:
59 stop = l_tail + 1
60
61 # Make sure not to run for-ever
62 if (stop > max_length):
63 stop = max_length
64
65 # Find largest common part
66 # XXX: Not very effient (on very large overlap sets
67 for i in reversed(range(start,stop)):
68 #print "tail[0:%i] '%s' == '%s'" % (i, head[-i:], tail[0:i])
69 if head[-i:] == tail[0:i]:
70 return((i,(tail[0:i]),head + tail[i:]))
71
72 # None found return full part
73 return(-1,'',head + tail)
74
75
[55]76# Get data
[53]77file1 = parse_file("data/AE005174v2-1.fas")
78file2 = parse_file("data/AE005174v2-2.fas")
[55]79
[54]80file1 = fasta.replace(file1)
81file2 = fasta.replace(file2)
[55]82
83# Find overlap
[54]84(retval, common, result) = concat(file2,file1)
85print retval, common
[55]86
87# Strictly speaking there is a gap of about 4 kbs (4000 bs) between file1 and
88# file2, so lets' put that into the the statistics as well. Due to circular
89# nature, does not matter wether we add it in the beginning or in the end
90result = result + "n" * 4000;
[54]91stats(result)
92
[55]93# Write to file for later further processing
[54]94out = open("full_contig.raw","w")
95out.write(result)
96out.close()
Note: See TracBrowser for help on using the repository browser.