forked from jkgiesler/stepic-bioinformatics-problems
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathlongestkmer2.0.py
56 lines (39 loc) · 1.43 KB
/
longestkmer2.0.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
seq = "CGCCTACGCTGCTGCTTACCGGGCGCCGCTGCTGCTGCTGCTCGCCTAATACCGCCCGCCCGCCCGCCGCTCGGGCGCCGCTTACGCTTACCGGGCGCCTACGCTTACGCTGCTGCTTACTAATAAGCTTACTAACGCCCGCCTAAGCTTACTAACGCCTACCGGGTAATACTACTACCGCCTAACGCCCGCCGCTTAACGGGGCTCGCCGCTGCTTACTACGCTCGGGGCTCGGGCGCCTACTACTAACGCCCGCCCGGGTAACGCCCGGGTAACGCCGCTCGGGGCTTACGCTTACGCTGCTCGGGTACTAAGCTGCTCGCCTACTACTAATACCGGGTAATAATAATACCGGGTAATAATAA"
k = 9  # kmer length
tol = 3  # acceptable errors (mismatching positions) between two kmers


def count_kmers(sequence, size):
    """Count every window of length ``size`` in ``sequence``.

    Args:
        sequence: The text to scan.
        size: Window (k-mer) length; must be positive.

    Returns:
        A dict mapping each distinct k-mer to its number of occurrences.
        Keys appear in order of first occurrence in ``sequence``. The dict
        is empty when ``sequence`` is shorter than ``size``.

    Raises:
        ValueError: If ``size`` is not positive.
    """
    if size <= 0:
        raise ValueError("kmer length must be positive")
    counts = {}
    # "+ 1" so the final window, which ends at the last character, is included.
    for start in range(len(sequence) - size + 1):
        kmer = sequence[start:start + size]
        counts[kmer] = counts.get(kmer, 0) + 1
    return counts


def most_matched_kmer(kmers, size, max_mismatches):
    """Return the k-mer that approximately matches the most candidates.

    Every k-mer in ``kmers`` is compared position by position against every
    k-mer in ``kmers`` (itself included). Two k-mers count as a match when
    at least ``size - max_mismatches`` positions agree.

    Args:
        kmers: Iterable of equal-length k-mers; each distinct k-mer is
            expected to appear once.
        size: Length of each k-mer.
        max_mismatches: Number of differing positions still accepted.

    Returns:
        The k-mer with the highest number of matches. Ties go to the k-mer
        that comes first in ``kmers``. Returns ``""`` when ``kmers`` is
        empty or nothing matches.
    """
    candidates = list(kmers)
    min_score = size - max_mismatches
    best = ""
    best_matches = 0
    for sample in candidates:
        matches = sum(
            1
            for other in candidates
            if sum(a == b for a, b in zip(sample, other)) >= min_score
        )
        # Strictly greater: the earliest k-mer keeps the win on ties.
        if matches > best_matches:
            best = sample
            best_matches = matches
    return best


# Calculate all kmers present in the DNA sequence, then find which of them
# is acceptably close to the largest number of the others.
winner = most_matched_kmer(count_kmers(seq, k), k, tol)
print(winner)
#The purpose of this is just to try forking