-
Notifications
You must be signed in to change notification settings - Fork 0
/
nr_extract.py
37 lines (29 loc) · 932 Bytes
/
nr_extract.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
import gzip
from numba import jit
from Bio import SeqIO
# Extract from the FASTA file E:\nr every record whose ID is listed in
# E:\matches_id.txt, writing two FASTA outputs: one with the bare record ID
# as the header and one with the full description line as the header.

# Load the wanted sequence IDs, one per line.
matches_lis = []
with open(r'E:\matches_id.txt') as f:
    for line in f:
        seq_id = line.strip()
        # Skip blank lines: an empty ID can never match a record and would
        # make the final "all match" check fail spuriously.
        if seq_id:
            matches_lis.append(seq_id)

set_lis = set(matches_lis)
# Report how many duplicate IDs the input contained. This has to happen
# BEFORE matches_lis is replaced by the de-duplicated list, otherwise the
# difference is always 0.
print(len(matches_lis) - len(set_lis))
matches_lis = list(set_lis)

# Denominator for the progress percentage below.
# NOTE(review): presumably the total number of records in E:\nr - confirm
# against the database release actually in use.
nr_len = 455043919

cnt = 0      # records written so far
nr_idx = 0   # records scanned so far

# The context manager guarantees both outputs are flushed and closed, even
# if parsing raises part-way through; without it buffered records can be lost.
with open(r'E:\nr_extract.fasta', "a+") as nr_extract, \
        open(r'E:\nr_extract_des.fasta', "a+") as nr_extract_des:
    nr = SeqIO.parse(r'E:\nr', 'fasta')
    for pr in nr:
        if pr.id in set_lis:
            cnt += 1
            seq = str(pr.seq)
            nr_extract.write('>' + str(pr.id) + '\n' + seq + '\n')
            nr_extract_des.write('>' + str(pr.description) + '\n' + seq + '\n')
            # Progress is reported once per match.
            # NOTE(review): the original indentation was lost, so it is not
            # certain whether this print belonged here or at loop level.
            # nr_idx / nr_len is a 0-1 fraction, so scale it by 100 to match
            # the '%' suffix.
            print("nr_idx: " + str(nr_idx) + " cnt: " + str(cnt)
                  + " comp: {:.2f}%".format(nr_idx / nr_len * 100))
        nr_idx += 1

# Every unique requested ID should have produced exactly one written record.
if cnt == len(matches_lis):
    print("all match!")
else:
    print("something wrong QAQ")