-
Notifications
You must be signed in to change notification settings - Fork 4
/
bio_script.py
143 lines (117 loc) · 4.82 KB
/
bio_script.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
import math
import os
import random
import subprocess
from collections import Counter
from multiprocessing import Pool

from Bio import SeqIO
# def batch_iterator(iterator, batch_size):
# """Returns lists of length batch_size.
# This can be used on any iterator, for example to batch up
# SeqRecord objects from Bio.SeqIO.parse(...), or to batch
# Alignment objects from Bio.AlignIO.parse(...), or simply
# lines from a file handle.
# This is a generator function, and it returns lists of the
# entries from the supplied iterator. Each list will have
# batch_size entries, although the final list may be shorter.
# """
# entry = True # Make sure we loop once
# while entry:
# batch = []
# while len(batch) < batch_size:
# try:
# entry = iterator.__next__()
# except StopIteration:
# entry = None
# if entry is None:
# # End of file
# break
# batch.append(entry)
# if batch:
# yield batch
# def split_fasta(fasta_file, num_split=10):
# """Split original fasta file into several fasta files.
# """
# # num_reads = int(subprocess.check_output("grep -c '^>' {}".format(fasta_file), shell=True).split()[0])
# num_seq = 0
# for s in SeqIO.parse(fasta_file, 'fasta'):
# num_seq += 1
# num_per_split = num_seq // num_split + 1
# record_iter = SeqIO.parse(fasta_file, "fasta")
# num_files = 0
# for i, batch in enumerate(batch_iterator(record_iter, num_per_split)):
# filename = f"{fasta_file}.{i}"
# with open(filename, "w") as handle:
# count = SeqIO.write(batch, handle, "fasta")
# # print("Wrote %i records to %s" % (count, filename))
# num_files += 1
# return num_files
def split_fasta(fasta_file, num_split=10):
    """Split a fasta file into up to ``num_split`` smaller fasta files.

    Sequence ids are shuffled first, so each chunk holds a random subset.
    Chunks are written next to the input as ``<fasta_file>.0``,
    ``<fasta_file>.1``, ...

    Args:
        fasta_file: path to the input fasta file.
        num_split: desired number of chunks; the actual number written may
            be smaller and is returned.

    Returns:
        int: number of chunk files written (0 for an empty input fasta).
    """
    all_seq_id = [s.id for s in SeqIO.parse(fasta_file, 'fasta')]
    # Guard the empty-input case: otherwise num_per_split is 0 and
    # range(0, 0, 0) raises "range() arg 3 must not be zero".
    if not all_seq_id:
        return 0
    random.shuffle(all_seq_id)
    num_per_split = math.ceil(len(all_seq_id) / num_split)
    num_files = 0
    # SeqIO.index gives random access by id without loading every record.
    seq_index = SeqIO.index(fasta_file, 'fasta')
    try:
        for i in range(0, len(all_seq_id), num_per_split):
            chunk_ids = all_seq_id[i: i + num_per_split]
            out_seq = [seq_index[s] for s in chunk_ids]
            SeqIO.write(out_seq, f"{fasta_file}.{num_files}", 'fasta')
            num_files += 1
    finally:
        # The index keeps the fasta file handle open; release it explicitly.
        seq_index.close()
    return num_files
def prodigal(dna_path):
    """Run prodigal gene prediction on one nucleotide fasta file.

    Produces ``<dna_path>.gff`` (gene annotations, gff format) and
    ``<dna_path>.aa_`` (predicted protein sequences).

    Args:
        dna_path: path to the input nucleotide fasta file.

    Raises:
        subprocess.CalledProcessError: if prodigal exits non-zero.
    """
    # Argument-list form (shell=False) is safe for paths containing spaces
    # or shell metacharacters; check=True surfaces prodigal failures
    # instead of silently producing empty/missing outputs.
    subprocess.run(
        ["prodigal", "-i", dna_path, "-o", f"{dna_path}.gff",
         "-a", f"{dna_path}.aa_", "-f", "gff", "-p", "meta", "-q"],
        check=True,
    )
def run_multi_prodigal(contig_path, threads=32):
    """Run prodigal on a contig fasta using multiple worker processes.

    The input is split into at most ``threads`` chunks, prodigal runs on
    each chunk in parallel, trailing '*' stop-codon markers are stripped
    from the protein sequences, and the per-chunk outputs are merged into
    ``<contig_path>.aa`` and ``<contig_path>.gff``.  All temporary chunk
    files are removed.

    Args:
        contig_path: path to the contig (nucleotide) fasta file.
        threads: maximum number of parallel prodigal processes; capped at
            the number of chunks actually produced.
    """
    num_files = split_fasta(fasta_file=contig_path, num_split=threads)
    threads = num_files
    pool = Pool(processes=threads)
    # Keep the AsyncResult handles: apply_async swallows worker exceptions
    # unless .get() is called on each result.
    results = [pool.apply_async(prodigal, [f"{contig_path}.{temp_id}"])
               for temp_id in range(threads)]
    pool.close()
    pool.join()
    for r in results:
        r.get()  # re-raises any exception from the worker
    # Remove the temporary split fasta inputs.
    for i in range(threads):
        os.remove(f"{contig_path}.{i}")
    # Prodigal appends '*' to each protein sequence; strip it from line ends.
    for i in range(threads):
        subprocess.run(f"sed 's/*$//g' {contig_path}.{i}.aa_ > {contig_path}.{i}.aa", shell=True)
        os.remove(f"{contig_path}.{i}.aa_")
    # Merge the per-chunk outputs and delete the parts.
    _merge_and_remove(f'{contig_path}.aa',
                      [f"{contig_path}.{i}.aa" for i in range(threads)])
    _merge_and_remove(f'{contig_path}.gff',
                      [f"{contig_path}.{i}.gff" for i in range(threads)])
def _merge_and_remove(out_path, part_paths):
    """Concatenate ``part_paths`` into ``out_path``, deleting each part."""
    with open(out_path, 'w') as outfile:
        for fname in part_paths:
            with open(fname) as infile:
                for line in infile:
                    outfile.write(line)
            os.remove(fname)
def count_aa(aa_fasta):
    """Count the number of predicted proteins per contig.

    Prodigal names each protein ``<contig_id>_<n>``; the contig id is
    recovered by splitting off the final ``_<n>``.  One tab-separated
    ``<contig_id>\\t<count>`` line per contig is written to
    ``<aa_fasta>.aa_count``.

    Args:
        aa_fasta: path to the protein fasta produced by prodigal.
    """
    # Counter replaces the manual dict-with-if/else accumulation.
    aa_num_dict = Counter(
        s.id.rsplit('_', 1)[0] for s in SeqIO.parse(aa_fasta, 'fasta')
    )
    with open(f"{aa_fasta}.aa_count", 'w') as aa_c:
        for k, v in aa_num_dict.items():
            aa_c.write(f"{k}\t{v}\n")
def run_diamond(db_path, query_path, threads):
    """Run DIAMOND blastp and convert the hits to ABC format.

    Writes ``<query_path>.diamond`` (tabular output, outfmt 6) and
    ``<query_path>.abc`` (query id, subject id, e-value triples, e.g. for
    MCL clustering).

    Args:
        db_path: path to the diamond protein database.
        query_path: path to the query protein fasta.
        threads: number of threads diamond may use.
    """
    subprocess.run(f"diamond blastp -d {db_path} -q {query_path} -o {query_path}.diamond -p {threads} -f 6 --sensitive -c1 --quiet", shell=True)
    # Columns 1, 2 and 11 of outfmt 6 are query id, subject id and e-value.
    # Single-braced awk program: the previous %-formatted '{{...}}' passed
    # literal doubled braces to awk (a redundant nested block left over
    # from an f-string conversion).
    subprocess.run("awk '{print $1,$2,$11}' %s.diamond > %s.abc" % (query_path, query_path), shell=True)