-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathprepResult.py
226 lines (154 loc) · 7.22 KB
/
prepResult.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
#!/usr/bin/env python
"""prepResult.py: Format the initial binning result from an existing binning tool.
Format the initial binning result from an existing binning tool in the .csv format
with contig ID and bin ID. Contigs are numbered starting from 0 and bins are
numbered starting from 1.
"""
import sys
import os
import csv
import argparse
import re
import subprocess
from Bio import SeqIO
__author__ = "Vijini Mallawaarachchi, Anuradha Wickramarachchi, and Yu Lin"
__copyright__ = "Copyright 2019, GraphBin Project"
__credits__ = "Benjamin Kaehler and Gavin Huttley"
__license__ = "GPL"
__type__ = "Support Script"
__maintainer__ = "Vijini Mallawaarachchi"
__email__ = "vijini.mallawaarachchi@anu.edu.au"
# Sample command
# -------------------------------------------------------------------
# python prepResult.py --binned /path/to/folder_with_binning_result
# --assembler name of the assembler used (SPAdes, SGA, MEGAHIT, Flye or Canu)
# --output /path/to/output_folder
# --prefix optional_prefix_for_the_output_files
# -------------------------------------------------------------------
# Setup argument parser
#-----------------------

# Command-line interface: three mandatory options plus an optional prefix
# for the names of the generated files.
ap = argparse.ArgumentParser()

ap.add_argument(
    "--binned",
    required=True,
    type=str,
    help="path to the folder containing the initial binning result from an existing tool",
)
ap.add_argument(
    "--assembler",
    required=True,
    type=str,
    help="name of the assembler used (SPAdes, SGA or MEGAHIT). GraphBin supports Flye and Canu long-read assemblies as well.",
)
ap.add_argument(
    "--output",
    required=True,
    type=str,
    help="path to the output folder",
)
ap.add_argument(
    "--prefix",
    required=False,
    type=str,
    default="",
    help="prefix for the output file",
)

# Parsed options as a plain dict keyed by option name.
args = vars(ap.parse_args())

contig_bins_folder, assembler, output_path = (
    args["binned"],
    args["assembler"],
    args["output"],
)

# Start with an empty prefix; args["prefix"] is not applied here.
prefix = ""
# Check assembler type
#---------------------------------------------------

# Assembler names accepted by this script; the comparison ignores case.
supported_assemblers = ("spades", "sga", "megahit", "flye", "canu")

if assembler.lower() not in supported_assemblers:
    print("\nPlease make sure to provide the correct assembler type (SPAdes, SGA, MEGAHIT, Flye or Canu).")
    print("\nExiting prepResult.py...\nBye...!\n")
    sys.exit(1)
# Check if folder to initial binning result exists
#---------------------------------------------------

# Make sure the folder path ends with a forward slash so that file names
# can be appended to it directly.
if not contig_bins_folder.endswith("/"):
    contig_bins_folder += "/"

# Abort with exit status 1 when the path is not an existing directory.
binned_folder_found = os.path.isdir(contig_bins_folder)

if not binned_folder_found:
    print("\nPlease enter a valid path to the folder containing the initial binning result.")
    print("\nExiting prepResult.py...\nBye...!\n")
    sys.exit(1)
# Collect the names of all entries in the binning result folder.
files = os.listdir(contig_bins_folder)

# Check if folder path of binning result is empty.
#---------------------------------------------------

# An empty listing means there is nothing to format, so stop with status 1.
if not files:
    for message in (
        "\nFolder containing the initial binning result is empty. Please enter a valid path to the folder containing the initial binning result.",
        "\nExiting prepResult.py...\nBye...!\n",
    ):
        print(message)
    sys.exit(1)
# Check if binning result folder contains fasta files.
#---------------------------------------------------

# True as soon as one entry carries a recognised fasta extension
# (matched case-insensitively).
isFasta = any(
    entry.lower().endswith(('.fasta', '.fa', '.fna')) for entry in files
)

if isFasta is False:
    print("\nMake sure the folder containing the initial binning result contains fasta files (.fasta, .fa or .fna).")
    print("\nExiting prepResult.py...\nBye...!\n")
    sys.exit(1)
# Check if output folder exists
#---------------------------------------------------

# Handle for missing trailing forwardslash in output folder path
if output_path[-1:] != "/":
    output_path = output_path + "/"

# Create output folder if it does not exist.
# os.makedirs is used instead of shelling out to "mkdir -p": it works with
# paths containing spaces or shell metacharacters, does not hand user input
# to a shell, and reports failures instead of silently ignoring them.
if not os.path.isdir(output_path):
    try:
        os.makedirs(output_path, exist_ok=True)
    except OSError as err:
        print("\nCould not create the output folder:", err)
        print("\nExiting prepResult.py...\nBye...!\n")
        sys.exit(1)
# Validate prefix
#---------------------------------------------------

# Normalise the user-supplied prefix so that a non-empty prefix always ends
# with exactly the underscore the user gave or a single appended one.
# Only the errors a malformed/missing prefix value can produce are handled;
# a bare "except:" would also intercept KeyboardInterrupt and SystemExit and
# misreport them as an invalid prefix.
try:
    raw_prefix = args["prefix"]

    if raw_prefix == '':
        prefix = ""
    elif raw_prefix.endswith("_"):
        prefix = raw_prefix
    else:
        prefix = raw_prefix + "_"

except (KeyError, AttributeError, TypeError):
    print("\nPlease enter a valid string for prefix")
    print("Exiting prepResult.py...\n")
    sys.exit(1)
# Format binning results.
#---------------------------------------------------

print("\nFormatting initial binning results")

# Lower-case the assembler name once instead of once per contig.
assembler_name = assembler.lower()

# Patterns that extract the numeric parts of assembler-specific contig names.
# Compiled once here rather than rebuilt for every record.
spades_num_pattern = re.compile(r'NODE_(.*)_length')   # NODE_<num>_length...
megahit_k_pattern = re.compile(r'k(.*)_')              # k<k>_<num>
megahit_num_pattern = re.compile(r'_(.*)')
sga_num_pattern = re.compile(r'contig-(.*)')           # contig-<num>

i = 1               # ID assigned to the bin currently being processed
contig_bins = []    # rows of [formatted contig ID, bin ID]
bin_ids = []        # rows of [fasta file name, bin ID]

for bin_file in files:

    # Only fasta files are treated as bins; other entries are skipped and
    # do not consume a bin ID.
    if not bin_file.lower().endswith(('.fasta', '.fa', '.fna')):
        continue

    bin_ids.append([str(bin_file), str(i)])

    for record in SeqIO.parse(os.path.join(contig_bins_folder, bin_file), "fasta"):

        contig_name = str(record.id)
        line = []

        try:
            if assembler_name == "spades":
                contig_num = int(spades_num_pattern.search(contig_name).group(1))
                line.append('NODE_' + str(contig_num))

            elif assembler_name == "megahit":
                k_num = int(megahit_k_pattern.search(contig_name).group(1))
                contig_num = int(megahit_num_pattern.search(contig_name).group(1))
                line.append('k' + str(k_num) + '_' + str(contig_num))

            elif assembler_name == "sga":
                contig_num = int(sga_num_pattern.search(contig_name).group(1))
                line.append('contig-' + str(contig_num))

            elif assembler_name == "canu" or assembler_name == "flye":
                # Long-read assemblies: the contig name is used unchanged.
                line.append(str(contig_name))

        # AttributeError: the pattern did not match (search() returned None).
        # ValueError: the matched text is not an integer.
        except (AttributeError, ValueError):
            print("\nContig naming does not match with the assembler type provided. Please make sure to provide the correct assembler type.")
            print("\nExiting prepResult.py...\nBye...!\n")
            sys.exit(1)

        line.append(str(i))
        contig_bins.append(line)

    # Advance to the next bin ID once per fasta file.
    i = i + 1
# Write binning results to output file.
#---------------------------------------------------

print("\nWriting initial binning results to output file")

# Files handed to csv.writer are opened with newline='' as the csv module
# requires; without it, extra blank rows appear on platforms that translate
# "\n" on write (e.g. Windows).

# One row per contig: formatted contig ID, bin ID.
with open(output_path + prefix + 'initial_contig_bins.csv', mode='w', newline='') as contig_bins_file:
    contig_writer = csv.writer(contig_bins_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    contig_writer.writerows(contig_bins)

# One row per bin: fasta file name, bin ID.
with open(output_path + prefix + 'bin_ids.csv', mode='w', newline='') as bin_ids_file:
    bin_id_writer = csv.writer(bin_ids_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    bin_id_writer.writerows(bin_ids)

# The file objects are closed here, but their .name attribute is still available.
print("\nFormatted initial binning results can be found at", contig_bins_file.name)
print("\nBin IDs and corresponding names of fasta files can be found at", bin_ids_file.name)


# Exit program
#--------------

print("\nThank you for using prepResult for GraphBin!\n")