-
Notifications
You must be signed in to change notification settings - Fork 7
/
rfplasmid.py
476 lines (412 loc) · 20.3 KB
/
rfplasmid.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
#!/usr/bin/env python3
import os, sys, glob, argparse, shutil, multiprocessing
import pandas as pd
import numpy as np
from Bio import SeqIO
from collections import defaultdict
from itertools import groupby
from datetime import datetime
from contextlib import suppress
scriptlocation = os.path.dirname(os.path.realpath(__file__))
def cpu_threads(max_threads):
if multiprocessing.cpu_count() > max_threads:
return max_threads
else:
return multiprocessing.cpu_count()
# Import arguments
parser = argparse.ArgumentParser()
parser.add_argument("--species", help="define species (required)", required=('--input') in sys.argv and len(sys.argv) != 1)
parser.add_argument("--input", help="directory with input fasta files (required)", required=('--species') in sys.argv and len(sys.argv) != 1)
parser.add_argument("--specieslist", help="list of available species models", action="store_true", default=False)
parser.add_argument("--jelly", help="run jellyfish as kmer-count (faster)", action="store_true", default=False)
parser.add_argument("--out", help="specify output directory", default=False)
parser.add_argument("--debug", help="no cleanup of intermediate files", action="store_true", default=False)
parser.add_argument("--training", help="trainings mode Random Forest", action="store_true", default=False)
parser.add_argument("--threads", help="specify number of threads to be used, default is max available threads up to 16 threads", default=cpu_threads(16), type=int)
parser.add_argument("--version", help="print version number", action="store_true")
args = parser.parse_args()
species_import = args.species
input_directory = args.input
species_file = os.path.join(scriptlocation, "specieslist.txt")
df_species = pd.read_csv(species_file, header=None, sep=' ', names=['species', 'level'])
level_import = next(iter(df_species.loc[df_species['species'] == species_import, 'level']), 'no match')
species_list = df_species['species'].tolist()
# version
version = "1.0"
if args.version:
print('RFPlasmid version %s' % version)
sys.exit()
if args.specieslist:
print('Available species models: \n{}'.format(df_species.species.to_csv(index=False, header=False)))
sys.exit()
if len(sys.argv) <= 1:
print("Error; no arguments. Required to specificy --input and --species")
parser.print_help(sys.stderr)
sys.exit()
# check if input is directory
if not os.path.isdir(input_directory):
print('%r is not a directory. Please use directory as input.' % input_directory)
sys.exit()
if len(sys.argv) == 1:
parser.print_help(sys.stderr)
sys.exit()
# check if species exist in list
if not args.species in species_list:
print('Error: species classification model is not available')
print('Available species models: \n{}'.format(df_species.species.to_csv(index=False, header=False)))
sys.exit()
print('Start RFPlasmid, version %s' % version)
#make output folder
if args.out:
new_dir = args.out
if not os.path.exists(new_dir):
os.mkdir(new_dir)
else:
now = datetime.now().strftime('%Y%m%d_%H%M%S')
new_dir = new_dir + '_' + now
os.mkdir(new_dir)
if not args.out:
now = datetime.now().strftime('%Y%m%d_%H%M%S')
new_dir = 'RFPlasmid_output' + '_' + now
os.mkdir(new_dir)
# copy original contigIDs
print('copy original contig names')
for fasta_file in glob.glob(os.path.join(input_directory, '*.fasta')):
basename = os.path.splitext(os.path.basename(fasta_file))[0]
with open(fasta_file, 'r') as f_fasta:
i = 0
records = SeqIO.parse(f_fasta, 'fasta')
for record in records:
i = i + 1
contig_id = basename + '\t' + str(i) + '\t' + record.description
with open(new_dir+'/contigID.out', 'a') as ID_file:
ID_file.write(contig_id + '\n')
# cleanup fasta files
print('cleanup contig names')
for fasta_file in glob.glob(os.path.join(input_directory, '*.fasta')):
basename = os.path.splitext(os.path.basename(fasta_file))[0]
new_file = os.path.join(new_dir, basename+'.fasta')
i = 0
with open(fasta_file, 'r') as file_fasta, open(new_file, 'w') as new:
for i, seq_record in enumerate(SeqIO.parse(file_fasta, "fasta")):
seq_record.id = str(i + 1)
seq_record.description = ""
SeqIO.write(seq_record, new, "fasta")
# change working directory to new directory
os.chdir(new_dir)
# Checkm
print('start Checkm')
if args.species == 'Generic':
species_import = 'Bacteria'
os.system('checkm taxonomy_wf {} {} . checkm_output -x fasta --nt -t 16 > checkm_output.tsv'.format(level_import, species_import))
# check if Checkm files are made
if not os.path.isfile("./checkm_output/{file}.ms".format(file=species_import)):
print("\nError: Checkm is not working properly.\nHave you installed the Checkm databases according to the github page of Checkm https://ecogenomics.github.io/CheckM/?")
sys.exit()
# check if contigs contain ORFs
for fasta_file in glob.glob('*.fasta'):
basename = os.path.splitext(os.path.basename(fasta_file))[0]
if not os.path.isfile("./checkm_output/bins/{contig}/hmmer.analyze.txt".format(contig=basename)):
print('Error: File {} is too short for prediction'.format(basename))
sys.exit()
print('Checkm done')
# Blast plasmiddb genes
print('start blast plasmiddb')
plasmiddb_total_file = os.path.join(scriptlocation, "plasmiddb_total")
for root, dirs, files in os.walk(os.getcwd()):
filepath = os.path.join(root, 'genes.faa')
basename = os.path.basename(os.path.dirname(filepath))
if os.path.isfile(filepath):
output_file = os.path.join(root, 'blast_plasmiddb_output.txt')
os.system('diamond blastp -d {} -q {} -e 1E-30 -k 1 > {}'.format(plasmiddb_total_file, filepath, output_file))
else:
continue
for root, dirs, files in os.walk(os.getcwd()):
filepath = os.path.join(root, 'blast_plasmiddb_output.txt')
if os.path.isfile(filepath):
with open(filepath, 'r') as f1:
with suppress(ValueError):
df1 = pd.read_csv(f1, header=None, delim_whitespace=True, usecols=[0,2], names=['contig_gene', 'hit'])
df1['hit'] = df1['hit'].round(0).astype(int)
df1 = df1.loc[df1['hit'] > 80]
df1['contig_gene'] = df1['contig_gene'].drop_duplicates()
df1[['contig', 'gene']] = df1['contig_gene'].str.split('_', n=1, expand=True)
df2 = df1.groupby(['contig']).size().reset_index(name='counts')
output_file = os.path.join(root, 'plasmiddbgenes_per_contig.txt')
with open(output_file, 'w') as out_file:
df2.to_csv(out_file, sep=' ', index=False, header=False)
else:
continue
print('blast plasmid genes done')
# Start blast cge genes
print('start blast cge')
cgedb_file = os.path.join(scriptlocation, "plasmiddb_cge")
for fasta_file in glob.glob('*.fasta'):
basename = os.path.splitext(os.path.basename(fasta_file))[0]
output_file = basename + '_blast_plasmidcge_output.txt'
os.system('diamond blastx -d {} -q {} -e 1E-30 -k 1 > {}'.format(cgedb_file, fasta_file, output_file))
for cge_file in glob.glob('*_blast_plasmidcge_output.txt'):
with open(cge_file, 'r') as f1:
with suppress(ValueError):
genome_name = str(cge_file).split('_blast')[:-1][0]
df1 = pd.read_csv(f1, header=None, delim_whitespace=True, usecols=[0,2], names=['contig', 'hit'])
df1['hit'] = df1['hit'].astype(float)
df1 = df1.loc[df1.groupby('contig')['hit'].idxmax()]
df1['genome'] = genome_name
df1.to_csv('blast_plasmid_cge_outputpercontig.out', index=False, header=False, sep=' ', mode='a')
print('blast cge genes done')
# Start count genes per contig
for root, dirs, files in os.walk(os.getcwd()):
filepath = os.path.join(root, 'genes.faa')
basename = os.path.basename(os.path.dirname(filepath))
if os.path.isfile(filepath):
with open(filepath, 'r') as faa_file:
id = []
for gene in faa_file:
if gene.startswith(">"):
gene = gene.split(' ')[0]
id.append(gene[1:])
df = pd.DataFrame({'id': id})
df[['contig', 'gene']] = df['id'].str.split('_', n=1, expand=True)
df2 = df['contig'].value_counts()
output_file = os.path.join(root, 'genespercontig.txt')
with open(output_file, 'w') as out_file:
df2.to_csv(out_file, sep=' ', index=True, header=False)
print('count genes per contig done')
# Start count SCMs per contig
for root, dirs, files in os.walk(os.getcwd()):
filepath = os.path.join(root, 'hmmer.analyze.txt')
basename = os.path.basename(os.path.dirname(filepath))
if os.path.isfile(filepath):
with open(filepath, 'r') as scm_file:
with suppress(ValueError):
df = pd.read_csv(scm_file, header=None, delim_whitespace=True, usecols=[0], names=['contig_gene'])
df = df[~df['contig_gene'].str.contains("#")]
df['contig_gene'] = df['contig_gene'].drop_duplicates()
df[['contig', 'gene']] = df['contig_gene'].str.split('_', n=1, expand=True)
df2 = df.groupby(['contig']).size().reset_index(name='counts')
output_file = os.path.join(root, 'SCMpercontig.txt')
with open(output_file, 'w') as out_file:
df2.to_csv(out_file, sep=' ', index=False, header=False)
print('count SCMs per contig done')
print('Start kmer count')
if args.jelly:
print('You have selected kmer-counting with jellyfish')
kmer_dir = 'fasta_split'
os.mkdir(kmer_dir)
for fasta_file in glob.glob('*.fasta'):
basename = os.path.splitext(os.path.basename(fasta_file))[0]
for record in SeqIO.parse(fasta_file,"fasta"):
id = record.id
id_file = basename + '_' + id + '.fasta'
with open(os.path.join(kmer_dir, id_file), 'w') as out:
out.write('>'+record.id+'\n'+str(record.seq + 'NN' + record.seq.reverse_complement() + '\n'))
os.chdir(kmer_dir)
for split_fasta in glob.glob('*.fasta'):
basename = os.path.splitext(os.path.basename(split_fasta))[0]
os.system('cat {} | jellyfish count /dev/fd/0 -m 5 -c 20 -s 1000000 -t 16 -o /dev/fd/1 |jellyfish dump -c -t /dev/fd/0 |while read kmer ; do echo {} $kmer ; done >> ../kmers.out'.format(split_fasta, basename))
os.chdir("../")
print('Jellyfish kmer-count done')
if not (args.jelly):
print('Standard kmer-counting method')
kmer_dir = 'fasta_split'
os.mkdir(kmer_dir)
def count_kmers(read, k):
counts = defaultdict(list)
num_kmers = len(read) - k + 1
for i in range(num_kmers):
kmer = read[i:i+k]
if kmer not in counts:
counts[kmer] = 0
counts[kmer] += 1
return(counts)
for fasta_file in glob.glob('*.fasta'):
basename = os.path.splitext(os.path.basename(fasta_file))[0]
with open(fasta_file) as f_fasta:
for k, g in groupby(f_fasta, lambda x: x.startswith('>')):
if k:
sequence = next(g).strip('>\n')
else:
d1 = list(''.join(line.strip() for line in g))
d1 = [item.upper() for item in d1]
d2 = ''.join(d1)
complement = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A', 'N': 'N', 'U': 'N', 'M': 'N', 'R': 'N', 'W': 'N', 'S': 'N', 'Y': 'N', 'K': 'N', 'V': 'N', 'H': 'N', 'D': 'N', 'B': 'N', 'X': 'N'}
reverse_complement = ''.join(complement.get(base, base) for base in reversed(d2))
d3 = (d2+'NN'+reverse_complement)
counting = count_kmers(d3, 5)
for item in counting:
output = basename, sequence, item, counting[item]
with open('kmer.out', 'a') as text_file:
text_file.write("%s %s %s %s\n" % output)
print('Kmer count standard method ready')
print('Count contig length')
for fasta_file in glob.glob('*.fasta'):
basename = os.path.splitext(os.path.basename(fasta_file))[0]
with open(fasta_file) as f_fasta:
for k, g in groupby(f_fasta, lambda x: x.startswith('>')):
if k:
sequence = next(g).strip('>\n')
else:
d1 = list(''.join(line.strip() for line in g))
d2 = ''.join(d1)
data = sum(1 for ch in d2)
output = basename, sequence, data
with open('count_contiglength.out', 'a') as out_file:
out_file.write("%s %s %s\n" % output)
print('count contig length ; done')
# next step: Work on files in different folders and merge to separate dfs
print('Start merging contig files')
import warnings
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)
for root, dirs, files in os.walk(os.getcwd()):
filepath = os.path.join(root, 'genespercontig.txt')
if os.path.isfile(filepath):
with open(filepath, 'r') as f1:
df1 = pd.read_csv(f1, header=None, delim_whitespace=True, names=['contig', 'genes'])
df1['genome'] = os.path.basename(os.path.dirname(filepath))
else:
continue
filepath = os.path.join(root, 'SCMpercontig.txt')
if os.path.isfile(filepath):
with open(filepath, 'r') as f2:
df2 = pd.read_csv(f2, header=None, delim_whitespace=True, names=['contig', 'SCM'])
df2['genome'] = os.path.basename(os.path.dirname(filepath))
if not os.path.isfile(filepath):
df2 = df1
df2 = df2.rename(columns={'genes': 'SCM'})
df2['SCM'] = np.NaN
filepath = os.path.join(root, 'plasmiddbgenes_per_contig.txt')
if os.path.isfile(filepath):
with open(filepath, 'r') as f3:
df3 = pd.read_csv(f3, header=None, delim_whitespace=True, names=['contig', 'plasmid_genes'])
df3['genome'] = os.path.basename(os.path.dirname(filepath))
if not os.path.isfile(filepath):
df3 = df1
df3 = df3.rename(columns={'genes': 'plasmid_genes'})
df3['plasmid_genes'] = np.NaN
# merge dataframes for each folder
df_merge1 = pd.merge(df1, df2, left_on=['genome', 'contig'], right_on=['genome', 'contig'], how='left')
df_merge2 = pd.merge(df_merge1, df3, on=['genome', 'contig'], how='left')
df_merge2.to_csv(os.path.join(root,'outputgenesdf.csv'))
print('Create df_files ; done')
# next step: concatenate csv files
dfList = []
for root, dirs, files in os.walk(os.getcwd()):
filepath = os.path.join(root, 'outputgenesdf.csv')
if os.path.isfile(filepath):
with open(filepath, 'r') as fnew:
frame = pd.read_csv(fnew, converters={'genome': lambda x: str(x)})
dfList.append(frame)
df1a = pd.concat(dfList)
df1a['contig'] = df1a['contig'].astype(object)
#import contigIDnumbers
df1b = pd.read_csv('contigID.out', header=None, sep='\t', names=['genome', 'contig', 'contigID'])
df1b['contig'] = df1b['contig'].astype(object)
df1 = pd.merge(df1a, df1b, how='right', left_on=['genome','contig'], right_on=['genome','contig'])
df1['contig'] = df1['contig'].astype(int)
df1['source'] = df1.genome.astype(str).str.cat(df1.contig.astype(str), sep='_')
df1 = df1.set_index('source')
print('Start to add blast output to dataframe')
# next step; make dataframe
# import kmers
kmer_file = os.path.join(scriptlocation, "kmer.txt")
df2 = pd.read_csv(kmer_file, delim_whitespace=True, header=None)
# use row as columnheaders
df2.columns = df2.iloc[0]
df2 = df2.loc[:-1]
# samenvoegen dfs
df3 = pd.concat([df1, df2], axis=1)
df3['source'] = df3.genome.astype(str).str.cat(df3.contig.astype(str), sep='_')
df3 = df3.set_index('source')
# import kmers
if args.jelly:
df4 = pd.read_csv('kmers.out', sep=' ', low_memory=False, header=None, names=['source', 'gene', 'bitscore'])
df4[['genome', 'contig']] = df4['source'].str.rsplit('_', n=1, expand=True)
if not (args.jelly):
df4 = pd.read_csv('kmer.out', sep=' ', low_memory=False, header=None, names=['genome', 'contig', 'gene', 'bitscore'])
df4['source'] = df4.genome.astype(str).str.cat(df4.contig.astype(str), sep='_')
# combine df3 and df4
df5 = df4.pivot_table(index='source', columns='gene', values='bitscore').reindex(index=df3.index, columns=df3.columns).fillna(df3)
df5['genome'] = df5['genome'].astype(object)
df5['contig'] = df5['contig'].astype(object)
# import blastcge_outputpercontig
df5aa = pd.read_csv('blast_plasmid_cge_outputpercontig.out', header=None, delim_whitespace=True, names=["contig", "plasmidcge_id", "genome"])
df5aa['source'] = df5aa.genome.astype(str).str.cat(df5aa.contig.astype(str), sep='_')
df5aa = df5aa.set_index('source')
df5aa['plasmidcge_id'] = df5aa['plasmidcge_id'].astype(object)
df5aa['contig'] = df5aa['contig'].astype(object)
df5aa['genome'] = df5aa['genome'].astype(object)
df5a = pd.merge(df5, df5aa, how='outer')
df5a['source'] = df5a.genome.astype(str).str.cat(df5a.contig.astype(str), sep='_')
df5a['contig'] = df5a['contig'].astype(object)
# import contig length
df5bb = pd.read_csv('count_contiglength.out', header=None, delim_whitespace=True, names=["genome", "contig", "contig_length"])
df5bb['source'] = df5bb.genome.astype(str).str.cat(df5bb.contig.astype(str), sep='_')
df5bb = df5bb.set_index('source')
df5bb['genome'] = df5bb['genome'].astype(object)
df5bb['contig'] = df5bb['contig'].astype(object)
df5bb['contig_length'] = df5bb['contig_length'].astype(object)
df5b = pd.merge(df5a, df5bb, how='outer')
df5b.fillna(int(0), inplace=True)
# add column with kmer_numbers
df5b['kmer_number'] = df5b.contig_length - 4
df5b['kmer_number'] = df5b.kmer_number * 2
# add column with genes/SCM en round to 2 decimals
df5b['SCM_genes'] = df5b['SCM']/df5b['genes']
df5b['SCM_genes'] = df5b['SCM_genes'].fillna(0)
# add column with genes/plasmid_genes en round to 2 decimals
df5b['plasmid_genes_genes'] = df5b['plasmid_genes']/df5b['genes']
df5b['plasmid_genes_genes'] = df5b['plasmid_genes_genes'].fillna(0)
# ID contig column to first position
cols = list(df5b.columns.values)
cols.pop(cols.index('source'))
cols.pop(cols.index('contigID'))
cols.pop(cols.index('contig_length'))
cols.pop(cols.index('kmer_number'))
cols.pop(cols.index('plasmid_genes_genes'))
cols.pop(cols.index('SCM_genes'))
cols.pop(cols.index('contig'))
cols.pop(cols.index('genome'))
cols.pop(cols.index('plasmidcge_id'))
df5b = df5b[['source', 'contigID', 'genome', 'contig', 'contig_length', 'SCM_genes', 'plasmid_genes_genes', 'plasmidcge_id', 'kmer_number']+cols]
df5b.drop('Unnamed: 0', axis=1, inplace=True)
df5da = df5b.loc[:, 'source':'plasmidcge_id']
df5db = df5b.loc[:, 'kmer_number':'TTTTT'].astype(int)
df6 = pd.concat([df5da, df5db], axis=1)
# kmer fraction
df6[['kmer{}'.format(i) for i in range(1, 1+(2002-979)+1)]] = df6.loc[:, 'AAAAA':'TTTTT'].div(df6['kmer_number'], axis=0)
df6.loc[:, 'kmer1':'kmer1024'] = df6.loc[:, 'kmer1':'kmer1024'].applymap('{:.2e}'.format)
# delete some columns
df6.drop(df6.columns.to_series()["AAAAA":"TTTTT"], axis=1, inplace=True)
df6.drop(['contig','genes'], axis=1, inplace=True)
df6.to_csv('outputdataframe.csv', index=False)
print('CSV file from dataframe ready in file outputdataframe.csv')
if args.debug:
print('debug mode, no cleaning up')
if not (args.debug):
print('Cleaning-up intermediate files')
shutil.rmtree(kmer_dir)
shutil.rmtree('checkm_output')
for file in glob.glob('*.txt'):
os.remove(file)
for file in glob.glob('*.out'):
os.remove(file)
for file in glob.glob('*.fasta'):
os.remove(file)
os.remove('checkm_output.tsv')
print('Start R')
if args.training:
print('You have selected training mode')
trainings_location = os.path.join(scriptlocation, "training.R")
os.system('R --vanilla < {}'.format(trainings_location))
print('training done')
if not (args.training):
print('RFPlasmid prediction mode started')
classification_location = os.path.join(scriptlocation, "classification.R")
if args.species == 'Generic':
species_import = 'Generic'
os.system('R --vanilla --args {} {} < {}'.format(species_import, scriptlocation, classification_location))
print('Prediction done')
print('RFPlasmid output can be found in directory %s' %new_dir)
print('If you like this tool, please cite:')
print('van der Graaf-van Bloois, L., Wagenaar, J. A. & Zomer, A. L. RFPlasmid: predicting plasmid sequences from short read assembly data using machine learning. Microb. Genom. 7 (2021), doi.org/10.1099/mgen.0.000683')