-
Notifications
You must be signed in to change notification settings - Fork 1
/
single_end_AMAISE_evaluation.py
72 lines (63 loc) · 3 KB
/
single_end_AMAISE_evaluation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
import os
import subprocess
from accuracy import *
import joblib
# Ground-truth labels for the demo reads, and the FASTQ file that holds the
# corresponding DNA sequences.
truelabels = 'demo_test_data/nanopore_demo_data.txt'
truefile = truelabels
inputfile = 'demo_test_data/nanopore_demo_data.fastq'

# Location of the model used to classify the input sequences.
modelpath = 'models_and_references/single_end_model'

# Cut-offs used to convert AMAISE's output probabilities into classification
# labels.  NOTE(review): the keys look like sequence lengths -- confirm against
# accuracy.ml_rhd, which consumes this mapping.
threshs = {
    25: 0.31313131313131315,
    50: 0.4141414141414142,
    100: 0.5454545454545455,
    150: 0.6262626262626263,
    200: 0.7070707070707072,
    250: 0.6363636363636365,
    300: 0.6666666666666667,
    500: 0.6464646464646465,
    1000: 0.4747474747474748,
    5000: 0.48484848484848486,
    10000: 0.4646464646464647,
}
def _last_report_line(report, label):
    """Return the last line of *report* containing *label*, stripped of whitespace."""
    # `time -v` prints its report after the child exits, so search from the
    # end to avoid matching anything the child itself may have printed.
    for line in reversed(report.splitlines()):
        if label in line:
            return line.strip()
    raise ValueError('%r not found in the output of time -v' % label)


def _clock_to_seconds(clock):
    """Convert an ``h:mm:ss`` or ``m:ss`` clock string into seconds."""
    seconds = 0.0
    for part in clock.split(':'):
        seconds = seconds * 60 + float(part)
    return seconds


def _path_size_bytes(path):
    """Return the size of *path* in bytes, summing every file below it if it is a directory."""
    if os.path.isdir(path):
        return sum(
            os.path.getsize(os.path.join(root, name))
            for root, _dirs, names in os.walk(path)
            for name in names
        )
    return os.path.getsize(path)


def ml(inputfile):
    """Run AMAISE on *inputfile* and record its resource usage and accuracy.

    Inputs:
        inputfile: path of the FASTQ file handed to host_depletion.py.

    Outputs:
        outputfolder ('single_end_output'): passed to host_depletion.py as its
            output folder; '<outputfolder>/mlprobs.txt' is read back from it.
        resourcefile ('<outputfolder>/single_end_resource.txt'): written by this
            function with the speed (s), peak memory usage (GB), total model
            storage (GB), accuracy, sensitivity and specificity.

    The same figures are also printed.  Nothing is returned.

    Raises:
        subprocess.CalledProcessError: if the timed command exits non-zero.
        ValueError: if the `time -v` report lacks the expected lines.
        OSError: if the model path or the resource file cannot be accessed.
    """
    typefile = 'fastq'
    outputfolder = 'single_end_output'
    resourcefile = '%s/single_end_resource.txt' % outputfolder

    # get the elapsed wall clock time and peak memory usage of running AMAISE
    # The command is run without a shell so that `time` always resolves to the
    # external binary (which supports -v) rather than a shell keyword.
    cmd = ['time', '-v', 'taskset', '-c', '0', 'python3', 'host_depletion.py',
           '-i', inputfile, '-t', typefile, '-o', outputfolder]
    print(' '.join(cmd))
    output = subprocess.check_output(cmd, stderr=subprocess.STDOUT)
    report = output.decode("utf-8")

    elapsed_line = _last_report_line(report, 'Elapsed (wall clock) time')
    print(elapsed_line)
    # the label itself contains colons, so only the text after the final ': '
    # is the clock value
    mlspeed = _clock_to_seconds(elapsed_line.rsplit(': ', 1)[-1])

    memory_line = _last_report_line(report, 'Maximum resident set size')
    # the report is divided by 10**6 to express it in GB, as before
    mlressetsize = int(memory_line.rsplit(': ', 1)[-1]) / (10**6)
    print('Maximum resident set size (GB): %0.4f' % mlressetsize)

    # get the total storage needed to run AMAISE
    # Sizes are read from the filesystem directly instead of parsing `ls -l`,
    # whose layout depends on the owning user name.
    mltotalstorage = _path_size_bytes(modelpath) / (10**9)
    print('Total storage (GB): %0.4f' % mltotalstorage)

    # get the accuracy, sensitivity, and specificity from running AMAISE
    accuracy, sens, spec = ml_rhd(truelabels, '%s/mlprobs.txt' % outputfolder, threshs)
    print(accuracy, sens, spec)

    with open(resourcefile, 'w') as f:
        f.write('Speed: %0.10f\n' % mlspeed)
        f.write('Peak Memory Usage: %0.10f\n' % mlressetsize)
        f.write('Total Storage: %0.10f\n' % mltotalstorage)
        f.write('Accuracy: %0.10f\n' % accuracy)
        f.write('Sensitivity: %0.10f\n' % sens)
        f.write('Specificity: %0.10f\n' % spec)
# run the evaluation on the FASTQ file configured in `inputfile`
ml(inputfile)