-
Notifications
You must be signed in to change notification settings - Fork 0
/
recalc-ani.py
61 lines (52 loc) · 2.56 KB
/
recalc-ani.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
# goal: read in prefetch file(s); recalc ANI if necessary
import sys
import argparse
import csv
#import pandas as pd
import numpy as np
from sourmash.logging import notify
from sourmash.distance_utils import containment_to_distance
def main(args):
    """Concatenate prefetch CSVs into one CSV, filling in missing containment ANI.

    Input paths come from ``args.prefetch_csvs`` plus every non-blank line of
    each file named in ``args.from_file``.  The output header is taken from
    the first input that has a header row, extended with any containment-ANI
    columns that input lacks.  Each row whose ``query_containment_ani`` is
    absent or empty gets its four containment-ANI columns recomputed from
    ``f_match_query``, ``f_query_match``, ``ksize``, ``scaled`` and
    ``query_bp``; every row is then written to ``args.output_csv``.

    Args:
        args: namespace with ``prefetch_csvs`` (list of paths), ``from_file``
            (list of path-list files, or None) and ``output_csv`` (path).

    Returns:
        None.

    Raises:
        OSError: if an input or output file cannot be opened.
        KeyError, ValueError: if a row needing recalculation lacks a required
            column or holds a non-numeric value; ValueError also if a row has
            columns that are not in the output header.
    """
    ani_columns = (
        "query_containment_ani",
        "match_containment_ani",
        "average_containment_ani",
        "max_containment_ani",
    )

    # handle file input; copy so the caller's args.prefetch_csvs is not mutated
    prefetch_csvs = list(args.prefetch_csvs or [])
    for list_path in args.from_file or []:
        with open(list_path, 'r') as list_fh:
            for line in list_fh:
                csv_path = line.strip()
                # blank lines (e.g. a trailing newline) are not paths
                if csv_path:
                    prefetch_csvs.append(csv_path)

    # read in each file and copy its rows to the output
    writer = None
    # newline='' is what the csv module requires of files handed to it
    with open(args.output_csv, 'w', newline='') as outF:
        for csv_path in prefetch_csvs:
            with open(csv_path, 'r', newline='') as pf:
                pf_r = csv.DictReader(pf)
                if pf_r.fieldnames is None:
                    # completely empty file: no header and no rows to copy
                    continue
                if writer is None:
                    fieldnames = list(pf_r.fieldnames)
                    # make room for recalculated values if the input predates
                    # the ANI columns
                    fieldnames += [c for c in ani_columns if c not in fieldnames]
                    writer = csv.DictWriter(outF, fieldnames=fieldnames)
                    writer.writeheader()
                for row in pf_r:
                    if not row.get("query_containment_ani"):
                        # grab required columns
                        q_containment = float(row['f_match_query'])
                        m_containment = float(row['f_query_match'])
                        ksize = int(row['ksize'])
                        scaled = int(row['scaled'])
                        # NOTE(review): query_bp is used as n_unique_kmers for
                        # both directions -- confirm this is intended for the
                        # match-side estimate.
                        n_unique_kmers = int(row['query_bp'])
                        # recalculate containment ani
                        query_ani_res = containment_to_distance(
                            q_containment, ksize, scaled,
                            n_unique_kmers=n_unique_kmers)
                        match_ani_res = containment_to_distance(
                            m_containment, ksize, scaled,
                            n_unique_kmers=n_unique_kmers)
                        # don't let any ANI values get zeroed out -->
                        # estimate independently, straight from each distance
                        query_ani = 1 - query_ani_res.dist
                        match_ani = 1 - match_ani_res.dist
                        row["query_containment_ani"] = query_ani
                        row["match_containment_ani"] = match_ani
                        row["average_containment_ani"] = np.mean([query_ani, match_ani])
                        row["max_containment_ani"] = max(query_ani, match_ani)
                    writer.writerow(row)
if __name__ == "__main__":
    # Command-line entry point: positional prefetch CSVs, optional list
    # file(s) naming more of them, and a mandatory output path.
    parser = argparse.ArgumentParser()
    parser.add_argument('prefetch_csvs', nargs='*')
    parser.add_argument(
        '--from-file',
        '--prefetch-from-file',
        nargs="*",
        help="file(s) containing paths to prefetch csvs",
    )
    parser.add_argument(
        '-o',
        '--output-csv',
        required=True,
        help='output csv',
    )
    cli_args = parser.parse_args()
    # main() returns None, so a normal run exits with status 0
    exit_status = main(cli_args)
    sys.exit(exit_status)