-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmerge_uc_clusters.py
executable file
·104 lines (90 loc) · 4.25 KB
/
merge_uc_clusters.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
#!/usr/bin/env python
# merge_uc_clusters.py created 2022-12-07
# allow gzip 2023-04-11
# check for write formats from R that cause problems 2024-12-11
"""merge_uc_clusters.py last modified 2024-12-11
combine values of OTU counts based on cluster output from vsearch
./merge_uc_clusters.py cluster.uc otu_table.tab > merged_table.tab
assumes OTU table is in tab-delimited format, as:
seq_name sample1 sample2 sample3
seq1234 0 10 100
seq5678 50 5 0
assumes tab-delimited cluster format from vsearch, as:
S 0 503 * * * * * ASV1210682 *
S 1 503 * * * * * ASV1218341 *
S 2 503 * * * * * ASV1221110 *
after running vsearch:
~/vsearch-2.22.1-linux-x86_64-static/bin/vsearch --cluster_fast dna-sequences.fasta --id 0.97 --centroids centroids.fas --uc clusters.uc
"""
import sys
import gzip
from collections import defaultdict
if len(sys.argv) < 2:
sys.exit(__doc__)
else:
cluster_file = sys.argv[1]
# 1. Record type: S, H, or C. Each fasta sequence in the input file can be either a cluster centroid (S) or a hit (H) assigned to a cluster. Cluster records (C) summarize information (size, centroid label) for each cluster.
# 2. Cluster number (zero-based).
# 3. Sequence length (S, H), or cluster size (C).
# 4. Percentage of similarity with the centroid sequence (H), or set to ’*’ (S,C).
# 5. Match orientation + or - (H), or set to ’*’ (S, C).
# 6. Not used, always set to ’*’ (S, C) or 0 (H).
# 7. Not used, always set to ’*’ (S, C) or 0 (H).
# 8. Not used, always set to ’*’.
# 9. Label of the query sequence (H), or of the centroid sequence (S, C).
# 10. Label of the centroid sequence (H), or set to ’*’ (S, C).
# S 3 454 * * * * * 64814cf8dbb546e9c8b75f122ad7cdc0 *
# H 3 453 99.8 + 0 0 453MI 6564221144419fdc7114c189a9b11116 64814cf8dbb546e9c8b75f122ad7cdc0
seqid_to_centroid = {} # key is seqid, value is centroid id
sys.stderr.write("# Reading cluster data from {}\n".format(cluster_file) )
for line in open(cluster_file,'r'):
line = line.strip()
if line:
lsplits = line.split("\t")
record_type = lsplits[0]
if record_type == "S": # centroid sequence
seqid_to_centroid[ lsplits[8] ] = lsplits[8]
elif record_type == "H": # hit to a cluster
seqid_to_centroid[ lsplits[8] ] = lsplits[9]
else: # implicitly "C" meaning cluster record
pass
sys.stderr.write("# Counted {} seqs from cluster data\n".format( len(seqid_to_centroid) ) )
line_counter = 0
counts_table_file = sys.argv[2]
merged_counts_dict = {}
if counts_table_file.rsplit('.',1)[-1]=="gz": # autodetect gzip format
_opentype = gzip.open
sys.stderr.write("# Reading counts table from {} as gzipped\n".format(counts_table_file) )
else: # otherwise assume normal open
_opentype = open
sys.stderr.write("# Reading counts table from {}\n".format(counts_table_file) )
for line in _opentype(counts_table_file,'rt'):
line = line.rstrip() # use rstrip() in case of initial empty tab from R
if line:
line_counter += 1
lsplits = line.split("\t")
if line_counter==1: # first line
n_samples = len(lsplits[1:])
if lsplits[0]=="": # first column is empty, give name
line = "sampleID" + line
sys.stderr.write("# Detected {} samples\n".format(n_samples) )
print(line, file=sys.stdout) # use print to add back rstripped return
continue
seqid = lsplits[0] # seq name should always be first column
cluster_target = seqid_to_centroid.get(seqid, None)
if cluster_target is None:
sys.stderr.write("# WARNING: no cluster found for {}\n".format(seqid) )
cluster_target = seqid
# for each value in the line, convert to float
seq_counts = [ float(x) for x in lsplits[1:] ]
# if that cluster already exists, add to existing values
if cluster_target in merged_counts_dict:
merged_counts_dict[cluster_target] = [ i+j for i,j in zip(seq_counts, merged_counts_dict.get(cluster_target) ) ]
else: # otherwise create cluster
merged_counts_dict[cluster_target] = seq_counts
sys.stderr.write("# Read {} lines from {}\n".format(line_counter, counts_table_file) )
# write new clusters
for k,v in merged_counts_dict.items():
sys.stdout.write("{}\t{}\n".format(k, "\t".join(list(map(str,v))) ) )
sys.stderr.write("# Wrote {} clusters\n".format( len(merged_counts_dict) ) )
#