-
Notifications
You must be signed in to change notification settings - Fork 27
/
augur_from_assemblies.wdl
204 lines (178 loc) · 8.2 KB
/
augur_from_assemblies.wdl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
version 1.0
import "../tasks/tasks_nextstrain.wdl" as nextstrain
import "../tasks/tasks_utils.wdl" as utils
workflow augur_from_assemblies {
meta {
description: "Align assemblies, build trees, and convert to json representation suitable for Nextstrain visualization. See https://nextstrain.org/docs/getting-started/ and https://nextstrain-augur.readthedocs.io/en/stable/"
author: "Broad Viral Genomics"
email: "viral-ngs@broadinstitute.org"
allowNestedInputs: true
}
input {
Array[File]+ assembly_fastas
Array[File]? contextual_genome_fastas
Array[File]+ sample_metadata_tsvs
File ref_fasta
Int min_unambig_genome
File? clades_tsv
Array[String]? ancestral_traits_to_infer
Boolean make_snps_vcf = false
}
parameter_meta {
assembly_fastas: {
description: "Set of assembled genomes to align and build trees. These must represent a single chromosome/segment of a genome only. Fastas may be one-sequence-per-individual or a concatenated multi-fasta (unaligned) or a mixture of the two. They may be compressed (gz, bz2, zst, lz4), uncompressed, or a mixture.",
patterns: ["*.fasta", "*.fa", "*.fasta.gz", "*.fasta.zst"]
}
contextual_genome_fastas: {
description: "Set of near-complete contextual genomes to include in tree build. Each fasta provided must represent a single chromosome/segment of a genome. Fastas may be one-sequence-per-individual or a concatenated multi-fasta (unaligned) or a mixture of the two. They may be compressed (gz, bz2, zst, lz4), uncompressed, or a mixture. ",
patterns: ["*.fasta", "*.fa", "*.fasta.gz", "*.fasta.zst"]
}
sample_metadata_tsvs: {
description: "Tab-separated metadata file that contain binning variables and values. Must contain all samples: output will be filtered to the IDs present in this file.",
patterns: ["*.txt", "*.tsv"]
}
ref_fasta: {
description: "A reference assembly (not included in assembly_fastas) to align assembly_fastas against. Typically from NCBI RefSeq or similar.",
patterns: ["*.fasta", "*.fa"]
}
min_unambig_genome: {
description: "Minimum number of called bases in genome to pass prefilter."
}
ancestral_traits_to_infer: {
description: "A list of metadata traits to use for ancestral node inference (see https://nextstrain-augur.readthedocs.io/en/stable/usage/cli/traits.html). Multiple traits may be specified; must correspond exactly to column headers in metadata file. Omitting these values will skip ancestral trait inference, and ancestral nodes will not have estimated values for metadata."
}
clades_tsv: {
description: "A TSV file containing clade mutation positions in four columns: [clade gene site alt]; see: https://nextstrain.org/docs/tutorials/defining-clades",
patterns: ["*.tsv", "*.txt"]
}
}
#### mafft_and_snp
call utils.zcat {
input:
infiles = flatten([assembly_fastas, select_first([contextual_genome_fastas,[]])]),
output_name = "all_samples_combined_assembly.fasta"
}
call utils.filter_sequences_by_length {
input:
sequences_fasta = zcat.combined,
min_non_N = min_unambig_genome
}
call nextstrain.nextstrain_deduplicate_sequences as dedup_seqs {
input:
sequences_fasta = filter_sequences_by_length.filtered_fasta
}
call nextstrain.mafft_one_chr as mafft {
input:
sequences = dedup_seqs.sequences_deduplicated_fasta,
ref_fasta = ref_fasta,
basename = "all_samples_aligned.fasta"
}
if(make_snps_vcf) {
call nextstrain.snp_sites {
input:
msa_fasta = mafft.aligned_sequences
}
}
#### subsample_by_metadata_with_focal
if(length(sample_metadata_tsvs)>1) {
call utils.tsv_join {
input:
input_tsvs = sample_metadata_tsvs,
id_col = 'strain',
out_basename = "metadata-merged"
}
}
call nextstrain.derived_cols {
input:
metadata_tsv = select_first(flatten([[tsv_join.out_tsv], sample_metadata_tsvs]))
}
call nextstrain.filter_subsample_sequences as prefilter {
input:
sequences_fasta = mafft.aligned_sequences,
sample_metadata_tsv = derived_cols.derived_metadata
}
call utils.fasta_to_ids {
input:
sequences_fasta = prefilter.filtered_fasta
}
#### augur_from_msa
call nextstrain.augur_mask_sites {
input:
sequences = prefilter.filtered_fasta
}
call nextstrain.draft_augur_tree {
input:
msa_or_vcf = augur_mask_sites.masked_sequences
}
call nextstrain.refine_augur_tree {
input:
raw_tree = draft_augur_tree.aligned_tree,
msa_or_vcf = augur_mask_sites.masked_sequences,
metadata = derived_cols.derived_metadata
}
if(defined(ancestral_traits_to_infer) && length(select_first([ancestral_traits_to_infer,[]]))>0) {
call nextstrain.ancestral_traits {
input:
tree = refine_augur_tree.tree_refined,
metadata = derived_cols.derived_metadata,
columns = select_first([ancestral_traits_to_infer,[]])
}
}
call nextstrain.ancestral_tree {
input:
tree = refine_augur_tree.tree_refined,
msa_or_vcf = augur_mask_sites.masked_sequences
}
call nextstrain.translate_augur_tree {
input:
tree = refine_augur_tree.tree_refined,
nt_muts = ancestral_tree.nt_muts_json
}
call nextstrain.tip_frequencies {
input:
tree = refine_augur_tree.tree_refined,
metadata = derived_cols.derived_metadata
}
if(defined(clades_tsv)) {
call nextstrain.assign_clades_to_nodes {
input:
tree_nwk = refine_augur_tree.tree_refined,
nt_muts_json = ancestral_tree.nt_muts_json,
aa_muts_json = translate_augur_tree.aa_muts_json,
ref_fasta = ref_fasta,
clades_tsv = select_first([clades_tsv])
}
}
call nextstrain.export_auspice_json {
input:
tree = refine_augur_tree.tree_refined,
sample_metadata = derived_cols.derived_metadata,
node_data_jsons = select_all([
refine_augur_tree.branch_lengths,
ancestral_traits.node_data_json,
ancestral_tree.nt_muts_json,
translate_augur_tree.aa_muts_json,
assign_clades_to_nodes.node_clade_data_json])
}
output {
File combined_assemblies = filter_sequences_by_length.filtered_fasta
File multiple_alignment = mafft.aligned_sequences
File? unmasked_snps = snp_sites.snps_vcf
File metadata_merged = derived_cols.derived_metadata
File keep_list = fasta_to_ids.ids_txt
File subsampled_sequences = prefilter.filtered_fasta
Int sequences_kept = prefilter.sequences_out
File masked_alignment = augur_mask_sites.masked_sequences
File ml_tree = draft_augur_tree.aligned_tree
File time_tree = refine_augur_tree.tree_refined
Array[File] node_data_jsons = select_all([
refine_augur_tree.branch_lengths,
ancestral_traits.node_data_json,
ancestral_tree.nt_muts_json,
translate_augur_tree.aa_muts_json,
assign_clades_to_nodes.node_clade_data_json])
File auspice_input_json = export_auspice_json.virus_json
File tip_frequencies_json = tip_frequencies.node_data_json
File root_sequence_json = export_auspice_json.root_sequence_json
}
}