add option to skip cluster sets #161

Merged · 7 commits · Apr 4, 2022
2 changes: 1 addition & 1 deletion .github/workflows/ci.yml
@@ -57,7 +57,7 @@ jobs:
matrix:
# Nextflow versions: check pipeline minimum and current latest
nxf_ver: ["21.10.3", ""]
profile: ["test_tcr", "test_no_umi", "test_reveal", "test_tcr_thr"]
profile: ["test_tcr", "test_no_umi", "test_reveal", "test_tcr_thr", "test_nocluster"]
steps:
- name: Check out pipeline code
uses: actions/checkout@v2
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -15,6 +15,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
- [#143](https://github.com/nf-core/airrflow/pull/143) Template update to nf-core tools v2.2.
- [#150](https://github.com/nf-core/airrflow/pull/150) Added option to search for reverse primers.
- [#159](https://github.com/nf-core/airrflow/pull/159) Template update to nf-core tools v2.3.1.
- [#161](https://github.com/nf-core/airrflow/pull/161) Add option to skip clustering sequences in the UMI workflow.

### `Fixed`

90 changes: 56 additions & 34 deletions bin/log_parsing.py
@@ -5,19 +5,39 @@
import pandas as pd
import subprocess
import re
import argparse


parser = argparse.ArgumentParser(description='Parse logs to identify the number of sequences passing through every step.')
parser.add_argument('-c', '--cluster_sets',
                    help='Include the cluster_sets process in the parsed logs',
                    action='store_true')
args = parser.parse_args()

# Processes
processes = [
"filter_by_sequence_quality",
"mask_primers",
"pair_sequences",
"cluster_sets",
"build_consensus",
"repair_mates",
"assemble_pairs",
"deduplicates",
"igblast",
]
if args.cluster_sets:
processes = [
"filter_by_sequence_quality",
"mask_primers",
"pair_sequences",
"build_consensus",
"repair_mates",
"assemble_pairs",
"deduplicates",
"igblast",
"cluster_sets",
]
else:
processes = [
"filter_by_sequence_quality",
"mask_primers",
"pair_sequences",
"build_consensus",
"repair_mates",
"assemble_pairs",
"deduplicates",
"igblast",
]

# Path of logs will be:
# process_name/sample_name_command_log.txt
@@ -76,7 +96,6 @@
for logfile in log_files:
c = 0
with open(logfile, "r") as f:
# print(f.read())
for line in f:
if " START>" in line:
if c < 1:
@@ -127,7 +146,6 @@

for logfile in log_files:
with open(logfile, "r") as f:
# print(f.read())
for line in f:
if " START>" in line:
s_code.append(logfile.split("/")[1].split("_command_log")[0])
@@ -162,7 +180,6 @@

for logfile in log_files:
with open(logfile, "r") as f:
# print(f.read())
for line in f:
if " START>" in line:
s_code.append(logfile.split("/")[1].split("_command_log")[0])
@@ -327,6 +344,7 @@

df_process_list.append(df_process)

# Table column names

colnames = [
"Sample",
@@ -344,22 +362,24 @@
"Igblast",
]


values = [
df_process_list[0].sort_values(by=["Sample"]).iloc[:, 0].tolist(),
df_process_list[0].sort_values(by=["Sample"]).loc[:, "start_R1"].tolist(),
df_process_list[0].sort_values(by=["Sample"]).loc[:, "start_R2"].tolist(),
df_process_list[0].sort_values(by=["Sample"]).loc[:, "pass_R1"].tolist(),
df_process_list[0].sort_values(by=["Sample"]).loc[:, "pass_R2"].tolist(),
df_process_list[1].sort_values(by=["Sample"]).loc[:, "pass_R1"].tolist(),
df_process_list[1].sort_values(by=["Sample"]).loc[:, "pass_R2"].tolist(),
df_process_list[2].sort_values(by=["Sample"]).loc[:, "pass_pairs"].tolist(),
df_process_list[5].sort_values(by=["Sample"]).loc[:, "pass_pairs"].tolist(),
df_process_list[6].sort_values(by=["Sample"]).loc[:, "pass_pairs"].tolist(),
df_process_list[7].sort_values(by=["Sample"]).loc[:, "unique"].tolist(),
df_process_list[8].sort_values(by=["Sample"]).loc[:, "repres_2"].tolist(),
df_process_list[8].sort_values(by=["Sample"]).loc[:, "pass_igblast"].tolist(),
df_process_list[0].sort_values(by=["Sample"]).iloc[:, 0].tolist(),
df_process_list[0].sort_values(by=["Sample"]).loc[:, "start_R1"].tolist(),
df_process_list[0].sort_values(by=["Sample"]).loc[:, "start_R2"].tolist(),
df_process_list[0].sort_values(by=["Sample"]).loc[:, "pass_R1"].tolist(),
df_process_list[0].sort_values(by=["Sample"]).loc[:, "pass_R2"].tolist(),
df_process_list[1].sort_values(by=["Sample"]).loc[:, "pass_R1"].tolist(),
df_process_list[1].sort_values(by=["Sample"]).loc[:, "pass_R2"].tolist(),
df_process_list[2].sort_values(by=["Sample"]).loc[:, "pass_pairs"].tolist(),
df_process_list[4].sort_values(by=["Sample"]).loc[:, "pass_pairs"].tolist(),
df_process_list[5].sort_values(by=["Sample"]).loc[:, "pass_pairs"].tolist(),
df_process_list[6].sort_values(by=["Sample"]).loc[:, "unique"].tolist(),
df_process_list[7].sort_values(by=["Sample"]).loc[:, "repres_2"].tolist(),
df_process_list[7].sort_values(by=["Sample"]).loc[:, "pass_igblast"].tolist(),
]


# Tables provide extra info and help debugging
df_process_list[0].to_csv(
path_or_buf="Table_all_details_filter_quality.tsv",
@@ -374,30 +394,32 @@
path_or_buf="Table_all_details_paired.tsv", sep="\t", header=True, index=False
)
df_process_list[3].to_csv(
path_or_buf="Table_all_details_cluster_sets.tsv", sep="\t", header=True, index=False
)
df_process_list[4].to_csv(
path_or_buf="Table_all_details_build_consensus.tsv",
sep="\t",
header=True,
index=False,
)
df_process_list[5].to_csv(
df_process_list[4].to_csv(
path_or_buf="Table_all_details_repaired.tsv", sep="\t", header=True, index=False
)
df_process_list[6].to_csv(
df_process_list[5].to_csv(
path_or_buf="Table_all_details_assemble_mates.tsv",
sep="\t",
header=True,
index=False,
)
df_process_list[7].to_csv(
df_process_list[6].to_csv(
path_or_buf="Table_all_details_deduplicate.tsv", sep="\t", header=True, index=False
)
df_process_list[8].to_csv(
df_process_list[7].to_csv(
path_or_buf="Table_all_details_igblast.tsv", sep="\t", header=True, index=False
)

if args.cluster_sets:
df_process_list[8].to_csv(
path_or_buf="Table_all_details_cluster_sets.tsv", sep="\t", header=True, index=False
)

final_table = dict(zip(colnames, values))
print(final_table)
df_final_table = pd.DataFrame.from_dict(final_table)
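With the new flag, the parser handles both workflow layouts. A minimal usage sketch, assuming the script is run from a directory containing the per-process log folders (process_name/sample_name_command_log.txt, as the script's own comment describes):

# Clustering enabled (the pipeline default): the cluster_sets logs are
# parsed as well, and Table_all_details_cluster_sets.tsv is written.
log_parsing.py --cluster_sets

# Clustering skipped: the cluster_sets folder is neither expected nor parsed.
log_parsing.py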
36 changes: 36 additions & 0 deletions conf/test_nocluster.config
@@ -0,0 +1,36 @@
/*
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Nextflow config file for running minimal tests
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Defines input files and everything required to run a fast and simple pipeline test.

Use as follows:
nextflow run nf-core/airrflow -profile test_nocluster,<docker/singularity> --outdir <OUTDIR>

----------------------------------------------------------------------------------------
*/

params {
config_profile_name = 'Test profile'
config_profile_description = 'Test pipeline without the cluster sets process'

// Limit resources so that this can run on GitHub Actions
max_cpus = 2
max_memory = '6.GB'
max_time = '6.h'

// Input data
input = 'https://raw.githubusercontent.com/nf-core/test-datasets/airrflow/testdata-bcr/Metadata_test.tsv'
cprimers = 'https://raw.githubusercontent.com/nf-core/test-datasets/airrflow/testdata-bcr/C_primers.fasta'
vprimers = 'https://raw.githubusercontent.com/nf-core/test-datasets/airrflow/testdata-bcr/V_primers.fasta'

subworkflow = 'bcellmagic'

library_generation_method = 'specific_pcr_umi'
cprimer_position = 'R1'
index_file = true
umi_length = 8
umi_start = 6
umi_position = 'R1'
cluster_sets = false
}
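The header comment gives the generic invocation; as a concrete illustration (the docker profile and output directory are arbitrary example choices):

nextflow run nf-core/airrflow -profile test_nocluster,docker --outdir results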
4 changes: 3 additions & 1 deletion modules/local/parse_logs.nf
@@ -25,15 +25,17 @@ process PARSE_LOGS {

output:
path "Table_sequences_process.tsv", emit: logs
path "Table*.tsv", emit:tables

script:
if (params.umi_length == 0) {
"""
log_parsing_no-umi.py
"""
} else {
def clustersets = params.cluster_sets ? "--cluster_sets" : ""
"""
log_parsing.py
log_parsing.py $clustersets
"""
}
}
5 changes: 3 additions & 2 deletions modules/local/presto/presto_buildconsensus.nf
@@ -19,9 +19,10 @@ process PRESTO_BUILDCONSENSUS {


script:
def barcode_field = params.cluster_sets ? "CLUSTER" : "BARCODE"
"""
BuildConsensus.py -s $R1 --bf CLUSTER --nproc ${task.cpus} --pf PRIMER --prcons $params.primer_consensus --maxerror $params.buildconsensus_maxerror --maxgap $params.buildconsensus_maxgap --outname ${meta.id}_R1 --log ${meta.id}_R1.log > "${meta.id}_command_log.txt"
BuildConsensus.py -s $R2 --bf CLUSTER --nproc ${task.cpus} --pf PRIMER --prcons $params.primer_consensus --maxerror $params.buildconsensus_maxerror --maxgap $params.buildconsensus_maxgap --outname ${meta.id}_R2 --log ${meta.id}_R2.log >> "${meta.id}_command_log.txt"
BuildConsensus.py -s $R1 --bf ${barcode_field} --nproc ${task.cpus} --pf PRIMER --prcons $params.primer_consensus --maxerror $params.buildconsensus_maxerror --maxgap $params.buildconsensus_maxgap --outname ${meta.id}_R1 --log ${meta.id}_R1.log > "${meta.id}_command_log.txt"
BuildConsensus.py -s $R2 --bf ${barcode_field} --nproc ${task.cpus} --pf PRIMER --prcons $params.primer_consensus --maxerror $params.buildconsensus_maxerror --maxgap $params.buildconsensus_maxgap --outname ${meta.id}_R2 --log ${meta.id}_R2.log >> "${meta.id}_command_log.txt"
ParseLog.py -l "${meta.id}_R1.log" "${meta.id}_R2.log" -f ID BARCODE SEQCOUNT PRIMER PRCOUNT PRCONS PRFREQ CONSCOUNT
"""
}
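With this change, the only difference between the two modes is the barcode field that BuildConsensus.py groups on: the cluster ID annotated after the ClusterSets step, or the raw UMI barcode. A sketch of the rendered R1 command, with the sample name as a placeholder and the remaining options elided:

# params.cluster_sets = true
BuildConsensus.py -s sample_R1.fastq --bf CLUSTER ...

# params.cluster_sets = false
BuildConsensus.py -s sample_R1.fastq --bf BARCODE ...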
2 changes: 2 additions & 0 deletions nextflow.config
@@ -49,6 +49,7 @@ params {
primer_consensus = 0.6
buildconsensus_maxerror = 0.1
buildconsensus_maxgap = 0.5
cluster_sets = true

// Clustering parameters
set_cluster_threshold = false
@@ -182,6 +183,7 @@
test_no_umi { includeConfig 'conf/test_no-umi.config' }
test_reveal { includeConfig 'conf/test_reveal.config' }
test_reveal_no_cc { includeConfig 'conf/test_reveal_no_cc.config' }
test_nocluster { includeConfig 'conf/test_nocluster.config' }
}

// Load igenomes.config if required
6 changes: 6 additions & 0 deletions nextflow_schema.json
@@ -215,6 +215,12 @@
"default": 0.5,
"fa_icon": "fas fa-align-center",
"description": "Maximum gap for building the sequence consensus in the Presto BuildConsensus step."
},
"cluster_sets": {
"type": "boolean",
"default": true,
"fa_icon": "fas fa-layer-group",
"description": "Cluster sequences by similarity regardless of any annotation with Presto ClusterSets and annotate the cluster ID additionally to the UMI barcode."
}
},
"fa_icon": "fas fa-align-center"
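Because the schema declares cluster_sets as a boolean defaulting to true, clustering can also be switched off per run on the command line, without a custom config; for example, keeping the usual placeholders:

nextflow run nf-core/airrflow -profile <docker/singularity> --cluster_sets false --outdir <OUTDIR>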
31 changes: 20 additions & 11 deletions subworkflows/local/presto_umi.nf
@@ -58,20 +58,29 @@ workflow PRESTO_UMI {
PRESTO_MASKPRIMERS_UMI.out.reads
)

// Cluster sequences by similarity
PRESTO_CLUSTERSETS_UMI (
PRESTO_PAIRSEQ_UMI.out.reads
)
ch_versions = ch_versions.mix(PRESTO_CLUSTERSETS_UMI.out.versions.ifEmpty(null))
if (params.cluster_sets) {

// Annotate cluster into barcode field
PRESTO_PARSE_CLUSTER_UMI (
PRESTO_CLUSTERSETS_UMI.out.reads
)
// Cluster sequences by similarity
PRESTO_CLUSTERSETS_UMI (
PRESTO_PAIRSEQ_UMI.out.reads
)
ch_versions = ch_versions.mix(PRESTO_CLUSTERSETS_UMI.out.versions.ifEmpty(null))

// Annotate cluster into barcode field
PRESTO_PARSE_CLUSTER_UMI (
PRESTO_CLUSTERSETS_UMI.out.reads
)
ch_for_buildconsensus = PRESTO_PARSE_CLUSTER_UMI.out.reads
ch_clustersets_logs = PRESTO_CLUSTERSETS_UMI.out.logs.collect()

} else {
ch_for_buildconsensus = PRESTO_PAIRSEQ_UMI.out.reads
ch_clustersets_logs = Channel.empty()
}

// Build consensus of sequences with same UMI barcode
PRESTO_BUILDCONSENSUS_UMI (
PRESTO_PARSE_CLUSTER_UMI.out.reads
ch_for_buildconsensus
)

// Post-consensus pair
@@ -121,7 +130,7 @@
presto_filterseq_logs = PRESTO_FILTERSEQ_UMI.out.logs
presto_maskprimers_logs = PRESTO_MASKPRIMERS_UMI.out.logs.collect()
presto_pairseq_logs = PRESTO_PAIRSEQ_UMI.out.logs.collect()
presto_clustersets_logs = PRESTO_CLUSTERSETS_UMI.out.logs.collect()
presto_clustersets_logs = ch_clustersets_logs
presto_buildconsensus_logs = PRESTO_BUILDCONSENSUS_UMI.out.logs.collect()
presto_postconsensus_pairseq_logs = PRESTO_POSTCONSENSUS_PAIRSEQ_UMI.out.logs.collect()
presto_assemblepairs_logs = PRESTO_ASSEMBLEPAIRS_UMI.out.logs.collect()