Merge pull request #161 from ggabernet/clustersets
add option to skip cluster sets
ggabernet authored Apr 4, 2022
2 parents 0ddf735 + 9b4007e commit 6d47a97
Showing 9 changed files with 128 additions and 49 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/ci.yml
@@ -57,7 +57,7 @@ jobs:
matrix:
# Nextflow versions: check pipeline minimum and current latest
nxf_ver: ["21.10.3", ""]
profile: ["test_tcr", "test_no_umi", "test_reveal", "test_tcr_thr"]
profile: ["test_tcr", "test_no_umi", "test_reveal", "test_tcr_thr", "test_nocluster"]
steps:
- name: Check out pipeline code
uses: actions/checkout@v2
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -15,6 +15,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
- [#143](https://github.com/nf-core/airrflow/pull/128) Template update to nf-core tools v2.2.
- [#150](https://github.com/nf-core/airrflow/pull/150) Added option to search for reverse primers.
- [#159](https://github.com/nf-core/airrflow/pull/159) Template update to nf-core tools v2.3.1, v2.3.1
+- [#161](https://github.com/nf-core/airrflow/pull/161) Add option to skip clustering sequences in the UMI workflow

### `Fixed`

90 changes: 56 additions & 34 deletions bin/log_parsing.py
@@ -5,19 +5,39 @@
import pandas as pd
import subprocess
import re
+import argparse


+parser = argparse.ArgumentParser(description='Parse logs to identify the number of sequences passing through every step.')
+parser.add_argument('-c','--cluster_sets',
+help='Including the cluster_sets process',
+action = 'store_true')
+args = parser.parse_args()

# Processes
-processes = [
-"filter_by_sequence_quality",
-"mask_primers",
-"pair_sequences",
-"cluster_sets",
-"build_consensus",
-"repair_mates",
-"assemble_pairs",
-"deduplicates",
-"igblast",
-]
+if args.cluster_sets:
+processes = [
+"filter_by_sequence_quality",
+"mask_primers",
+"pair_sequences",
+"build_consensus",
+"repair_mates",
+"assemble_pairs",
+"deduplicates",
+"igblast",
+"cluster_sets",
+]
+else:
+processes = [
+"filter_by_sequence_quality",
+"mask_primers",
+"pair_sequences",
+"build_consensus",
+"repair_mates",
+"assemble_pairs",
+"deduplicates",
+"igblast",
+]
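
Note: the two branches differ only in the trailing "cluster_sets" entry, which is placed last so the indices of the other processes stay stable whether or not clustering runs. An equivalent, non-duplicated construction (a sketch, not the committed code):

    # Sketch only: same result as the if/else above, without duplicating the list.
    processes = [
        "filter_by_sequence_quality",
        "mask_primers",
        "pair_sequences",
        "build_consensus",
        "repair_mates",
        "assemble_pairs",
        "deduplicates",
        "igblast",
    ]
    if args.cluster_sets:
        # Appended last so df_process_list indices 0-7 are identical in both modes.
        processes.append("cluster_sets")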

# Path of logs will be:
# process_name/sample_name_command_log.txt
@@ -76,7 +96,6 @@
for logfile in log_files:
c = 0
with open(logfile, "r") as f:
-# print(f.read())
for line in f:
if " START>" in line:
if c < 1:
@@ -127,7 +146,6 @@

for logfile in log_files:
with open(logfile, "r") as f:
-# print(f.read())
for line in f:
if " START>" in line:
s_code.append(logfile.split("/")[1].split("_command_log")[0])
@@ -162,7 +180,6 @@

for logfile in log_files:
with open(logfile, "r") as f:
-# print(f.read())
for line in f:
if " START>" in line:
s_code.append(logfile.split("/")[1].split("_command_log")[0])
@@ -327,6 +344,7 @@

df_process_list.append(df_process)

+# Getting table colnames

colnames = [
"Sample",
@@ -344,22 +362,24 @@
"Igblast",
]


values = [
-df_process_list[0].sort_values(by=["Sample"]).iloc[:, 0].tolist(),
-df_process_list[0].sort_values(by=["Sample"]).loc[:, "start_R1"].tolist(),
-df_process_list[0].sort_values(by=["Sample"]).loc[:, "start_R2"].tolist(),
-df_process_list[0].sort_values(by=["Sample"]).loc[:, "pass_R1"].tolist(),
-df_process_list[0].sort_values(by=["Sample"]).loc[:, "pass_R2"].tolist(),
-df_process_list[1].sort_values(by=["Sample"]).loc[:, "pass_R1"].tolist(),
-df_process_list[1].sort_values(by=["Sample"]).loc[:, "pass_R2"].tolist(),
-df_process_list[2].sort_values(by=["Sample"]).loc[:, "pass_pairs"].tolist(),
-df_process_list[5].sort_values(by=["Sample"]).loc[:, "pass_pairs"].tolist(),
-df_process_list[6].sort_values(by=["Sample"]).loc[:, "pass_pairs"].tolist(),
-df_process_list[7].sort_values(by=["Sample"]).loc[:, "unique"].tolist(),
-df_process_list[8].sort_values(by=["Sample"]).loc[:, "repres_2"].tolist(),
-df_process_list[8].sort_values(by=["Sample"]).loc[:, "pass_igblast"].tolist(),
+df_process_list[0].sort_values(by=["Sample"]).iloc[:, 0].tolist(),
+df_process_list[0].sort_values(by=["Sample"]).loc[:, "start_R1"].tolist(),
+df_process_list[0].sort_values(by=["Sample"]).loc[:, "start_R2"].tolist(),
+df_process_list[0].sort_values(by=["Sample"]).loc[:, "pass_R1"].tolist(),
+df_process_list[0].sort_values(by=["Sample"]).loc[:, "pass_R2"].tolist(),
+df_process_list[1].sort_values(by=["Sample"]).loc[:, "pass_R1"].tolist(),
+df_process_list[1].sort_values(by=["Sample"]).loc[:, "pass_R2"].tolist(),
+df_process_list[2].sort_values(by=["Sample"]).loc[:, "pass_pairs"].tolist(),
+df_process_list[4].sort_values(by=["Sample"]).loc[:, "pass_pairs"].tolist(),
+df_process_list[5].sort_values(by=["Sample"]).loc[:, "pass_pairs"].tolist(),
+df_process_list[6].sort_values(by=["Sample"]).loc[:, "unique"].tolist(),
+df_process_list[7].sort_values(by=["Sample"]).loc[:, "repres_2"].tolist(),
+df_process_list[7].sort_values(by=["Sample"]).loc[:, "pass_igblast"].tolist(),
]
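
Note: the hard-coded indices above shift by one because "cluster_sets" is no longer the fourth process. A reference mapping derived from the processes lists (for orientation only, not part of the commit):

    # Positions in df_process_list after this change:
    PROCESS_INDEX = {
        "filter_by_sequence_quality": 0,
        "mask_primers": 1,
        "pair_sequences": 2,
        "build_consensus": 3,
        "repair_mates": 4,
        "assemble_pairs": 5,
        "deduplicates": 6,
        "igblast": 7,
        # index 8 ("cluster_sets") exists only when --cluster_sets is passed
    }

For example, the repaired-mates pair counts move from index 5 to index 4, and the "repres_2" and "pass_igblast" columns now both come from index 7.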


# Tables provide extra info and help debugging
df_process_list[0].to_csv(
path_or_buf="Table_all_details_filter_quality.tsv",
@@ -374,30 +394,32 @@
path_or_buf="Table_all_details_paired.tsv", sep="\t", header=True, index=False
)
df_process_list[3].to_csv(
path_or_buf="Table_all_details_cluster_sets.tsv", sep="\t", header=True, index=False
)
df_process_list[4].to_csv(
path_or_buf="Table_all_details_build_consensus.tsv",
sep="\t",
header=True,
index=False,
)
-df_process_list[5].to_csv(
+df_process_list[4].to_csv(
path_or_buf="Table_all_details_repaired.tsv", sep="\t", header=True, index=False
)
-df_process_list[6].to_csv(
+df_process_list[5].to_csv(
path_or_buf="Table_all_details_assemble_mates.tsv",
sep="\t",
header=True,
index=False,
)
-df_process_list[7].to_csv(
+df_process_list[6].to_csv(
path_or_buf="Table_all_details_deduplicate.tsv", sep="\t", header=True, index=False
)
-df_process_list[8].to_csv(
+df_process_list[7].to_csv(
path_or_buf="Table_all_details_igblast.tsv", sep="\t", header=True, index=False
)

+if args.cluster_sets:
+df_process_list[8].to_csv(
+path_or_buf="Table_all_details_cluster_sets.tsv", sep="\t", header=True, index=False
+)
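
Note: this guard is required, not cosmetic: when the flag is absent, df_process_list holds only eight entries (indices 0-7), so an unconditional df_process_list[8] here would raise an IndexError.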

final_table = dict(zip(colnames, values))
print(final_table)
df_final_table = pd.DataFrame.from_dict(final_table)
36 changes: 36 additions & 0 deletions conf/test_nocluster.config
@@ -0,0 +1,36 @@
/*
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Nextflow config file for running minimal tests
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Defines input files and everything required to run a fast and simple pipeline test.
Use as follows:
nextflow run nf-core/airrflow -profile test_nocluster,<docker/singularity> --outdir <OUTDIR>
----------------------------------------------------------------------------------------
*/

params {
config_profile_name = 'Test profile'
config_profile_description = 'Test pipeline without the cluster sets process'

// Limit resources so that this can run on GitHub Actions
max_cpus = 2
max_memory = '6.GB'
max_time = '6.h'

// Input data
input = 'https://raw.githubusercontent.com/nf-core/test-datasets/airrflow/testdata-bcr/Metadata_test.tsv'
cprimers = 'https://raw.githubusercontent.com/nf-core/test-datasets/airrflow/testdata-bcr/C_primers.fasta'
vprimers = 'https://raw.githubusercontent.com/nf-core/test-datasets/airrflow/testdata-bcr/V_primers.fasta'

subworkflow = 'bcellmagic'

library_generation_method = 'specific_pcr_umi'
cprimer_position = 'R1'
index_file = true
umi_length = 8
umi_start = 6
umi_position = 'R1'
cluster_sets = false
}
4 changes: 3 additions & 1 deletion modules/local/parse_logs.nf
@@ -25,15 +25,17 @@ process PARSE_LOGS {

output:
path "Table_sequences_process.tsv", emit: logs
path "Table*.tsv", emit:tables

script:
if (params.umi_length == 0) {
"""
log_parsing_no-umi.py
"""
} else {
+def clustersets = params.cluster_sets? "--cluster_sets":""
"""
-log_parsing.py
+log_parsing.py $clustersets
"""
}
}
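
Note: params.cluster_sets ? "--cluster_sets" : "" is a Groovy ternary, so the module interpolates either the flag or an empty string into the script block. A Python sketch of the resulting command resolution (illustrative only, not pipeline code):

    def render_command(cluster_sets: bool) -> str:
        # Mirrors the Groovy ternary used in PARSE_LOGS above.
        flag = "--cluster_sets" if cluster_sets else ""
        return f"log_parsing.py {flag}".strip()

    print(render_command(True))   # -> log_parsing.py --cluster_sets
    print(render_command(False))  # -> log_parsing.py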
5 changes: 3 additions & 2 deletions modules/local/presto/presto_buildconsensus.nf
@@ -19,9 +19,10 @@ process PRESTO_BUILDCONSENSUS {


script:
+def barcode_field = params.cluster_sets ? "CLUSTER" : "BARCODE"
"""
-BuildConsensus.py -s $R1 --bf CLUSTER --nproc ${task.cpus} --pf PRIMER --prcons $params.primer_consensus --maxerror $params.buildconsensus_maxerror --maxgap $params.buildconsensus_maxgap --outname ${meta.id}_R1 --log ${meta.id}_R1.log > "${meta.id}_command_log.txt"
-BuildConsensus.py -s $R2 --bf CLUSTER --nproc ${task.cpus} --pf PRIMER --prcons $params.primer_consensus --maxerror $params.buildconsensus_maxerror --maxgap $params.buildconsensus_maxgap --outname ${meta.id}_R2 --log ${meta.id}_R2.log >> "${meta.id}_command_log.txt"
+BuildConsensus.py -s $R1 --bf ${barcode_field} --nproc ${task.cpus} --pf PRIMER --prcons $params.primer_consensus --maxerror $params.buildconsensus_maxerror --maxgap $params.buildconsensus_maxgap --outname ${meta.id}_R1 --log ${meta.id}_R1.log > "${meta.id}_command_log.txt"
+BuildConsensus.py -s $R2 --bf ${barcode_field} --nproc ${task.cpus} --pf PRIMER --prcons $params.primer_consensus --maxerror $params.buildconsensus_maxerror --maxgap $params.buildconsensus_maxgap --outname ${meta.id}_R2 --log ${meta.id}_R2.log >> "${meta.id}_command_log.txt"
ParseLog.py -l "${meta.id}_R1.log" "${meta.id}_R2.log" -f ID BARCODE SEQCOUNT PRIMER PRCOUNT PRCONS PRFREQ CONSCOUNT
"""
}
2 changes: 2 additions & 0 deletions nextflow.config
@@ -49,6 +49,7 @@ params {
primer_consensus = 0.6
buildconsensus_maxerror = 0.1
buildconsensus_maxgap = 0.5
+cluster_sets = true

// Clustering parameters
set_cluster_threshold = false
@@ -182,6 +183,7 @@ profiles {
test_no_umi { includeConfig 'conf/test_no-umi.config' }
test_reveal { includeConfig 'conf/test_reveal.config' }
test_reveal_no_cc { includeConfig 'conf/test_reveal_no_cc.config' }
+test_nocluster { includeConfig 'conf/test_nocluster.config' }
}

// Load igenomes.config if required
6 changes: 6 additions & 0 deletions nextflow_schema.json
@@ -215,6 +215,12 @@
"default": 0.5,
"fa_icon": "fas fa-align-center",
"description": "Maximum gap for building the sequence consensus in the Presto BuildConsensus step."
+},
+"cluster_sets": {
+"type": "boolean",
+"default": true,
+"fa_icon": "fas fa-layer-group",
+"description": "Cluster sequences by similarity regardless of any annotation with Presto ClusterSets and annotate the cluster ID additionally to the UMI barcode."
}
},
"fa_icon": "fas fa-align-center"
31 changes: 20 additions & 11 deletions subworkflows/local/presto_umi.nf
@@ -58,20 +58,29 @@ workflow PRESTO_UMI {
PRESTO_MASKPRIMERS_UMI.out.reads
)

-// Cluster sequences by similarity
-PRESTO_CLUSTERSETS_UMI (
-PRESTO_PAIRSEQ_UMI.out.reads
-)
-ch_versions = ch_versions.mix(PRESTO_CLUSTERSETS_UMI.out.versions.ifEmpty(null))
+if (params.cluster_sets) {

-// Annotate cluster into barcode field
-PRESTO_PARSE_CLUSTER_UMI (
-PRESTO_CLUSTERSETS_UMI.out.reads
-)
+// Cluster sequences by similarity
+PRESTO_CLUSTERSETS_UMI (
+PRESTO_PAIRSEQ_UMI.out.reads
+)
+ch_versions = ch_versions.mix(PRESTO_CLUSTERSETS_UMI.out.versions.ifEmpty(null))
+
+// Annotate cluster into barcode field
+PRESTO_PARSE_CLUSTER_UMI (
+PRESTO_CLUSTERSETS_UMI.out.reads
+)
+ch_for_buildconsensus = PRESTO_PARSE_CLUSTER_UMI.out.reads
+ch_clustersets_logs = PRESTO_CLUSTERSETS_UMI.out.logs.collect()

+} else {
+ch_for_buildconsensus = PRESTO_PAIRSEQ_UMI.out.reads
+ch_clustersets_logs = Channel.empty()
+}

// Build consensus of sequences with same UMI barcode
PRESTO_BUILDCONSENSUS_UMI (
-PRESTO_PARSE_CLUSTER_UMI.out.reads
+ch_for_buildconsensus
)

// Post-consensus pair
@@ -121,7 +130,7 @@
presto_filterseq_logs = PRESTO_FILTERSEQ_UMI.out.logs
presto_maskprimers_logs = PRESTO_MASKPRIMERS_UMI.out.logs.collect()
presto_pairseq_logs = PRESTO_PAIRSEQ_UMI.out.logs.collect()
-presto_clustersets_logs = PRESTO_CLUSTERSETS_UMI.out.logs.collect()
+presto_clustersets_logs = ch_clustersets_logs
presto_buildconsensus_logs = PRESTO_BUILDCONSENSUS_UMI.out.logs.collect()
presto_postconsensus_pairseq_logs = PRESTO_POSTCONSENSUS_PAIRSEQ_UMI.out.logs.collect()
presto_assemblepairs_logs = PRESTO_ASSEMBLEPAIRS_UMI.out.logs.collect()
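
Note: this subworkflow change adds a bypass around the two clustering processes. With cluster_sets enabled, reads pass through PRESTO_CLUSTERSETS_UMI and PRESTO_PARSE_CLUSTER_UMI before consensus building; otherwise the paired reads feed BuildConsensus directly and the cluster-set log channel stays empty. A Python-flavoured sketch of that control flow (the function and variable names are stand-ins, not a real API):

    def presto_umi_flow(pairseq_reads, cluster_sets=True):
        # Stand-ins for PRESTO_CLUSTERSETS_UMI / PRESTO_PARSE_CLUSTER_UMI.
        if cluster_sets:
            clustered = ("clustersets", pairseq_reads)
            consensus_input = ("parse_cluster", clustered)
            clustersets_logs = ["clustersets.log"]
        else:
            consensus_input = pairseq_reads  # straight to BuildConsensus
            clustersets_logs = []            # stands in for Channel.empty()
        return consensus_input, clustersets_logs

    print(presto_umi_flow("reads", cluster_sets=False))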