Merge pull request #161 from ggabernet/clustersets
add option to skip cluster sets
ggabernet authored Apr 4, 2022
2 parents 0ddf735 + 9b4007e commit 6d47a97
Showing 9 changed files with 128 additions and 49 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/ci.yml
@@ -57,7 +57,7 @@ jobs:
matrix:
# Nextflow versions: check pipeline minimum and current latest
nxf_ver: ["21.10.3", ""]
profile: ["test_tcr", "test_no_umi", "test_reveal", "test_tcr_thr"]
profile: ["test_tcr", "test_no_umi", "test_reveal", "test_tcr_thr", "test_nocluster"]
steps:
- name: Check out pipeline code
uses: actions/checkout@v2
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -15,6 +15,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
- [#143](https://github.com/nf-core/airrflow/pull/128) Template update to nf-core tools v2.2.
- [#150](https://github.com/nf-core/airrflow/pull/150) Added option to search for reverse primers.
- [#159](https://github.com/nf-core/airrflow/pull/159) Template update to nf-core tools v2.3.1, v2.3.1
+- [#161](https://github.com/nf-core/airrflow/pull/161) Add option to skip clustering sequences in the UMI workflow

### `Fixed`

90 changes: 56 additions & 34 deletions bin/log_parsing.py
@@ -5,19 +5,39 @@
import pandas as pd
import subprocess
import re
+import argparse


+parser = argparse.ArgumentParser(description='Parse logs to identify the number of sequences passing through every step.')
+parser.add_argument('-c','--cluster_sets',
+help='Including the cluster_sets process',
+action = 'store_true')
+args = parser.parse_args()

# Processes
-processes = [
-"filter_by_sequence_quality",
-"mask_primers",
-"pair_sequences",
-"cluster_sets",
-"build_consensus",
-"repair_mates",
-"assemble_pairs",
-"deduplicates",
-"igblast",
-]
+if args.cluster_sets:
+processes = [
+"filter_by_sequence_quality",
+"mask_primers",
+"pair_sequences",
+"build_consensus",
+"repair_mates",
+"assemble_pairs",
+"deduplicates",
+"igblast",
+"cluster_sets",
+]
+else:
+processes = [
+"filter_by_sequence_quality",
+"mask_primers",
+"pair_sequences",
+"build_consensus",
+"repair_mates",
+"assemble_pairs",
+"deduplicates",
+"igblast",
+]
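
Note: the two branches differ only in the trailing "cluster_sets" entry, which is placed last so the indices of the other processes stay stable whether or not clustering runs. An equivalent, non-duplicated construction (a sketch, not the committed code):

    # Sketch only: same result as the if/else above, without duplicating the list.
    processes = [
        "filter_by_sequence_quality",
        "mask_primers",
        "pair_sequences",
        "build_consensus",
        "repair_mates",
        "assemble_pairs",
        "deduplicates",
        "igblast",
    ]
    if args.cluster_sets:
        # Appended last so df_process_list indices 0-7 are identical in both modes.
        processes.append("cluster_sets")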

# Path of logs will be:
# process_name/sample_name_command_log.txt
@@ -76,7 +96,6 @@
for logfile in log_files:
c = 0
with open(logfile, "r") as f:
-# print(f.read())
for line in f:
if " START>" in line:
if c < 1:
@@ -127,7 +146,6 @@

for logfile in log_files:
with open(logfile, "r") as f:
-# print(f.read())
for line in f:
if " START>" in line:
s_code.append(logfile.split("/")[1].split("_command_log")[0])
@@ -162,7 +180,6 @@

for logfile in log_files:
with open(logfile, "r") as f:
-# print(f.read())
for line in f:
if " START>" in line:
s_code.append(logfile.split("/")[1].split("_command_log")[0])
@@ -327,6 +344,7 @@

df_process_list.append(df_process)

+# Getting table colnames

colnames = [
"Sample",
@@ -344,22 +362,24 @@
"Igblast",
]


values = [
-df_process_list[0].sort_values(by=["Sample"]).iloc[:, 0].tolist(),
-df_process_list[0].sort_values(by=["Sample"]).loc[:, "start_R1"].tolist(),
-df_process_list[0].sort_values(by=["Sample"]).loc[:, "start_R2"].tolist(),
-df_process_list[0].sort_values(by=["Sample"]).loc[:, "pass_R1"].tolist(),
-df_process_list[0].sort_values(by=["Sample"]).loc[:, "pass_R2"].tolist(),
-df_process_list[1].sort_values(by=["Sample"]).loc[:, "pass_R1"].tolist(),
-df_process_list[1].sort_values(by=["Sample"]).loc[:, "pass_R2"].tolist(),
-df_process_list[2].sort_values(by=["Sample"]).loc[:, "pass_pairs"].tolist(),
-df_process_list[5].sort_values(by=["Sample"]).loc[:, "pass_pairs"].tolist(),
-df_process_list[6].sort_values(by=["Sample"]).loc[:, "pass_pairs"].tolist(),
-df_process_list[7].sort_values(by=["Sample"]).loc[:, "unique"].tolist(),
-df_process_list[8].sort_values(by=["Sample"]).loc[:, "repres_2"].tolist(),
-df_process_list[8].sort_values(by=["Sample"]).loc[:, "pass_igblast"].tolist(),
+df_process_list[0].sort_values(by=["Sample"]).iloc[:, 0].tolist(),
+df_process_list[0].sort_values(by=["Sample"]).loc[:, "start_R1"].tolist(),
+df_process_list[0].sort_values(by=["Sample"]).loc[:, "start_R2"].tolist(),
+df_process_list[0].sort_values(by=["Sample"]).loc[:, "pass_R1"].tolist(),
+df_process_list[0].sort_values(by=["Sample"]).loc[:, "pass_R2"].tolist(),
+df_process_list[1].sort_values(by=["Sample"]).loc[:, "pass_R1"].tolist(),
+df_process_list[1].sort_values(by=["Sample"]).loc[:, "pass_R2"].tolist(),
+df_process_list[2].sort_values(by=["Sample"]).loc[:, "pass_pairs"].tolist(),
+df_process_list[4].sort_values(by=["Sample"]).loc[:, "pass_pairs"].tolist(),
+df_process_list[5].sort_values(by=["Sample"]).loc[:, "pass_pairs"].tolist(),
+df_process_list[6].sort_values(by=["Sample"]).loc[:, "unique"].tolist(),
+df_process_list[7].sort_values(by=["Sample"]).loc[:, "repres_2"].tolist(),
+df_process_list[7].sort_values(by=["Sample"]).loc[:, "pass_igblast"].tolist(),
]
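
Note: the hard-coded indices above shift by one because "cluster_sets" is no longer the fourth process. A reference mapping derived from the processes lists (for orientation only, not part of the commit):

    # Positions in df_process_list after this change:
    PROCESS_INDEX = {
        "filter_by_sequence_quality": 0,
        "mask_primers": 1,
        "pair_sequences": 2,
        "build_consensus": 3,
        "repair_mates": 4,
        "assemble_pairs": 5,
        "deduplicates": 6,
        "igblast": 7,
        # index 8 ("cluster_sets") exists only when --cluster_sets is passed
    }

For example, the repaired-mates pair counts move from index 5 to index 4, and the "repres_2" and "pass_igblast" columns now both come from index 7.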


# Tables provide extra info and help debugging
df_process_list[0].to_csv(
path_or_buf="Table_all_details_filter_quality.tsv",
@@ -374,30 +394,32 @@
path_or_buf="Table_all_details_paired.tsv", sep="\t", header=True, index=False
)
df_process_list[3].to_csv(
path_or_buf="Table_all_details_cluster_sets.tsv", sep="\t", header=True, index=False
)
df_process_list[4].to_csv(
path_or_buf="Table_all_details_build_consensus.tsv",
sep="\t",
header=True,
index=False,
)
-df_process_list[5].to_csv(
+df_process_list[4].to_csv(
path_or_buf="Table_all_details_repaired.tsv", sep="\t", header=True, index=False
)
-df_process_list[6].to_csv(
+df_process_list[5].to_csv(
path_or_buf="Table_all_details_assemble_mates.tsv",
sep="\t",
header=True,
index=False,
)
-df_process_list[7].to_csv(
+df_process_list[6].to_csv(
path_or_buf="Table_all_details_deduplicate.tsv", sep="\t", header=True, index=False
)
-df_process_list[8].to_csv(
+df_process_list[7].to_csv(
path_or_buf="Table_all_details_igblast.tsv", sep="\t", header=True, index=False
)

+if args.cluster_sets:
+df_process_list[8].to_csv(
+path_or_buf="Table_all_details_cluster_sets.tsv", sep="\t", header=True, index=False
+)
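
Note: this guard is required, not cosmetic: when the flag is absent, df_process_list holds only eight entries (indices 0-7), so an unconditional df_process_list[8] here would raise an IndexError.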

final_table = dict(zip(colnames, values))
print(final_table)
df_final_table = pd.DataFrame.from_dict(final_table)
36 changes: 36 additions & 0 deletions conf/test_nocluster.config
@@ -0,0 +1,36 @@
/*
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Nextflow config file for running minimal tests
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Defines input files and everything required to run a fast and simple pipeline test.
Use as follows:
nextflow run nf-core/airrflow -profile test_nocluster,<docker/singularity> --outdir <OUTDIR>
----------------------------------------------------------------------------------------
*/

params {
config_profile_name = 'Test profile'
config_profile_description = 'Test pipeline without the cluster sets process'

// Limit resources so that this can run on GitHub Actions
max_cpus = 2
max_memory = '6.GB'
max_time = '6.h'

// Input data
input = 'https://raw.githubusercontent.com/nf-core/test-datasets/airrflow/testdata-bcr/Metadata_test.tsv'
cprimers = 'https://raw.githubusercontent.com/nf-core/test-datasets/airrflow/testdata-bcr/C_primers.fasta'
vprimers = 'https://raw.githubusercontent.com/nf-core/test-datasets/airrflow/testdata-bcr/V_primers.fasta'

subworkflow = 'bcellmagic'

library_generation_method = 'specific_pcr_umi'
cprimer_position = 'R1'
index_file = true
umi_length = 8
umi_start = 6
umi_position = 'R1'
cluster_sets = false
}
4 changes: 3 additions & 1 deletion modules/local/parse_logs.nf
@@ -25,15 +25,17 @@ process PARSE_LOGS {

output:
path "Table_sequences_process.tsv", emit: logs
path "Table*.tsv", emit:tables

script:
if (params.umi_length == 0) {
"""
log_parsing_no-umi.py
"""
} else {
+def clustersets = params.cluster_sets? "--cluster_sets":""
"""
-log_parsing.py
+log_parsing.py $clustersets
"""
}
}
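
Note: params.cluster_sets ? "--cluster_sets" : "" is a Groovy ternary, so the module interpolates either the flag or an empty string into the script block. A Python sketch of the resulting command resolution (illustrative only, not pipeline code):

    def render_command(cluster_sets: bool) -> str:
        # Mirrors the Groovy ternary used in PARSE_LOGS above.
        flag = "--cluster_sets" if cluster_sets else ""
        return f"log_parsing.py {flag}".strip()

    print(render_command(True))   # -> log_parsing.py --cluster_sets
    print(render_command(False))  # -> log_parsing.py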
5 changes: 3 additions & 2 deletions modules/local/presto/presto_buildconsensus.nf
@@ -19,9 +19,10 @@ process PRESTO_BUILDCONSENSUS {


script:
+def barcode_field = params.cluster_sets ? "CLUSTER" : "BARCODE"
"""
-BuildConsensus.py -s $R1 --bf CLUSTER --nproc ${task.cpus} --pf PRIMER --prcons $params.primer_consensus --maxerror $params.buildconsensus_maxerror --maxgap $params.buildconsensus_maxgap --outname ${meta.id}_R1 --log ${meta.id}_R1.log > "${meta.id}_command_log.txt"
-BuildConsensus.py -s $R2 --bf CLUSTER --nproc ${task.cpus} --pf PRIMER --prcons $params.primer_consensus --maxerror $params.buildconsensus_maxerror --maxgap $params.buildconsensus_maxgap --outname ${meta.id}_R2 --log ${meta.id}_R2.log >> "${meta.id}_command_log.txt"
+BuildConsensus.py -s $R1 --bf ${barcode_field} --nproc ${task.cpus} --pf PRIMER --prcons $params.primer_consensus --maxerror $params.buildconsensus_maxerror --maxgap $params.buildconsensus_maxgap --outname ${meta.id}_R1 --log ${meta.id}_R1.log > "${meta.id}_command_log.txt"
+BuildConsensus.py -s $R2 --bf ${barcode_field} --nproc ${task.cpus} --pf PRIMER --prcons $params.primer_consensus --maxerror $params.buildconsensus_maxerror --maxgap $params.buildconsensus_maxgap --outname ${meta.id}_R2 --log ${meta.id}_R2.log >> "${meta.id}_command_log.txt"
ParseLog.py -l "${meta.id}_R1.log" "${meta.id}_R2.log" -f ID BARCODE SEQCOUNT PRIMER PRCOUNT PRCONS PRFREQ CONSCOUNT
"""
}
2 changes: 2 additions & 0 deletions nextflow.config
@@ -49,6 +49,7 @@ params {
primer_consensus = 0.6
buildconsensus_maxerror = 0.1
buildconsensus_maxgap = 0.5
+cluster_sets = true

// Clustering parameters
set_cluster_threshold = false
@@ -182,6 +183,7 @@ profiles {
test_no_umi { includeConfig 'conf/test_no-umi.config' }
test_reveal { includeConfig 'conf/test_reveal.config' }
test_reveal_no_cc { includeConfig 'conf/test_reveal_no_cc.config' }
+test_nocluster { includeConfig 'conf/test_nocluster.config' }
}

// Load igenomes.config if required
6 changes: 6 additions & 0 deletions nextflow_schema.json
@@ -215,6 +215,12 @@
"default": 0.5,
"fa_icon": "fas fa-align-center",
"description": "Maximum gap for building the sequence consensus in the Presto BuildConsensus step."
+},
+"cluster_sets": {
+"type": "boolean",
+"default": true,
+"fa_icon": "fas fa-layer-group",
+"description": "Cluster sequences by similarity regardless of any annotation with Presto ClusterSets and annotate the cluster ID additionally to the UMI barcode."
}
},
"fa_icon": "fas fa-align-center"
31 changes: 20 additions & 11 deletions subworkflows/local/presto_umi.nf
@@ -58,20 +58,29 @@ workflow PRESTO_UMI {
PRESTO_MASKPRIMERS_UMI.out.reads
)

-// Cluster sequences by similarity
-PRESTO_CLUSTERSETS_UMI (
-PRESTO_PAIRSEQ_UMI.out.reads
-)
-ch_versions = ch_versions.mix(PRESTO_CLUSTERSETS_UMI.out.versions.ifEmpty(null))
+if (params.cluster_sets) {

-// Annotate cluster into barcode field
-PRESTO_PARSE_CLUSTER_UMI (
-PRESTO_CLUSTERSETS_UMI.out.reads
-)
+// Cluster sequences by similarity
+PRESTO_CLUSTERSETS_UMI (
+PRESTO_PAIRSEQ_UMI.out.reads
+)
+ch_versions = ch_versions.mix(PRESTO_CLUSTERSETS_UMI.out.versions.ifEmpty(null))
+
+// Annotate cluster into barcode field
+PRESTO_PARSE_CLUSTER_UMI (
+PRESTO_CLUSTERSETS_UMI.out.reads
+)
+ch_for_buildconsensus = PRESTO_PARSE_CLUSTER_UMI.out.reads
+ch_clustersets_logs = PRESTO_CLUSTERSETS_UMI.out.logs.collect()

+} else {
+ch_for_buildconsensus = PRESTO_PAIRSEQ_UMI.out.reads
+ch_clustersets_logs = Channel.empty()
+}

// Build consensus of sequences with same UMI barcode
PRESTO_BUILDCONSENSUS_UMI (
-PRESTO_PARSE_CLUSTER_UMI.out.reads
+ch_for_buildconsensus
)

// Post-consensus pair
@@ -121,7 +130,7 @@
presto_filterseq_logs = PRESTO_FILTERSEQ_UMI.out.logs
presto_maskprimers_logs = PRESTO_MASKPRIMERS_UMI.out.logs.collect()
presto_pairseq_logs = PRESTO_PAIRSEQ_UMI.out.logs.collect()
-presto_clustersets_logs = PRESTO_CLUSTERSETS_UMI.out.logs.collect()
+presto_clustersets_logs = ch_clustersets_logs
presto_buildconsensus_logs = PRESTO_BUILDCONSENSUS_UMI.out.logs.collect()
presto_postconsensus_pairseq_logs = PRESTO_POSTCONSENSUS_PAIRSEQ_UMI.out.logs.collect()
presto_assemblepairs_logs = PRESTO_ASSEMBLEPAIRS_UMI.out.logs.collect()
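
Note: this subworkflow change adds a bypass around the two clustering processes. With cluster_sets enabled, reads pass through PRESTO_CLUSTERSETS_UMI and PRESTO_PARSE_CLUSTER_UMI before consensus building; otherwise the paired reads feed BuildConsensus directly and the cluster-set log channel stays empty. A Python-flavoured sketch of that control flow (the function and variable names are stand-ins, not a real API):

    def presto_umi_flow(pairseq_reads, cluster_sets=True):
        # Stand-ins for PRESTO_CLUSTERSETS_UMI / PRESTO_PARSE_CLUSTER_UMI.
        if cluster_sets:
            clustered = ("clustersets", pairseq_reads)
            consensus_input = ("parse_cluster", clustered)
            clustersets_logs = ["clustersets.log"]
        else:
            consensus_input = pairseq_reads  # straight to BuildConsensus
            clustersets_logs = []            # stands in for Channel.empty()
        return consensus_input, clustersets_logs

    print(presto_umi_flow("reads", cluster_sets=False))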