Refactor reference channels

genomic-medicine-sweden · Oct 29, 2024 · 036da2c · 036da2c
1 parent 79b836b
commit 036da2c
Show file tree

Hide file tree

Showing 18 changed files with 117 additions and 116 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -64,6 +64,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - [#435](https://github.com/genomic-medicine-sweden/nallo/pull/435) - Updated and refactored processes and workflows related to variant ranking
 - [#438](https://github.com/genomic-medicine-sweden/nallo/pull/438) - Updated pipeline tests to use functions in nft-utils instead of checking hardcoded paths
 - [#440](https://github.com/genomic-medicine-sweden/nallo/pull/440) - Updated hifiasm to 0.20 with new default parameters for telomeres and scaffolding ([#295](https://github.com/genomic-medicine-sweden/nallo/issues/295))
+- [#443](https://github.com/genomic-medicine-sweden/nallo/pull/443) - Refactored reference channel assignments
+- [#443](https://github.com/genomic-medicine-sweden/nallo/pull/443) - Updated schemas for `vep_plugin_files` and `snp_db`
 
 ### `Removed`
 

diff --git a/assets/schema_snpdb.json → assets/schema_snp_db.json b/assets/schema_snpdb.json → assets/schema_snp_db.json
@@ -1,23 +1,22 @@
 {
     "$schema": "https://json-schema.org/draft/2020-12/schema",
-    "$id": "https://raw.githubusercontent.com/genomic-medicine-sweden/nallo/master/assets/schema_gvcfs.json",
-    "title": "genomic-medicine-sweden/nallo pipeline - params.extra_gvcfs schema",
-    "description": "Schema for the file provided with params.extra_gvcfs",
+    "$id": "https://raw.githubusercontent.com/genomic-medicine-sweden/nallo/master/assets/schema_snp_db.json",
+    "title": "genomic-medicine-sweden/nallo pipeline - params.snp_db schema",
+    "description": "Schema for the file provided with params.snp_db",
     "type": "array",
     "items": {
         "type": "object",
         "properties": {
             "sample": {
                 "type": "string",
                 "pattern": "^\\S+$",
-                "errorMessage": "Sample name must be provided and cannot contain spaces",
-                "meta": ["id"]
+                "errorMessage": "Sample must be provided and cannot contain spaces."
             },
             "file": {
                 "format": "file-path",
                 "type": "string",
                 "pattern": "^\\S+\\.zip$",
-                "errorMessage": "gVCF file must be provided, cannot contain spaces and must have extension 'g.vcf.gz' or 'gvcf.gz'"
+                "errorMessage": "Echtvar database must be provided, cannot contain spaces and must have extension '.zip'"
             }
         },
         "required": ["sample", "file"]

diff --git a/assets/schema_vep_plugin_files.json b/assets/schema_vep_plugin_files.json
@@ -0,0 +1,20 @@
+{
+    "$schema": "https://json-schema.org/draft/2020-12/schema",
+    "$id": "https://raw.githubusercontent.com/genomic-medicine-sweden/nallo/master/assets/schema_vep_plugin_files.json",
+    "title": "genomic-medicine-sweden/nallo pipeline - params.vep_plugin_files schema",
+    "description": "Schema for the file provided with params.vep_plugin_files",
+    "type": "array",
+    "items": {
+        "type": "object",
+        "properties": {
+            "vep_files": {
+                "format": "file-path",
+                "type": "string",
+                "pattern": "^\\S+",
+                "exists": true,
+                "errorMessage": "Vep plugin file must be a path and exist."
+            }
+        },
+        "required": ["vep_files"]
+    }
+}
diff --git a/modules.json b/modules.json
@@ -68,7 +68,8 @@
                     "cadd": {
                         "branch": "master",
                         "git_sha": "cf3ed075695639b0a0924eb0901146df1996dc08",
-                        "installed_by": ["modules"]
+                        "installed_by": ["modules"],
+                        "patch": "modules/nf-core/cadd/cadd.diff"
                     },
                     "cat/fastq": {
                         "branch": "master",

diff --git a/modules/nf-core/cadd/cadd.diff b/modules/nf-core/cadd/cadd.diff
diff --git a/modules/nf-core/cadd/main.nf b/modules/nf-core/cadd/main.nf
diff --git a/nextflow_schema.json b/nextflow_schema.json
@@ -163,7 +163,7 @@
                     "pattern": "^\\S+\\.csv$",
                     "format": "file-path",
                     "mimetype": "text/csv",
-                    "schema": "/assets/schema_snpdb.json",
+                    "schema": "/assets/schema_snp_db.json",
                     "description": "A csv file with echtvar databases to annotate SNVs with",
                     "exists": true
                 },

diff --git a/subworkflows/local/annotate_cadd/main.nf b/subworkflows/local/annotate_cadd/main.nf
@@ -17,9 +17,9 @@ workflow ANNOTATE_CADD {
     ch_fai            // channel: [mandatory] [ val(meta), path(fai) ]
     ch_vcf            // channel: [mandatory] [ val(meta), path(vcfs) ]
     ch_index          // channel: [mandatory] [ val(meta), path(tbis) ]
-    ch_header         // channel: [mandatory] [ path(txt) ]
-    ch_cadd_resources // channel: [mandatory] [ path(dir) ]
-    ch_cadd_prescored // channel: [mandatory] [ path(dir) ]
+    ch_header         // channel: [mandatory] [ val(meta), path(txt) ]
+    ch_cadd_resources // channel: [mandatory] [ val(meta), path(dir) ]
+    ch_cadd_prescored // channel: [mandatory] [ val(meta), path(dir) ]
 
     main:
     ch_versions = Channel.empty()
@@ -64,7 +64,7 @@ workflow ANNOTATE_CADD {
 
     ANNOTATE_INDELS (
         ch_annotate_indels_in,
-        ch_header,
+        ch_header.map { meta, header -> header },
         CADD_TO_REFERENCE_CHRNAMES.out.output.map { meta, txt -> txt }
     )
     ch_versions = ch_versions.mix(ANNOTATE_INDELS.out.versions)

diff --git a/subworkflows/local/prepare_genome.nf b/subworkflows/local/prepare_genome.nf
@@ -10,7 +10,6 @@ workflow PREPARE_GENOME {
     gunzip_fasta               //    bool: should we gunzip fasta
     ch_vep_cache               // channel: [optional] [ val(meta), path(cache) ]
     split_vep_files            //    bool: are there vep extra files
-    ch_vep_extra_files_unsplit // channel: [optional] [ val(meta), path(csv) ]
 
     main:
     ch_versions = Channel.empty()
@@ -40,33 +39,13 @@ workflow PREPARE_GENOME {
     ch_versions = ch_versions.mix(UNTAR_VEP_CACHE.out.versions)
 
     UNTAR_VEP_CACHE.out.untar
-        .map { meta, files -> [ files ] }
         .collect()
         .set { untarred_vep }
 
-    // Read and store paths in the vep_plugin_files file
-    if ( split_vep_files ) {
-        ch_vep_extra_files_unsplit
-            .splitCsv ( header:true )
-            .map { row ->
-                path = file(row.vep_files[0])
-                if(path.exists()) {
-                    return [path]
-                } else {
-                    error("\nVep database file ${path} does not exist.")
-                }
-            }
-            .collect()
-            .set { ch_vep_extra_files }
-    } else {
-        ch_vep_extra_files = Channel.value([])
-    }
-
     emit:
     mmi             = MINIMAP2_INDEX.out.index.collect() // channel: [ val(meta), path(mmi) ]
     fai             = SAMTOOLS_FAIDX.out.fai.collect()   // channel: [ val(meta), path(fai) ]
     fasta           = ch_fasta                           // channel: [ val(meta), path(fasta) ]
-    vep_resources   = untarred_vep                       // channel: [ path(cache) ]
-    vep_extra_files = ch_vep_extra_files                 // channel: [ path(files) ]
+    vep_resources   = untarred_vep                       // channel: [ val(meta), path(cache) ]
     versions        = ch_versions                        // channel: [ versions.yml ]
 }
diff --git a/subworkflows/local/rank_variants/tests/main.nf.test b/subworkflows/local/rank_variants/tests/main.nf.test
@@ -20,9 +20,6 @@ nextflow_workflow {
                     file(params.pipelines_testdata_base_path + 'reference/vep_cache_test_data.tar.gz', checkIfExists:true)
                 ]
                 input[3] = true
-                input[4] = Channel.of([
-                    file(params.pipelines_testdata_base_path + 'reference/vep_plugin_files.csv', checkIfExists: true)
-                ])
                 """
             }
         }
@@ -69,9 +66,11 @@ nextflow_workflow {
                 ]
                 input[2] = PREPARE_GENOME.out.fasta
                 input[3] = PREPARE_GENOME.out.fai
-                input[4] = PREPARE_GENOME.out.vep_resources
+                input[4] = PREPARE_GENOME.out.vep_resources.map { meta, cache -> cache }
                 input[5] = Channel.value('110')
-                input[6] = PREPARE_GENOME.out.vep_extra_files
+                input[6] = Channel.of([
+                    file(params.pipelines_testdata_base_path + 'reference/vep_plugin_files.csv', checkIfExists: true)
+                ]).splitCsv(header:true).map { row -> row.vep_files }.collect()
                 input[7] = false
                 input[8] = Channel.value([])
                 input[9] = null

diff --git a/subworkflows/local/snv_annotation/main.nf b/subworkflows/local/snv_annotation/main.nf
@@ -8,16 +8,16 @@ workflow SNV_ANNOTATION {
 
     take:
     ch_vcf                // channel: [mandatory] [ val(meta), path(vcf) ]
-    ch_databases          // channel: [mandatory] [ val(meta), path(db) ]
+    ch_databases          // channel: [mandatory] [ path(db) ]
     ch_fasta              // channel: [mandatory] [ val(meta), path(fasta) ]
     ch_fai                // channel: [mandatory] [ val(meta), path(fai) ]
     ch_vep_cache          // channel: [mandatory] [ path(cache) ]
     val_vep_cache_version // string: [mandatory] default: 110
     ch_vep_extra_files    // channel: [mandatory] [ path(files) ]
     val_annotate_cadd     // bool: [mandatory]
     ch_cadd_header        // channel: [mandatory] [ path(txt) ]
-    ch_cadd_resources     // channel: [mandatory] [ path(annotation) ]
-    ch_cadd_prescored     // channel: [mandatory] [ path(prescored) ]
+    ch_cadd_resources     // channel: [mandatory] [ val(meta), path(annotation) ]
+    ch_cadd_prescored     // channel: [mandatory] [ val(meta), path(prescored) ]
 
     main:
     ch_versions = Channel.empty()

diff --git a/subworkflows/local/snv_annotation/tests/main.nf.test b/subworkflows/local/snv_annotation/tests/main.nf.test
@@ -88,11 +88,11 @@ nextflow_workflow {
                 ]
                 input[2] = GUNZIP.out.gunzip
                 input[3] = SAMTOOLS_FAIDX.out.fai
-                input[4] = UNTAR.out.untar.map { meta, cache -> cache }
+                input[4] = UNTAR.out.untar.map { meta, cache -> cache}
                 input[5] = Channel.value('110')
-                input[6] = [
+                input[6] = Channel.of([
                     file(params.pipelines_testdata_base_path + 'reference/vep_plugin_files.csv', checkIfExists: true)
-                ]
+                ]).splitCsv(header:true).map { row -> row.vep_files }.collect()
                 input[7] = false
                 input[8] = Channel.value([])
                 input[9] = null
@@ -132,9 +132,9 @@ nextflow_workflow {
                 input[3] = SAMTOOLS_FAIDX.out.fai
                 input[4] = UNTAR.out.untar.map { meta, cache -> cache }
                 input[5] = Channel.value('110')
-                input[6] = [
+                input[6] = Channel.of([
                     file(params.pipelines_testdata_base_path + 'reference/vep_plugin_files.csv', checkIfExists: true)
-                ]
+                ]).splitCsv(header:true).map { row -> row.vep_files }.collect()
                 input[7] = false
                 input[8] = Channel.value([])
                 input[9] = null

diff --git a/subworkflows/local/utils_nfcore_nallo_pipeline/main.nf b/subworkflows/local/utils_nfcore_nallo_pipeline/main.nf
@@ -640,3 +640,15 @@ def findKeyForValue(def valueToFind, Map map) {
     }
     return null // Value not found
 }
+
+// Utility function to create a reference channel from a path: each file is emitted as [ [ id: <simpleName> ], file ] and gathered with collect(); returns defaultValue when param is not set
+def createReferenceChannelFromPath(param, defaultValue = '') {
+    return param ? Channel.fromPath(param, checkIfExists: true) // checkIfExists: true is passed so a missing path is reported by fromPath
+        .map { [ [ id: it.simpleName ], it ] } // meta map holds only an id taken from the file's simpleName
+        .collect() : defaultValue // NOTE(review): the default '' is a plain string, not a channel - confirm callers that rely on it expect that
+}
+// Utility function to create a channel from a samplesheet: param and schema are passed to samplesheetToList() and the resulting list is emitted with Channel.fromList; returns defaultValue when param is not set
+def createReferenceChannelFromSamplesheet(param, schema, defaultValue = '') {
+    return param ? Channel.fromList(samplesheetToList(param, schema)) : defaultValue // NOTE(review): the default '' is a plain string, not a channel - confirm callers that rely on it expect that
+}
+
diff --git a/tests/.nftignore b/tests/.nftignore
@@ -10,7 +10,7 @@ paraphase/**/*.{vcf.gz,tbi,bam,bai,json}
 phased_variants/**/*.{vcf.gz,tbi}
 pipeline_info/*.{html,json,txt,yml}
 qc/cramino/**/*.txt
-qc/fastqc/**/*.zip
+qc/fastqc/**/*.{zip,html}
 qc/somalier/**/*.{html,tsv}
 repeat_annotation/**/*.{vcf.gz,tbi}
 repeat_calling/**/*.{vcf.gz,tbi,bam,bai}

diff --git a/tests/samplesheet.nf.test.snap b/tests/samplesheet.nf.test.snap
@@ -439,7 +439,6 @@
                 "test.ped:md5,bd5cec27ba7337a85cf98e787131e2b5",
                 "HG002_Revio_cramino_aligned_phased.arrow:md5,a76219e9046db32c4b3d6d78425c5d78",
                 "HG002_Revio_cramino_aligned.arrow:md5,a76219e9046db32c4b3d6d78425c5d78",
-                "HG002_Revio_fastqc.html:md5,1080b519dbbb66f45eee74e311d4922c",
                 "HG002_Revio.mosdepth.global.dist.txt:md5,63701e857361046628f89cb84988ea1d",
                 "HG002_Revio.mosdepth.region.dist.txt:md5,6b46396518979ff9d9771cb8a8fbbab0",
                 "HG002_Revio.mosdepth.summary.txt:md5,311aad293c6d8a646b6dd4edc337845c",
@@ -551,6 +550,6 @@
             "nf-test": "0.9.0",
             "nextflow": "24.04.4"
         },
-        "timestamp": "2024-10-25T13:00:09.69999597"
+        "timestamp": "2024-10-29T08:07:45.120070133"
     }
 }
diff --git a/tests/samplesheet_multisample_bam.nf.test.snap b/tests/samplesheet_multisample_bam.nf.test.snap
@@ -563,8 +563,6 @@
                 "HG002_Revio_B_cramino_aligned_phased.arrow:md5,3bb08ac5958c6cb0801f319066c3a1b2",
                 "HG002_Revio_A_cramino_aligned.arrow:md5,a76219e9046db32c4b3d6d78425c5d78",
                 "HG002_Revio_B_cramino_aligned.arrow:md5,3bb08ac5958c6cb0801f319066c3a1b2",
-                "HG002_Revio_A_fastqc.html:md5,25f875c3a542ff8590655685bc152658",
-                "HG002_Revio_B_fastqc.html:md5,4b7d698cbe79dbfb4a74e8e7f84891d5",
                 "HG002_Revio_A.mosdepth.global.dist.txt:md5,63701e857361046628f89cb84988ea1d",
                 "HG002_Revio_A.mosdepth.region.dist.txt:md5,6b46396518979ff9d9771cb8a8fbbab0",
                 "HG002_Revio_A.mosdepth.summary.txt:md5,311aad293c6d8a646b6dd4edc337845c",
@@ -759,6 +757,6 @@
             "nf-test": "0.9.0",
             "nextflow": "24.04.4"
         },
-        "timestamp": "2024-10-25T13:01:47.642764093"
+        "timestamp": "2024-10-29T08:09:35.63908858"
     }
 }
diff --git a/tests/samplesheet_multisample_ont_bam.nf.test.snap b/tests/samplesheet_multisample_ont_bam.nf.test.snap
@@ -402,8 +402,6 @@
                 "HG002_ONT_B_cramino_aligned_phased.arrow:md5,61af72539e105cec79db7c9b78eb15a7",
                 "HG002_ONT_A_cramino_aligned.arrow:md5,d2a5c81595fa34925ab8f03078487d81",
                 "HG002_ONT_B_cramino_aligned.arrow:md5,61af72539e105cec79db7c9b78eb15a7",
-                "HG002_ONT_A_fastqc.html:md5,94d86b38a30f846de64b840656663d18",
-                "HG002_ONT_B_fastqc.html:md5,2ec692ee5acf69717811be481d38f775",
                 "HG002_ONT_A.mosdepth.global.dist.txt:md5,5ae0972357f99aa481a0bf12fb9e0b0b",
                 "HG002_ONT_A.mosdepth.region.dist.txt:md5,023b1c6aeaf8fa5ededd6b711a5cd012",
                 "HG002_ONT_A.mosdepth.summary.txt:md5,c3b664b0983213f73edf3c0d5a0b04a2",
@@ -502,6 +500,6 @@
             "nf-test": "0.9.0",
             "nextflow": "24.04.4"
         },
-        "timestamp": "2024-10-25T13:10:46.10939576"
+        "timestamp": "2024-10-29T08:11:05.903725502"
     }
 }