Software updates, enhancements and bugfixes
- GATK 4.5.0.0
- CNVkit 0.9.11
- BLAST 2.15
- MultiQC 1.21
- pVACtools 4.1.1 (fixes #43, #60, #66)
- VEP 111.0

- added DeepImmuno to pVACseq predictors

- remove sequenza from nextNEOpiENV and use it in separate environments

- switch from SGE to Slurm in the cluster profile example

- set process memory defaults for resource reservation to avoid OOM kills
- set a default process time directive to cap maximum process execution time
- refine per-process CPU directive settings

- fix custom HLA file handling (fixes and closes #70, thanks to @mantczakaus)
- added custom HLA file example (closes #72)

- changed how pVACseq results are concatenated: a Python script using pandas now does the job.
riederd committed May 11, 2024
1 parent 50779e7 commit fc01dde
Showing 9 changed files with 266 additions and 169 deletions.
12 changes: 6 additions & 6 deletions README.md
@@ -16,7 +16,7 @@ The pipeline uses the following tools:
* CNVkit
* OptiType
* HLA-HD
* pVACseq (netMHCpan, netMHCIIpan, mhcflurry)
* pVACseq (NetMHCpan NetMHCpanEL MHCflurry MHCflurryEL NetMHCIIpan NetMHCIIpanEL DeepImmuno)
* NeoFuse
* mixMHC2pred
* mixcr
@@ -155,7 +155,7 @@ its dependencies.

We strongly recommend running the pipeline on an HPC cluster. You can enable cluster mode by using a profile named e.g. **cluster** together with the option ```-profile singularity,cluster``` or ```-profile conda,cluster```

For an example SGE cluster profile, please see ```profiles``` in ```conf/profiles.config```. You may uncomment and adjust the cluster profile to your scheduling system.
For an example Slurm cluster profile, please see ```profiles``` in ```conf/profiles.config```. You may uncomment and adjust the cluster profile to your scheduling system.
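For orientation, a minimal sketch of what such a Slurm profile might look like (the partition name, account and queue size below are placeholders, not the shipped defaults; the `conf/profiles.config` in the repository is authoritative):

```groovy
// hypothetical excerpt of a cluster profile in conf/profiles.config
profiles {
    cluster {
        process.executor       = 'slurm'
        process.queue          = 'standard'              // placeholder partition name
        process.clusterOptions = '--account=my_account'  // placeholder Slurm account
        executor.queueSize     = 100                      // cap on concurrently submitted jobs
    }
}
```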

**Sequencing data input:**

@@ -203,7 +203,7 @@ Make sure that your batchFile CSV includes the column names as shown in the example
* reads1: forward reads (can be fastq or gzipped fastq)
* reads2: reverse reads (if paired end sequencing was used, empty for single end)
* sampleType: one of `tumor_DNA, normal_DNA, tumor_RNA`
* HLAfile: optional file with HLA types (default: empty)
* HLAfile: optional file with HLA types (default: empty; see `nextNEOpi_testdata.tar.gz` for an example)
* sex: gender of the sample if known (female, male, xx, xy) or NA if unknown

A sample may have multiple read files for a single `sampleType`; nextNEOpi will merge them accordingly. As shown in the above example `sample4` has 2 fastq files for the `tumor_DNA`; in this case `reads_1_1.fastq.gz` will be merged with `reads_2_1.fastq.gz`. The same applies to `reads2`.
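A minimal sketch of such a batchFile (the read file names, the empty HLAfile column and the sex value are illustrative only; the complete example ships with `nextNEOpi_testdata.tar.gz`):

```csv
sampleName,reads1,reads2,sampleType,HLAfile,sex
sample4,reads_1_1.fastq.gz,reads_1_2.fastq.gz,tumor_DNA,,female
sample4,reads_2_1.fastq.gz,reads_2_2.fastq.gz,tumor_DNA,,female
sample4,normal_1.fastq.gz,normal_2.fastq.gz,normal_DNA,,female
sample4,tumor_rna_1.fastq.gz,tumor_rna_2.fastq.gz,tumor_RNA,,female
```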
@@ -363,15 +363,15 @@ If you prefer local installation of the analysis tools please install the following
* FASTQC (Version >= 0.11.8)
* FASTP (Version >= v0.20.1)
* JAVA7 (Version 1.7)
* JAVA8 (Version 1.8)
* JAVA (Version >= 17)
* BWA (Version >= 0.7.17)
* SAMTOOLS (Version >= 1.9)
* GATK3 (Version 3.8-0)
* GATK4 (Version >= 4.4.0.0)
* GATK4 (Version >= 4.5.0.0)
* VARSCAN (Version 2.4.6)
* MUTECT1 (Version 1.1.7) ---- optional
* BAMREADCOUNT (Version 0.8.0)
* VEP (Version v110)
* VEP (Version v111)
* BGZIP
* TABIX
* BCFTOOLS
10 changes: 5 additions & 5 deletions assets/nextNEOpi.def
@@ -22,18 +22,18 @@ From: mambaorg/micromamba:0.24.0
mkdir -p /opt/gatk
mkdir -p /opt/conda/bin

curl -L -o gatk-4.4.0.0.zip https://github.com/broadinstitute/gatk/releases/download/4.4.0.0/gatk-4.4.0.0.zip
unzip -j gatk-4.4.0.0.zip gatk-4.4.0.0/gatkPythonPackageArchive.zip -d ./
unzip -j gatk-4.4.0.0.zip gatk-4.4.0.0/gatk-package-4.4.0.0-local.jar -d ./opt/gatk/
unzip -j gatk-4.4.0.0.zip gatk-4.4.0.0/gatk -d ./opt/gatk/
curl -L -o gatk-4.5.0.0.zip https://github.com/broadinstitute/gatk/releases/download/4.5.0.0/gatk-4.5.0.0.zip
unzip -j gatk-4.5.0.0.zip gatk-4.5.0.0/gatkPythonPackageArchive.zip -d ./
unzip -j gatk-4.5.0.0.zip gatk-4.5.0.0/gatk-package-4.5.0.0-local.jar -d ./opt/gatk/
unzip -j gatk-4.5.0.0.zip gatk-4.5.0.0/gatk -d ./opt/gatk/

chmod +x /opt/gatk/gatk
ln -s /opt/gatk/gatk /opt/conda/bin/gatk

micromamba install --yes --name base --file /nextNEOpi.yml

rm -f /nextNEOpi.yml
rm -f gatk-4.4.0.0.zip
rm -f gatk-4.5.0.0.zip
rm -f gatkPythonPackageArchive.zip

apt-get clean
12 changes: 6 additions & 6 deletions assets/nextNEOpi.yml
@@ -15,11 +15,12 @@ dependencies:
- yara
- optitype

# core python dependencies for GATK4 (4.4.0.0)
# core python dependencies for GATK4 (4.5.0.0)
- conda-forge::python=3.6.10 # do not update
- pip=20.0.2 # specifying channel may cause a warning to be emitted by conda
- pip=21.3.1 # specifying channel may cause a warning to be emitted by conda
- conda-forge::mkl=2019.5 # MKL typically provides dramatic performance increases for theano, tensorflow, and other key dependencies
- conda-forge::mkl-service=2.3.0
- conda-forge::joblib=1.1.1 # must pin joblib - versions after 1.1.1 no longer support python 3.6
- conda-forge::numpy=1.17.5 # do not update, this will break scipy=1.0.0
# verify that numpy is compiled against MKL (e.g., by checking *_mkl_info using numpy.show_config())
# and that it is used in tensorflow, theano, and other key dependencies
@@ -40,19 +41,19 @@ dependencies:
- conda-forge::pandas=1.0.3
- conda-forge::typing_extensions=4.1.1 # see https://github.com/broadinstitute/gatk/issues/7800 and linked PRs
- conda-forge::dill=0.3.4 # used for pickling lambdas in TrainVariantAnnotationsModel
- conda-forge::joblib=1.1.1
- conda-forge::libxcrypt
- conda-forge::absl-py=1.4.0

# core R dependencies; these should only be used for plotting and do not take precedence over core python dependencies!
- r-base>=3.6.2
- r-data.table
- r-data.table=1.12.8
- r-dplyr=0.8.5
- r-getopt=1.20.3
- r-ggplot2=3.3.0
- r-gplots=3.0.3
- r-gsalib=2.1
- r-optparse=1.6.4
- r-backports=1.1.10
- r-sequenza

# other python dependencies; these should be removed after functionality is moved into Java code
- biopython=1.76
@@ -62,4 +63,3 @@ dependencies:
# pip installs should be avoided, as pip may not respect the dependencies found by the conda solver
- pip:
- gatkPythonPackageArchive.zip
- sequenza-utils
14 changes: 9 additions & 5 deletions assets/pVACtools_icbi.def
@@ -40,7 +40,8 @@
#

Bootstrap: docker
From: continuumio/miniconda3:4.9.2
#From: continuumio/miniconda3:4.11.0
From: continuumio/miniconda3:22.11.1

%files
./.mambarc /root/.mambarc
@@ -80,10 +81,13 @@ From: continuumio/miniconda3:4.9.2
export MHCFLURRY_DATA_DIR=/opt/mhcflurry_data

pip install --upgrade pip
pip install protobuf==3.20.1
pip install tensorflow>=2.2.2
pip install 'tensorflow<2.16.0'

pip install pvactools==4.0.1
pip install git+https://github.com/griffithlab/bigmhc.git#egg=bigmhc
pip install git+https://github.com/griffithlab/deepimmuno.git#egg=deepimmuno

pip install pvactools==4.1.1
pip install mhcflurry==2.1.1

cd /opt
mkdir tmp_src
@@ -92,7 +96,7 @@ From: continuumio/miniconda3:4.9.2
## workaround: the current version 20210210 does not build (https://github.com/atks/vt/issues/113)
conda install vt==0.57721

pip install cyvcf2==0.30.4
pip install cyvcf2
pip install vatools
wget -O /usr/local/bin/bam_readcount_helper.py https://raw.githubusercontent.com/genome/docker-bam_readcount_helper-cwl/master/bam_readcount_helper.py
chmod 755 /usr/local/bin/bam_readcount_helper.py
51 changes: 51 additions & 0 deletions bin/concat_pvacseq.py
@@ -0,0 +1,51 @@
#!/usr/bin/env python

"""
Concatenate pVACseq output files
Requirements:
 * Python >= 3.6.2
 * pandas >= 1.4
Copyright (c) 2024 Dietmar Rieder <dietmar.rieder@i-med.ac.at>
MIT License <http://opensource.org/licenses/MIT>
"""

import os
import argparse
import pandas as pd


def concat_files(directory, pattern):
    # List all files in the given directory that match the pattern
    files_to_concat = [os.path.join(directory, file) for file in os.listdir(directory) if file.endswith(pattern)]
    print(files_to_concat)

    # Initialize an empty list to store DataFrames
    dfs = []

    # Iterate over the files and read them into DataFrames
    for file in files_to_concat:
        df = pd.read_csv(file, sep="\t")
        dfs.append(df)

    # Concatenate the DataFrames
    result = pd.concat(dfs, ignore_index=True)

    return result


if __name__ == "__main__":
    # Set up argparse to accept command-line arguments
    parser = argparse.ArgumentParser(description='Concatenate files in the given directory.')
    parser.add_argument('--dir', type=str, default='./', help='Directory to search for files (default: current directory)')
    parser.add_argument('--pattern', type=str, help='Pattern to match filenames')
    parser.add_argument('--output', type=str, default='out.tsv', help='Output filename')

    # Parse the command-line arguments
    args = parser.parse_args()

    # Call the function to concatenate files with the specified pattern
    concatenated_df = concat_files(args.dir, args.pattern)

    # Save the concatenated DataFrame to a TSV file
    concatenated_df.to_csv(args.output, index=False, sep="\t", na_rep='NA')
26 changes: 9 additions & 17 deletions conf/params.config
@@ -108,11 +108,12 @@ params {
// REQUIRED: Path to the installation directory of HLA-HD
// Please install HLA-HD locally, you can get your own copy of HLA-HD at:
// https://www.genome.med.kyoto-u.ac.jp/HLA-HD/
HLAHD_DIR = "" // path to HLA_HD hlahd.1.5.0
HLAHD_DIR = "" // path to HLA_HD hlahd.1.7.0
HLAHD_module = "" // hlahd module name, if installed as a module
HLA_HD_genome_version = "hg38"

// URL to the installation package of MiXCR, will be installed automatically.
MIXCR_url = "https://github.com/milaboratory/mixcr/releases/download/v4.4.1/mixcr-4.4.1.zip"
MIXCR_url = "https://github.com/milaboratory/mixcr/releases/download/v4.6.0/mixcr-4.6.0.zip"
MIXCR_lic = "" // path to MiXCR license file
MIXCR = "" // Optional: specify path to mixcr directory if already installed, will be installed automatically otherwise
// analyze TCRs using mixcr
@@ -129,22 +130,13 @@ params {
IGS = "" // optional path to IGS

// IEDB tools urls for MHCI and MHCII. These will be used for IEDB installation into resources.databases.IEDB_dir
IEDB_MHCI_url = "https://downloads.iedb.org/tools/mhci/3.1.4/IEDB_MHC_I-3.1.4.tar.gz"
IEDB_MHCII_url = "https://downloads.iedb.org/tools/mhcii/3.1.8/IEDB_MHC_II-3.1.8.tar.gz"


// Java settings: please adjust to your memory available
JAVA_Xmx = "-Xmx64G"

// samtools memory: please adjust to your memory available
STperThreadMem = "8G"
IEDB_MHCI_url = "https://downloads.iedb.org/tools/mhci/3.1.5/IEDB_MHC_I-3.1.5.tar.gz"
IEDB_MHCII_url = "https://downloads.iedb.org/tools/mhcii/3.1.12/IEDB_MHC_II-3.1.12.tar.gz"

// sambamba settings: please adjust to your memory available
SB_hash_table_size = "1048576"
SB_overflow_list_size = "1000000"
SB_io_buffer_size = "1024"
SB_sort_mem = "64G"


// Filter variants (FilterMutect2Tumor): set minimum allele depth
minAD = 5
@@ -178,9 +170,9 @@ params {


// VEP
vep_version = "110.0"
vep_version = "111.0"
vep_assembly = "GRCh38"
vep_cache_version = "110"
vep_cache_version = "111"
vep_species = "homo_sapiens"
vep_options = "--everything" // "--af --af_1kg --af_gnomad --appris --biotype --check_existing --distance 5000 --failed 1 --merged --numbers --polyphen b --protein --pubmed --regulatory --sift b --symbol --xref_refseq --tsl --gene_phenotype"

@@ -206,7 +198,7 @@ params {
// pVACseq settings
mhci_epitope_len = "8,9,10,11"
mhcii_epitope_len = "15,16,17,18,19,20,21,22,23,24,25" // minimum length has to be at least 15 (see pVACtools /opt/iedb/mhc_ii/mhc_II_binding.py line 246)
epitope_prediction_tools = "NetMHCpan NetMHCpanEL MHCflurry MHCflurryEL NetMHCIIpan NetMHCIIpanEL"
epitope_prediction_tools = "NetMHCpan NetMHCpanEL MHCflurry MHCflurryEL NetMHCIIpan NetMHCIIpanEL DeepImmuno"
use_NetChop = false
use_NetMHCstab = true

@@ -278,4 +270,4 @@ singularity {
enabled = true
autoMounts = true
runOptions = process.containerOptions + " -H " + params.singularityTmpMount + " -B " + params.singularityAssetsMount + " -B " + params.singularityTmpMount + " -B " + params.resourcesBaseDir + params.singularityHLAHDmount + " -B " + params.databases.IEDB_dir + ":/opt/iedb" + " -B " + params.databases.MHCFLURRY_dir + ":/opt/mhcflurry_data"
}
}
35 changes: 30 additions & 5 deletions conf/process.config
@@ -7,6 +7,10 @@ process {

// default number of cpus to use
cpus = 1
memory = '8 G'

// default time
time = '4 h'

// set default cache policy
// Cache keys are created by indexing input file path and size attributes
@@ -19,6 +23,7 @@

withName:bam2fastq {
cpus = 4
memory = '32 G'
}

withName:FastQC {
@@ -35,14 +40,17 @@

withName:make_uBAM {
beforeScript = "ulimit -n 4096"
memory = '64 G'
}

withName:Bwa {
cpus = 16
memory = '64 G'
}

withName:MarkDuplicates {
cpus = 4
memory = '64 G'
}

withName:Mutect2 {
@@ -51,46 +59,54 @@

withName:Mutect1scattered {
cpus = 1
memory = '32 G'
}

withName:VarscanSomaticScattered {
cpus = 2
memory = '16 G'
}

withName:IndelRealignerIntervals {
cpus = 8
memory = '16 G'
}

withName:GatherRealignedBamFiles {
cpus = 8
memory = '16 G'
}

withName:GatherRecalBamFiles {
cpus = 8
memory = '16 G'
}

withName:scatterGATK4applyBQSRS {
cpus = 2
}

withName:alignmentMetrics {
cpus = 8
cpus = 1
}

withName:HaploTypeCaller {
cpus = 2
cpus = 4
memory = '16 G'
}

withName:CNNScoreVariants {
cpus = 2
memory = '16 G'
}

withName:mkCombinedVCF {
cpus = 4
}

withName:ReadBackedphasing {
cpus = 8
cpus = 2
memory = '16 G'
}

withName:MantaSomaticIndels {
@@ -101,7 +117,11 @@
cpus = 16
}

withLabel:VEP {
withName:VEPtab {
cpus = 16
}

withName:VEPvcf {
cpus = 16
}

@@ -173,10 +193,15 @@ process {
container = 'https://apps-01.i-med.ac.at/images/singularity/NeoFuse_dev_92712e07.sif'
containerOptions = "--no-home --containall"
cpus = 10
memory = '96 G'
}

withLabel:pVACtools {
container = 'https://apps-01.i-med.ac.at/images/singularity/pVACtools_4.0.1_icbi_4ae2625d.sif'
container = 'https://apps-01.i-med.ac.at/images/singularity/pVACtools_4.1.1_icbi_test.sif'
}

withLabel:nextNEOpiENV {
container = 'https://apps-01.i-med.ac.at/images/singularity/nextNEOpi_1.4.1_test.sif'
}

withName:pVACseq {