Commit

Merge pull request #10 from zjnolen/develop

Merge in full changes for v0.1

zjnolen authored Sep 14, 2023
2 parents 7eabe74 + 2b3fd85 commit 097eb90
Showing 22 changed files with 282 additions and 362 deletions.
15 changes: 8 additions & 7 deletions .test/config/units.tsv
@@ -1,7 +1,8 @@
sample unit platform fq1 fq2 realname
hist1 BHVN22DSX2.2 ILLUMINA data/fastq/hist1.r1.fastq.gz data/fastq/hist1.r2.fastq.gz MZLU153040
hist2 BHVN22DSX2.2 ILLUMINA data/fastq/hist2.r1.fastq.gz data/fastq/hist2.r2.fastq.gz MZLU153042
hist3 BHVN22DSX2.2 ILLUMINA data/fastq/hist3.r1.fastq.gz data/fastq/hist3.r2.fastq.gz MZLU153045
mod1 AHW5NGDSX2.3 ILLUMINA data/fastq/mod1.r1.fastq.gz data/fastq/mod1.r2.fastq.gz MZLU107434
mod2 AHW5NGDSX2.3 ILLUMINA data/fastq/mod2.r1.fastq.gz data/fastq/mod2.r2.fastq.gz MZLU107435
mod3 AHW5NGDSX2.3 ILLUMINA data/fastq/mod3.r1.fastq.gz data/fastq/mod3.r2.fastq.gz MZLU107436
sample unit lib platform fq1 fq2 realname
hist1 BHVN22DSX2.2 hist1 ILLUMINA data/fastq/hist1.r1.fastq.gz data/fastq/hist1.r2.fastq.gz MZLU153040
hist1 BHVN22DSX2.3 hist1 ILLUMINA data/fastq/hist1.unit2.r1.fastq.gz data/fastq/hist1.unit2.r2.fastq.gz MZLU153040
hist2 BHVN22DSX2.2 hist2 ILLUMINA data/fastq/hist2.r1.fastq.gz data/fastq/hist2.r2.fastq.gz MZLU153042
hist3 BHVN22DSX2.2 hist2 ILLUMINA data/fastq/hist3.r1.fastq.gz data/fastq/hist3.r2.fastq.gz MZLU153045
mod1 AHW5NGDSX2.3 mod1 ILLUMINA data/fastq/mod1.r1.fastq.gz data/fastq/mod1.r2.fastq.gz MZLU107434
mod2 AHW5NGDSX2.3 mod2 ILLUMINA data/fastq/mod2.r1.fastq.gz data/fastq/mod2.r2.fastq.gz MZLU107435
mod3 AHW5NGDSX2.3 mod3 ILLUMINA data/fastq/mod3.r1.fastq.gz data/fastq/mod3.r2.fastq.gz MZLU107436
Binary file added .test/data/fastq/hist1.unit2.r1.fastq.gz
Binary file not shown.
Binary file added .test/data/fastq/hist1.unit2.r2.fastq.gz
Binary file not shown.
159 changes: 85 additions & 74 deletions README.md
@@ -1,49 +1,57 @@
# Genotype likelihood population genomics pipeline

This workflow is aimed at processing raw sequencing reads and calculating population
genomic statistics within a genotype likelihood framework. As it is focused on GL
methods, it has options to adapt the workflow for processing data with low or variable
coverage and/or containing historical/ancient samples with degraded DNA. It is under
active development, so new features will be added.
This workflow is aimed at processing raw sequencing reads and calculating
population genomic statistics within a genotype likelihood framework. As it is
focused on GL methods, it has options to adapt the workflow for processing data
with low or variable coverage and/or containing historical/ancient samples with
degraded DNA. It is under active development, so new features will be added.

## Getting Started

To run this workflow, you'll need paired-end raw sequencing data and a reference genome
to map it to. Currently, the workflow is only compatible with datasets containing only
a single library and paired-end sequencing run per sample, and reference genomes must be
uncompressed.
To run this workflow, you'll need paired-end raw sequencing data and a
reference genome to map it to. Currently, the workflow is only compatible
with datasets containing only a single library and paired-end sequencing run
per sample, and reference genomes must be uncompressed.

To run the workflow, you will need to be working on a machine with the following:
To run the workflow, you will need to be working on a machine with the
following:

- Conda (or, preferably, its drop-in replacement, mamba)
- Singularity/Apptainer

Once these tools are installed, to deploy and configure the workflow, you can follow
the instructions provided in the [Snakemake workflow catalog](https://snakemake.github.io/snakemake-workflow-catalog/?usage=zjnolen/angsd-snakemake-pipeline).
You can refer to the Snakemake documentation for additional information that may be
relevant to your computing environment (running jobs through cluster job queues, setting
default resources).
Once these tools are installed, to deploy and configure the workflow, you
can follow the instructions provided in the [Snakemake workflow catalog](https://snakemake.github.io/snakemake-workflow-catalog/?usage=zjnolen/angsd-snakemake-pipeline).
You can refer to the Snakemake documentation for additional information that
may be relevant to your computing environment (running jobs through cluster job
queues, setting default resources).

### Notes on inputs

Reference genomes should be uncompressed, and contig names should be clear and concise.
Currently, there are some issues parsing contig names with underscores, so please change
these in your reference before running the pipeline. Alphanumeric characters, as well as
`.` in contig names have been tested to work so far, other symbols have not been tested.
Reference genomes should be uncompressed, and contig names should be clear and
concise. Currently, there are some issues parsing contig names with
underscores, so please change these in your reference before running the
pipeline. Alphanumeric characters, as well as `.`, have been tested to work in
contig names so far; other symbols have not been tested.
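
The sketch below is a stand-alone helper, not part of the workflow, for
flagging contig names that fall outside the characters described above
(letters, digits, and `.`); the script name and usage are only illustrative.

```python
import re
import sys

# Characters described above as tested: letters, digits, and "."
SAFE_CONTIG = re.compile(r"^[A-Za-z0-9.]+$")


def check_contig_names(fasta_path):
    """Print contig names containing untested characters (e.g. underscores)."""
    with open(fasta_path) as fasta:
        for line in fasta:
            if line.startswith(">"):
                name = line[1:].split()[0]
                if not SAFE_CONTIG.match(name):
                    print(f"untested characters in contig name: {name}")


if __name__ == "__main__":
    # e.g. python check_contigs.py results/ref/genome/genome.fa
    check_contig_names(sys.argv[1])
```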

Potentially the ability to use bgzipped genomes will be added, I just need to check that
it works with all underlying tools. Currently, it will for sure not work, as calculating
chunks is hard-coded to work on an uncompressed genome.
Potentially, the ability to use bgzipped genomes will be added; I just need to
check that it works with all underlying tools. Currently, it definitely will not
work, as calculating chunks is hard-coded to work on an uncompressed genome.

### Running on a cluster

Development was done on UPPMAX's Rackham cluster, and a simple profile is included in
the [`rackham`](rackham) folder to simplify running this workflow through SLURM there.
For running on other SLURM based cluster configs, this file should largely work with a
few minor modifications of the defaults. See
Development was done on UPPMAX's Rackham cluster, and a simple profile is
included in the [`rackham`](rackham) folder to simplify running this workflow
through SLURM there. For other SLURM-based clusters, this profile should
largely work with a few minor modifications of the defaults. See
[Snakemake's cluster support documentation](https://snakemake.readthedocs.io/en/stable/executing/cluster.html)
for information on how to adapt the profile for other HPC environments.

#### Notes on resources

As Rackham ties memory reservations to CPU reservations, resource allocation
for rules is currently done mostly through threads. In the future, thread and
memory resources will be more explicitly defined.
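
The hypothetical rule below illustrates the pattern (rule, file, and tool names
are made up, not taken from the workflow): because each reserved core on
Rackham brings a fixed share of memory, requesting threads implicitly requests
memory as well.

```snakemake
rule example_step:
    input:
        "results/example/{sample}.in",
    output:
        "results/example/{sample}.out",
    threads: 4  # on Rackham, 4 cores also reserve 4 cores' worth of memory
    resources:
        runtime=240,  # minutes
    shell:
        "example_tool --threads {threads} {input} > {output}"
```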

## Features

Currently, the pipeline performs the following tasks:
@@ -55,16 +63,17 @@ Currently, the pipeline performs the following tasks:
### Raw read preparation

- Trimming of paired-end reads from high quality libraries
- Collapsing of paired-end reads from fragmented (aDNA/historical DNA) libraries
- Collapsing of paired-end reads from fragmented (aDNA/historical DNA)
libraries

### Read mapping

- Mapping prepared raw reads to the reference using bwa-mem and clipping of overlapping
reads
- **NOTE**: Reads marked as historical (degraded) will only map short reads that
overlap and collapse, to reduce mapping of likely contaminants.
- Removal of PCR and sequencing duplicates separately for fresh (Picard) and fragmented
(DeDup) DNA reads
- Mapping prepared raw reads to the reference using bwa-mem, with clipping of
overlapping reads and combining of multiple runs/libraries per sample
- **NOTE**: Reads marked as historical (degraded) will only map short reads
that overlap and collapse, to reduce mapping of likely contaminants (see the
sketch after this list).
- Removal of PCR and sequencing duplicates separately for fresh (Picard) and
fragmented (DeDup) DNA reads
- Realignment around indels
- Optional recalibration of base quality scores on degraded DNA bam files with
[MapDamage2](https://ginolhac.github.io/mapDamage/)
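
As a purely illustrative sketch of the historical vs. modern split described in
the note above (the function, argument, and file paths here are hypothetical,
not the workflow's own), read selection for mapping could look like this:

```python
def bwa_reads(sample, degraded):
    """Pick which processed reads go to bwa-mem for a sample.

    Illustrative only: degraded (historical) samples map just the collapsed
    overlapping read pairs, while modern samples map the trimmed pairs.
    """
    if degraded:
        return [f"results/trimmed/{sample}.collapsed.fastq.gz"]
    return [
        f"results/trimmed/{sample}.r1.fastq.gz",
        f"results/trimmed/{sample}.r2.fastq.gz",
    ]
```
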
@@ -74,37 +83,41 @@ Currently, the pipeline performs the following tasks:

- Assess post-mortem DNA damage with DamageProfiler
- Assess mapping quality stats with Qualimap
- Assess endogenous content using mapping proportion before duplicate reads are removed
- Assess endogenous content using mapping proportion before duplicate reads are
removed

### Data quality filtering

- Analyses can be set with minimum mapping and base quality thresholds
- Exclusion of entire scaffolds (i.e. sex-linked, low quality) through user config (both
list and contig size based)
- Exclusion of entire scaffolds (e.g. sex-linked, low quality) through user
config (both list and contig size based)
- Exclusion of repeat regions from analyses using RepeatModeler/RepeatMasker
- Exclusion of low mappability regions with GenMap
- Exclusion of sites with extreme global depth values (determined separately for the
entire dataset, and subsets at certain coverage ranges, then merged)
- Exclusion of sites based on data missingness across dataset and/or per population
- Exclusion of sites with extreme global depth values (determined separately
for the entire dataset, and subsets at certain coverage ranges, then merged)
- Exclusion of sites based on data missingness across dataset and/or per
population

### GL based population genomic analyses

To speed up the pipeline, many of these analyses are done for part of the genome at a
time, then later merged. This is only done for analyses where possible and where the
time saved is appreciable. These chunks are made to be a user configured size to allow
tuning of run-times (i.e. more jobs, shorter runtimes vs fewer jobs, longer runtimes).

SAF based analyses are done on variable and non-variable sites passing quality filters.
This set is the same across all populations in the dataset and is based on the positions
passing all the requested filters. Beagle (SNP) based analyses are done on a SNP set
that is constant across all populations, determined from the output of the Beagle file
for the whole dataset, and major and minor alleles are inferred from the whole dataset.
When relevant, pruned SNPs are used. Pruning is done on the whole dataset beagle file
and the same pruned sites are used for all populations.

Additionally, all analyses can be repeated with samples subsampled to a lower user
configured depth. This helps to ensure results are not simply due to variance in depth
between groups.
To speed up the pipeline, many of these analyses are done for part of the
genome at a time, then later merged. This is only done for analyses where
possible and where the time saved is appreciable. These chunks are made to be a
user-configured size to allow tuning of run-times (i.e. more jobs with shorter
runtimes vs fewer jobs with longer runtimes).
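
The sketch below illustrates the chunking idea only (it is not the workflow's
actual implementation): contigs from the reference `.fai` index are grouped
greedily into chunks of roughly the configured size.

```python
def chunk_contigs(fai_path, chunk_size):
    """Greedily group contigs from a .fai index into chunks of ~chunk_size bp."""
    chunks, current, current_len = [], [], 0
    with open(fai_path) as fai:
        for line in fai:
            name, length = line.split("\t")[:2]
            length = int(length)
            if current and current_len + length > chunk_size:
                chunks.append(current)
                current, current_len = [], 0
            current.append(name)
            current_len += length
    if current:
        chunks.append(current)
    return chunks


# e.g. chunk_contigs("results/ref/genome/genome.fa.fai", 50_000_000)
```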

SAF-based analyses are done on variable and non-variable sites passing quality
filters. This set is the same across all populations in the dataset and is
based on the positions passing all the requested filters. Beagle (SNP) based
analyses are done on a SNP set that is constant across all populations,
determined from the output of the Beagle file for the whole dataset, and major
and minor alleles are inferred from the whole dataset. When relevant, pruned
SNPs are used. Pruning is done on the whole-dataset Beagle file and the same
pruned sites are used for all populations.

Additionally, all analyses can be repeated with samples subsampled to a lower
user-configured depth. This helps to ensure results are not simply due to
variance in depth between groups.

**Analyses:**

@@ -113,30 +126,28 @@ between groups.
- Admixture with NGSAdmix
- Relatedness using NgsRelate and methods from Waples et al. 2019, *Mol. Ecol.*
- 1D and 2D Site frequency spectrum production with ANGSD
- Neutrality statistics per population (Watterson's theta, pairwise pi, Tajima's D) in
user defined sliding windows with ANGSD
- Neutrality statistics per population (Watterson's theta, pairwise pi,
Tajima's D) in user-defined sliding windows with ANGSD
- Estimation of heterozygosity per sample from 1D SFS with ANGSD
- Pairwise $F_{ST}$ between all populations or individuals in user defined sliding
windows with ANGSD
- Inbreeding coefficients and runs of homozygosity per sample with ngsF-HMM (**NOTE**
This is currently only possible for samples that are within a population sampling,
not for lone samples which will always return an inbreeding coefficient of 0)
- Pairwise $F_{ST}$ between all populations or individuals in user-defined
sliding windows with ANGSD
- Inbreeding coefficients and runs of homozygosity per sample with ngsF-HMM
(**NOTE**: this is currently only possible for samples that are part of a
population sampling; lone samples will always return an inbreeding
coefficient of 0)

### Planned

Some additional components to the pipeline are planned, with the order below
roughly corresponding to priority:

- Allow custom site list - either as a supplement to the filters already present or as
the only filter (by setting all other filters to `false`)
- Add calculation of bootstrapped SFS
- Estimate LD decay with ngsLD
- Manhattan plots in report for sliding window results
- Allow starting with bam files - for those that want to process raw reads in their own
way before performing analyses
- Allow starting with bam files - for those that want to process raw reads in
their own way before performing analyses
- Add calculation of Dxy
- Add schema for configuration files to improve incorrect format handling and to enable
defaults
- Add schema for configuration files to improve incorrect format handling and
to enable defaults

## Workflow directed action graph

@@ -149,8 +160,8 @@ it is a close approximation of the flow as of now.

## Acknowledgements

The computations required for developing and testing this workflow have been enabled by
resources provided by the National Academic Infrastructure for Supercomputing in Sweden
(NAISS) and the Swedish National Infrastructure for Computing (SNIC) at UPPMAX, partially
funded by the Swedish Research Council through grant agreements no. 2022-06725 and no.
2018-05973.
The computations required for developing and testing this workflow have been
enabled by resources provided by the National Academic Infrastructure for
Supercomputing in Sweden (NAISS) and the Swedish National Infrastructure for
Computing (SNIC) at UPPMAX, partially funded by the Swedish Research Council
through grant agreements no. 2022-06725 and no. 2018-05973.
19 changes: 9 additions & 10 deletions config/README.md
@@ -38,20 +38,19 @@ tab separated:

- `sample` - The sample name, same as in `samples.tsv`.
- `unit` - This is used to fill out the `ID` read group in the bam file. It
must be unique within each BAM file, so the same sample shouldn't have the
same unit for more than one set of fastq reads. I have not implemented
multiple lanes or libraries per sample in this pipeline (yet), so any value
will currently work, but a good format might be `library_id.sequencer_barcode.lane`.
- `platform` - This is used to fill out the `PL` read group. Put what you'd want
there. Usually `ILLUMINA` for Illumina platforms.
must be unique to each read group, so the same sample shouldn't have the
same unit for more than one sequencing run. A good format might be
`sequencer_barcode.lane`. Optical duplicates will be removed within units.
- `lib` - This is used to fill out the `LB` read group. This should be a unique
identifier for each sample library. Sequencing runs from the same library
will have the same value in `lib`, but different values in `unit`.
- `platform` - This is used to fill out the `PL` read group. Put what you'd
want there. Usually `ILLUMINA` for Illumina platforms.
- `fq1` and `fq2` - The absolute or relative paths from the working directory
to the raw fastq files for the sample. Currently the pipeline only supports
paired-end sequencing; single-end support may be added down the line. A sketch
of how this sheet can be parsed is shown after this list.
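
The snippet below is only an illustration of reading and sanity-checking the
units sheet with pandas (it is not the workflow's own parsing code; the checks
are assumptions based on the column descriptions above).

```python
import pandas as pd

# Load the units sheet; columns follow the descriptions above.
units = pd.read_csv("config/units.tsv", sep="\t", dtype=str)

required = {"sample", "unit", "lib", "platform", "fq1", "fq2"}
missing = required - set(units.columns)
if missing:
    raise ValueError(f"units.tsv is missing columns: {sorted(missing)}")

# Each sample/unit combination should point at a single set of fastq files.
if units.duplicated(subset=["sample", "unit"]).any():
    raise ValueError("each sample/unit combination should appear only once")
```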

Note, as I have not implemented multiple libraries per sample, I haven't put
the `LB` read group in. This will be added in the future when multiple library
support is added.

## Configuration file

`config.yaml` contains the configuration for the workflow, this is where you
2 changes: 1 addition & 1 deletion config/units.tsv
@@ -1 +1 @@
sample unit platform fq1 fq2
sample unit lib platform fq1 fq2
6 changes: 0 additions & 6 deletions workflow/envs/fastp.yaml

This file was deleted.

6 changes: 0 additions & 6 deletions workflow/envs/gatk.yaml

This file was deleted.

8 changes: 0 additions & 8 deletions workflow/envs/mapping.yaml

This file was deleted.

6 changes: 0 additions & 6 deletions workflow/envs/picard.yaml

This file was deleted.

6 changes: 0 additions & 6 deletions workflow/envs/qualimap.yaml

This file was deleted.

34 changes: 14 additions & 20 deletions workflow/rules/0.1_ref_prep.smk
@@ -29,19 +29,22 @@ rule bwa_index:
    input:
        "results/ref/{ref}/{ref}.fa",
    output:
        multiext("results/ref/{ref}/{ref}.fa", ".amb", ".ann", ".bwt", ".pac", ".sa"),
        idx=multiext(
            "results/ref/{ref}/{ref}.fa",
            ".amb",
            ".ann",
            ".bwt",
            ".pac",
            ".sa",
        ),
    log:
        "logs/ref/bwa_index/{ref}.log",
    conda:
        "../envs/mapping.yaml"
    resources:
        runtime=120,
    benchmark:
        "benchmarks/ref/bwa_index/{ref}.log"
    shell:
        """
        bwa index {input} 2> {log}
        """
    wrapper:
        "v2.6.0/bio/bwa/index"


rule samtools_faidx:
@@ -52,14 +55,10 @@ rule samtools_faidx:
"results/ref/{ref}/{ref}.fa.fai",
log:
"logs/ref/samtools_faidx/{ref}.log",
conda:
"../envs/samtools.yaml"
benchmark:
"benchmarks/ref/samtools_faidx/{ref}.log"
shell:
"""
samtools faidx {input} 2> {log}
"""
wrapper:
"v2.4.0/bio/samtools/faidx"


rule ref_chunking:
@@ -88,12 +87,7 @@ rule picard_dict:
"results/ref/{ref}/{ref}.dict",
log:
"logs/ref/picard_dict/{ref}.log",
conda:
"../envs/picard.yaml"
benchmark:
"benchmarks/ref/picard_dict/{ref}.log"
shell:
r"""
picard CreateSequenceDictionary -Xmx{resources.mem_mb}m \
R={input} O={output} 2> {log}
"""
wrapper:
"v2.4.0/bio/picard/createsequencedictionary"