Allow removal of individuals from dataset when subsampling #40

Merged · 9 commits · Jun 24, 2024
1 change: 1 addition & 0 deletions .github/workflows/main.yaml
@@ -66,6 +66,7 @@ jobs:
 - name: Test workflow
   shell: bash -l {0}
   run: |
+    conda config --set channel_priority strict
     mkdir -p temp
     workdir=$(pwd -P)
     TMPDIR=$workdir/temp
7 changes: 5 additions & 2 deletions .test/config/config.yaml
@@ -76,11 +76,12 @@ subsample_dp: 2

 subsample_by: "sitefilt" # three options: unfilt, mapqbaseq, sitefilt; sitefilt recommended (subsamples bams to `subsample_dp` within filtered regions)
 redo_depth_filts: false # has no effect when `subsample_by` == "sitefilt"
+drop_samples: ["mod1"] # These samples won't be included in the downsampled dataset
 
 subsample_analyses:
   estimate_ld: true
   ld_decay: true
-  pca_pcangsd: true
+  pca_pcangsd: false
   admix_ngsadmix: true
   relatedness:
     ngsrelate: true
@@ -102,7 +103,7 @@ subsample_analyses:

 filter_beds:
   example: "config/example-filt.bed"
-  badexample: "config/badexample-filt.bed"
+  #badexample: "config/badexample-filt.bed"
 
 only_filter_beds: false

@@ -128,6 +129,8 @@ params:
     min_overlap_hist: 20
   bwa_aln:
     extra: "-l 16500 -n 0.01 -o 2"
+  samtools:
+    subsampling_seed: "$RANDOM"
   picard:
     MarkDuplicates: "--REMOVE_DUPLICATES true --OPTICAL_DUPLICATE_PIXEL_DISTANCE 2500"
   damageprofiler:
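A note on the new `subsampling_seed` option: `"$RANDOM"` is expanded by the shell each time the subsampling rule runs, so every run draws a fresh seed; substituting a fixed integer makes subsampling reproducible. A minimal sketch of how samtools combines the seed and keep-fraction into its `-s` argument (the values below are invented for illustration):

```python
# samtools view -s takes INT.FRAC, where INT is the seed and FRAC is the
# fraction of reads to keep. Invented example values:
seed = 42        # a fixed integer here, instead of "$RANDOM", is reproducible
fraction = 0.25  # keep 25% of reads

arg = f"-s {seed}{str(fraction).lstrip('0')}"  # fuse seed and fraction
print(arg)  # -> -s 42.25
```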
564 changes: 5 additions & 559 deletions config/README.md

Large diffs are not rendered by default.

3 changes: 3 additions & 0 deletions config/config.yaml
@@ -76,6 +76,7 @@ subsample_dp:

 subsample_by: "sitefilt" # three options: unfilt, mapqbaseq, sitefilt; sitefilt recommended (subsamples bams to `subsample_dp` within filtered regions)
 redo_depth_filts: false # has no effect when `subsample_by` == "sitefilt"
+drop_samples: [] # These samples won't be included in the downsampled dataset
 
 subsample_analyses:
   estimate_ld: false
@@ -127,6 +128,8 @@ params:
     min_overlap_hist: 20
   bwa_aln:
     extra: "-l 16500 -n 0.01 -o 2"
+  samtools:
+    subsampling_seed: "$RANDOM"
   picard:
     MarkDuplicates: "--REMOVE_DUPLICATES true --OPTICAL_DUPLICATE_PIXEL_DISTANCE 2500"
   damageprofiler:
15 changes: 12 additions & 3 deletions docs/config.md
@@ -177,7 +177,10 @@ to exclude them where necessary.
   be a list in [].
 - `excl_pca-admix:` Sample name(s) that will be excluded *only* from PCA and
   Admixture analyses. Useful for close relatives that violate the assumptions
-  of these analyses, but that you want in others. Should be a list in [].
+  of these analyses, but that you want in others. Should be a list in []. If you
+  want relatives excluded from all downstream analyses, not just PCA/Admix, put
+  them in `exclude_ind` instead. Note that this triggers a re-run of relatedness
+  analyses; as these have already been run, you can simply disable them first.
 
 #### Analysis Selection
 
@@ -249,8 +252,9 @@ settings for each analysis are set in the next section.
   setting here. (`true`/`false`)
 - `ld_decay:` Use ngsLD to plot LD decay curves for each population and for
   the dataset as a whole (`true`/`false`)
-- `pca_pcangsd:` Perform Principal Component Analysis with PCAngsd
-  (`true`/`false`)
+- `pca_pcangsd:` Perform Principal Component Analysis with PCAngsd. Currently
+  requires at least 4 samples to finish, as it will by default try to plot
+  PCs 1-4. (`true`/`false`)
 - `admix_ngsadmix:` Perform admixture analysis with NGSadmix (`true`/`false`)
 - `relatedness:` Can be performed multiple ways, set any combination of the
   three options below. Note that I've mostly incorporated these with the
@@ -353,6 +357,11 @@ for.
   the full depth bams will be used. If `subsample_by` is set to `"sitefilt"`
   this will have no effect, as the subsampling is already in reference to a set
   site list. (`true`/`false`)
+- `drop_samples`: When performing depth subsampling, you may want to leave out
+  samples that you kept in your 'full' dataset. Any samples listed here will be
+  removed from ALL depth-subsampled analyses. One use case is samples sequenced
+  below your target subsample depth that you don't want to include. (list of
+  strings: `[]`; see the sketch after this excerpt)
- `subsample_analyses:` Individually enable analyses to be performed with the
subsampled data. These are the same as the ones above in the analyses
section. Enabling here will only run the analysis for the subsampled data,
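Because a typo in `drop_samples` may silently fail to match any sample, it can be worth checking the list against your sample sheet before running. A standalone sanity-check sketch, not part of the pipeline; the sample-sheet path and `sample` column name are assumptions to adjust for your setup:

```python
import pandas as pd
import yaml

# Assumed locations/columns; adjust to your own configuration.
samples = pd.read_table("config/samples.tsv")["sample"].tolist()
with open("config/config.yaml") as f:
    config = yaml.safe_load(f)

unknown = [s for s in config.get("drop_samples", []) if s not in samples]
if unknown:
    raise SystemExit(f"drop_samples entries not found in sample sheet: {unknown}")
print("drop_samples OK")
```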
4 changes: 4 additions & 0 deletions docs/getting-started.md
@@ -2,6 +2,10 @@

 ## Tutorial
 
+!!! note
+    A tutorial is in progress, but not yet available. The pipeline can still be
+    used by following the rest of this guide.
+
 A tutorial is available with a small(ish) dataset where biologically meaningful
 results can be produced. This can help you get a feel for a good workflow using
 the different modules. You can also follow along with your own data and just
26 changes: 10 additions & 16 deletions docs/index.md
@@ -36,34 +36,28 @@ user-provided filter lists (e.g. to limit to neutral sites, genic regions,
 etc.).
 
 After samples have been processed, quality control reports produced, and the
-sites file has been produced, the pipeline can continue to the analyses. These
-are largely divided up into two categories - SNP based and allele frequency
-based.
-
-### SNP based
+sites file has been produced, the pipeline can continue to the analyses.
 
 - Linkage disequilibrium estimation, LD decay, LD pruning (ngsLD)
 - PCA (PCAngsd)
 - Admixture (NGSAdmix)
 - Inbreeding/Runs of Homozygosity (ngsF-HMM)
-- Relatedness (NGSRelate)
+- Relatedness (NGSRelate, IBSrelate)
 - Identity by state matrix (ANGSD)
-
-### Allele frequency based
-
-- Relatedness (IBSrelate)
 - Site frequency spectrum (ANGSD)
-- Watterson's θ, Nucleotide diversity (π), Tajima's D (ANGSD)
-- Individual heterozygosity (ANGSD)
+- Watterson's estimator ($θ_w$), Nucleotide diversity ($π$), Tajima's $D$
+  (ANGSD; see the definitions below)
+- Individual heterozygosity with bootstrapped confidence intervals (ANGSD)
 - Pairwise $F_{ST}$ (ANGSD)
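For reference, the standard definitions of the three diversity statistics above (an editorial aside; ANGSD estimates these from the site frequency spectrum rather than from called genotypes), with $S$ segregating sites, $n$ sequences, and $d_{ij}$ the number of differences between sequences $i$ and $j$:

```math
\hat{\theta}_W = \frac{S}{a_n}, \quad a_n = \sum_{i=1}^{n-1}\frac{1}{i}, \qquad
\hat{\pi} = \binom{n}{2}^{-1} \sum_{i<j} d_{ij}, \qquad
D = \frac{\hat{\pi} - \hat{\theta}_W}{\sqrt{\widehat{\operatorname{Var}}\!\left(\hat{\pi} - \hat{\theta}_W\right)}}
```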

 These all can be enabled and processed independently, and the pipeline will
-share input files across them, deleting temporary intermediate files when they
-are no longer needed.
+generate genotype likelihood input files using ANGSD and share them across
+analyses as appropriate, deleting temporary intermediate files when they are no
+longer needed.
 
 At any point after a successful completion of a portion of the pipeline, a
 report can be made that contains tables and figures summarizing the results
 for the currently enabled parts of the pipeline.
 
-If you're interested in using this, head to the [Getting Started](getting-started.md)
-page!
+If you're interested in using this, head to the
+[Getting Started](getting-started.md) page!
11 changes: 3 additions & 8 deletions environment.yaml
@@ -3,14 +3,9 @@ name: angsd-snakemake-pipeline
 channels:
   - conda-forge
   - bioconda
-  - defaults
 
 dependencies:
   # core packages for workflow
-  - snakemake-minimal =7.32.*
-  - python =3.11.*
-  - pandas =2.0.1
-  - mamba
-  - graphviz
-  - pygments
-  - pulp =2.7.*
+  - snakemake =7.32.*
+  - python =3.11
+  - mamba
16 changes: 14 additions & 2 deletions workflow/Snakefile
@@ -48,6 +48,14 @@ wildcard_constraints:
         + [i for i in samples.population.values.tolist()]
         + [i for i in samples.depth.values.tolist()]
     ),
+    population1="|".join(
+        [i for i in samples.index.tolist()]
+        + [i for i in samples.population.values.tolist()]
+    ),
+    population2="|".join(
+        [i for i in samples.index.tolist()]
+        + [i for i in samples.population.values.tolist()]
+    ),
     dp=".{0}|.dp[1-9][0-9]*",
     chunk="[0-9]+",
     sites="|".join(filters),
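The two added constraints mirror the existing `population` constraint so that rules with paired population wildcards (e.g. pairwise $F_{ST}$) can resolve both names in one filename. Snakemake wildcard constraints are ordinary regular expressions; a toy illustration of why anchoring each wildcard to the known names removes ambiguity (names below are invented):

```python
import re

# Build an alternation the same way the Snakefile does, from invented names.
names = ["ind1", "ind2", "popA", "popB"]
constraint = "|".join(names)

# Without constraints, a pattern like {population1}.{population2} could split
# many ways; anchoring each wildcard to known names makes the match unique.
pattern = re.compile(rf"(?P<population1>{constraint})\.(?P<population2>{constraint})")
m = pattern.fullmatch("popA.popB")
print(m.group("population1"), m.group("population2"))  # popA popB
```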
@@ -351,7 +359,9 @@ if config["subsample_analyses"]["relatedness"]["ibsrelate_ibs"]:
     )
 
 if config["subsample_analyses"]["pca_pcangsd"]:
-    if config["excl_pca-admix"]:
+    exclinds = config["excl_pca-admix"]
+    exclinds = [s for s in exclinds if s not in config["drop_samples"]]
+    if exclinds:
         all_outputs.extend(
             expand(
                 [
@@ -436,7 +446,9 @@ if config["subsample_analyses"]["inbreeding_ngsf-hmm"]:
     )
 
 if config["subsample_analyses"]["admix_ngsadmix"]:
-    if config["excl_pca-admix"]:
+    exclinds = config["excl_pca-admix"]
+    exclinds = [s for s in exclinds if s not in config["drop_samples"]]
+    if exclinds:
         all_outputs.extend(
             expand(
                 [
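The pattern repeated in both blocks above keeps the two exclusion mechanisms consistent: a sample already removed from the subsampled dataset via `drop_samples` must not also be requested as a PCA/Admix exclusion, otherwise the workflow would target `*_excl_pca-admix` outputs referencing samples that no longer exist. The same logic as a standalone sketch with invented values:

```python
# Invented config values; in the Snakefile these come from config.yaml.
config = {
    "excl_pca-admix": ["ind3", "ind7"],
    "drop_samples": ["ind7"],
}

# Only keep exclusions that still exist in the subsampled dataset.
exclinds = [s for s in config["excl_pca-admix"] if s not in config["drop_samples"]]

if exclinds:
    print("request *_excl_pca-admix outputs for:", exclinds)  # ['ind3']
else:
    print("no exclusions remain; request the standard outputs")
```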
2 changes: 1 addition & 1 deletion workflow/rules/0.2_ref_filt.smk
@@ -570,7 +570,7 @@ rule angsd_missdata:
     container:
         angsd_container
     params:
-        nind=lambda w: len(get_samples_from_pop(w.population)),
+        nind=get_nind,
         mindpind=config["params"]["angsd"]["mindepthind"],
         extra=config["params"]["angsd"]["extra"],
         mapQ=config["mapQ"],
7 changes: 3 additions & 4 deletions workflow/rules/2.0_mapping.smk
@@ -370,14 +370,13 @@
"logs/mapping/samtools/subsample/{dataset}_{sample}.{ref}{dp}.log",
benchmark:
"benchmarks/mapping/samtools/subsample/{dataset}_{sample}.{ref}{dp}.log"
wildcard_constraints:
dp=f".dp{config['subsample_dp']}",
conda:
"../envs/samtools.yaml"
shadow:
"minimal"
params:
dp=config["subsample_dp"],
dp=lambda w: w.dp.replace(".dp", ""),
seed=config["params"]["samtools"]["subsampling_seed"],
resources:
runtime=lambda wildcards, attempt: attempt * 720,
shell:
@@ -389,7 +388,7 @@ rule samtools_subsample:
         if [ `awk 'BEGIN {{print ('$prop' <= 1.0)}}'` = 1 ]; then
             propdec=$(echo $prop | awk -F "." '{{print $2}}')
             samtools view -h -F 4 -q 30 -@ {threads} -u {input.bam} |
-            samtools view -h -s 0.${{propdec}} -@ {threads} -b \
+            samtools view -h -s {params.seed}.${{propdec}} -@ {threads} -b \
                 > {output.bam} 2> {log}
             samtools index {output.bam} 2>> {log}
         else
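For context on the shell logic above (`$prop` is computed earlier in the rule, outside this hunk): the keep-proportion is the target depth divided by the sample's observed depth, and subsampling only happens when that proportion is at most 1. A sketch of the arithmetic with invented numbers:

```python
# Invented values: a sample sequenced to ~11.6x, subsampled to a 4x target.
observed_dp = 11.6
target_dp = 4

prop = target_dp / observed_dp  # ~0.3448: keep ~34.5% of reads
if prop <= 1.0:
    # Mirror the awk/propdec step: take the decimal part of the proportion
    frac = f"{prop:.4f}".split(".")[1]
    seed = 42  # stands in for {params.seed}
    print(f"samtools view -s {seed}.{frac} ...")  # samtools view -s 42.3448 ...
else:
    print("sample is already at or below target depth; use the full BAM")
```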
24 changes: 14 additions & 10 deletions workflow/rules/2.1_sample_qc.smk
@@ -112,14 +112,16 @@ rule compile_endo_cont:
     input:
         lambda w: expand(
             "results/datasets/{{dataset}}/qc/endogenous_content/{{dataset}}.{sample}.{{ref}}.endo",
-            sample=get_samples_from_pop("all"),
+            sample=get_popfile_inds(w),
         ),
     output:
-        "results/datasets/{dataset}/qc/endogenous_content/{dataset}.{ref}_all.endo.tsv",
+        "results/datasets/{dataset}/qc/endogenous_content/{dataset}.{ref}_{population}{dp}.endo.tsv",
+    wildcard_constraints:
+        population="all",
     log:
-        "logs/datasets/{dataset}/qc/endogenous_content/{dataset}.{ref}_compile-endocont.log",
+        "logs/datasets/{dataset}/qc/endogenous_content/{dataset}.{ref}_{population}{dp}_compile-endocont.log",
     benchmark:
-        "benchmarks/datasets/{dataset}/qc/endogenous_content/{dataset}.{ref}_compile-endocont.log"
+        "benchmarks/datasets/{dataset}/qc/endogenous_content/{dataset}.{ref}_{population}{dp}_compile-endocont.log"
     conda:
         "../envs/shell.yaml"
     resources:
@@ -271,19 +273,21 @@ rule merge_ind_depth:
     input:
         depth=lambda w: expand(
             "{{prefix}}{{dataset}}.{{ref}}_{sample}{{dp}}_{{group}}.depthSample",
-            sample=get_samples_from_pop("all"),
+            sample=get_popfile_inds(w),
         ),
         summary=lambda w: expand(
             "{{prefix}}{{dataset}}.{{ref}}_{sample}{{dp}}_{{group}}.depth.sum",
-            sample=get_samples_from_pop("all"),
+            sample=get_popfile_inds(w),
         ),
     output:
-        dep="{prefix}{dataset}.{ref}_all{dp}_{group}.depth",
-        sum="{prefix}{dataset}.{ref}_all{dp}_{group}.depth.sum",
+        dep="{prefix}{dataset}.{ref}_{population}{dp}_{group}.depth",
+        sum="{prefix}{dataset}.{ref}_{population}{dp}_{group}.depth.sum",
+    wildcard_constraints:
+        population="all",
     log:
-        "logs/merge_depth/{prefix}{dataset}.{ref}_all{dp}_{group}.log",
+        "logs/merge_depth/{prefix}{dataset}.{ref}_{population}{dp}_{group}.log",
     benchmark:
-        "benchmarks/merge_depth/{prefix}{dataset}.{ref}_all{dp}_{group}.log"
+        "benchmarks/merge_depth/{prefix}{dataset}.{ref}_{population}{dp}_{group}.log"
     conda:
         "../envs/shell.yaml"
     shell:
6 changes: 3 additions & 3 deletions workflow/rules/3.0_genotype_likelihoods.smk
@@ -31,14 +31,14 @@ rule popfile:
     are not propagated into downstream files.
     """
     output:
-        inds="results/datasets/{dataset}/poplists/{dataset}_{population}.indiv.list",
+        inds="results/datasets/{dataset}/poplists/{dataset}_{population}{dp}.indiv.list",
     log:
-        "logs/{dataset}/poplists/{dataset}_{population}_makelist.log",
+        "logs/{dataset}/poplists/{dataset}_{population}{dp}_makelist.log",
     conda:
         "../envs/python.yaml"
     params:
         samplelist=samples,
-        inds=lambda w: get_samples_from_pop(w.population),
+        inds=get_popfile_inds,
     script:
         "../scripts/make_popfile.py"
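The helper `get_popfile_inds` is defined elsewhere in the workflow and is not shown in this diff. Presumably it returns the individuals for a population while honoring `drop_samples` for subsampled outputs; a purely hypothetical sketch of such a helper, to convey the idea (the real signature and logic may differ):

```python
def get_popfile_inds(wildcards):
    # HYPOTHETICAL sketch -- the actual helper in the workflow may differ.
    # Return the individuals in a population, minus dropped samples when the
    # requested output is a subsampled (".dpN") one.
    inds = get_samples_from_pop(wildcards.population)
    if wildcards.dp:  # non-empty dp wildcard => subsampled dataset
        inds = [i for i in inds if i not in config["drop_samples"]]
    return inds
```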
2 changes: 1 addition & 1 deletion workflow/rules/3.2_beagles.smk
@@ -39,7 +39,7 @@ rule angsd_doGlf2:
         majmin=config["params"]["angsd"]["domajorminor"],
         counts=get_docounts,
         trans=get_trans,
-        nind=lambda w: len(get_samples_from_pop(w.population)),
+        nind=get_nind,
         minind=get_minind,
         mininddp=config["params"]["angsd"]["mindepthind"],
         out=lambda w, output: os.path.splitext(output.arg)[0],
18 changes: 10 additions & 8 deletions workflow/rules/5.0_relatedness.smk
@@ -98,7 +98,7 @@ rule ibsrelate:
     container:
         angsd_container
     params:
-        nind=lambda w: len(get_samples_from_pop(w.population)),
+        nind=get_nind,
         maxsites=config["chunk_size"],
         out=lambda w, output: os.path.splitext(output[0])[0],
     resources:
@@ -121,7 +121,7 @@ rule est_kinship_stats_ibs:
"results/datasets/{{dataset}}/analyses/kinship/ibsrelate_ibs/{{dataset}}.{{ref}}_{{population}}{{dp}}_chunk{chunk}_{{sites}}-filts.ibspair",
chunk=chunklist,
),
inds="results/datasets/{dataset}/poplists/{dataset}_all.indiv.list",
inds="results/datasets/{dataset}/poplists/{dataset}_{population}{dp}.indiv.list",
output:
"results/datasets/{dataset}/analyses/kinship/ibsrelate_ibs/{dataset}.{ref}_{population}{dp}_{sites}-filts.kinship",
log:
@@ -163,21 +163,23 @@ rule ngsrelate:
"""
input:
beagle=expand(
"results/datasets/{{dataset}}/beagles/pruned/{{dataset}}.{{ref}}_all{{dp}}_{{sites}}-filts.pruned_maxkbdist-{maxkb}_minr2-{r2}.beagle.gz",
"results/datasets/{{dataset}}/beagles/pruned/{{dataset}}.{{ref}}_{{population}}{{dp}}_{{sites}}-filts.pruned_maxkbdist-{maxkb}_minr2-{r2}.beagle.gz",
maxkb=config["params"]["ngsld"]["max_kb_dist_pruning_dataset"],
r2=config["params"]["ngsld"]["pruning_min-weight_dataset"],
),
inds="results/datasets/{dataset}/poplists/{dataset}_all.indiv.list",
inds="results/datasets/{dataset}/poplists/{dataset}_{population}{dp}.indiv.list",
output:
relate="results/datasets/{dataset}/analyses/kinship/ngsrelate/{dataset}.{ref}_all{dp}_{sites}-filts_relate.tsv",
samples="results/datasets/{dataset}/analyses/kinship/ngsrelate/{dataset}.{ref}_all{dp}_{sites}-filts_samples.list",
relate="results/datasets/{dataset}/analyses/kinship/ngsrelate/{dataset}.{ref}_{population}{dp}_{sites}-filts_relate.tsv",
samples="results/datasets/{dataset}/analyses/kinship/ngsrelate/{dataset}.{ref}_{population}{dp}_{sites}-filts_samples.list",
wildcard_constraints:
population="all",
log:
"logs/{dataset}/kinship/ngsrelate/{dataset}.{ref}_all{dp}_{sites}-filts.log",
"logs/{dataset}/kinship/ngsrelate/{dataset}.{ref}_{population}{dp}_{sites}-filts.log",
container:
ngsrelate_container
threads: lambda wildcards, attempt: attempt * 4
params:
nind=lambda w: len(get_samples_from_pop("all")),
nind=get_nind,
resources:
runtime=lambda wildcards, attempt: attempt * 360,
shell:
Expand Down
12 changes: 7 additions & 5 deletions workflow/rules/6.0_pca.smk
@@ -8,13 +8,15 @@ rule remove_excl_pca_admix:
     results), while allowing them in all other analyses.
     """
     input:
-        "results/datasets/{dataset}/beagles/pruned/{dataset}.{ref}_all{dp}_{sites}-filts.pruned_maxkbdist-{maxkb}_minr2-{r2}.beagle.gz",
+        "results/datasets/{dataset}/beagles/pruned/{dataset}.{ref}_{population}{dp}_{sites}-filts.pruned_maxkbdist-{maxkb}_minr2-{r2}.beagle.gz",
     output:
-        "results/datasets/{dataset}/beagles/pruned/{dataset}.{ref}_all_excl_pca-admix{dp}_{sites}-filts.pruned_maxkbdist-{maxkb}_minr2-{r2}.beagle.gz",
+        "results/datasets/{dataset}/beagles/pruned/{dataset}.{ref}_{population}_excl_pca-admix{dp}_{sites}-filts.pruned_maxkbdist-{maxkb}_minr2-{r2}.beagle.gz",
+    wildcard_constraints:
+        population="all",
     log:
-        "logs/{dataset}/ngsLD/excl_pca_admix_beagle/{dataset}.{ref}_all_excl_pca-admix{dp}_{sites}-filts.pruned_maxkbdist-{maxkb}_minr2-{r2}.log",
+        "logs/{dataset}/ngsLD/excl_pca_admix_beagle/{dataset}.{ref}_{population}_excl_pca-admix{dp}_{sites}-filts.pruned_maxkbdist-{maxkb}_minr2-{r2}.log",
     benchmark:
-        "benchmarks/{dataset}/ngsLD/excl_pca_admix_beagle/{dataset}.{ref}_all_excl_pca-admix{dp}_{sites}-filts.pruned_maxkbdist-{maxkb}_minr2-{r2}.log"
+        "benchmarks/{dataset}/ngsLD/excl_pca_admix_beagle/{dataset}.{ref}_{population}_excl_pca-admix{dp}_{sites}-filts.pruned_maxkbdist-{maxkb}_minr2-{r2}.log"
     conda:
         "../envs/shell.yaml"
     params:
@@ -60,7 +62,7 @@ rule plot_pca:
"""
input:
"results/datasets/{dataset}/analyses/pcangsd/{dataset}.{ref}_{population}{dp}_{sites}-filts.cov",
"results/datasets/{dataset}/poplists/{dataset}_{population}.indiv.list",
"results/datasets/{dataset}/poplists/{dataset}_{population}{dp}.indiv.list",
output:
report(
"results/datasets/{dataset}/plots/pca/{dataset}.{ref}_{population}{dp}_{sites}-filts_pc{xpc}-{ypc}.svg",
Expand Down