zjnolen · zjnolen · Jan 31, 2024 · Jan 31, 2024 · Jan 31, 2024
diff --git a/.github/workflows/main.yaml b/.github/workflows/main.yaml
@@ -65,6 +65,7 @@ jobs:
         mkdir -p temp
         workdir=$(pwd -P)
         TMPDIR=$workdir/temp
+        snakemake --directory .test --configfile .test/config/config.yaml --config samples=config/samples_testSRA.tsv --use-conda --show-failed-logs --cores 2 --conda-cleanup-pkgs cache --use-singularity --default-resources mem_mb=1000 --until fastp_mergedout
         snakemake --directory .test --configfile .test/config/config.yaml --use-conda --show-failed-logs --cores 2 --conda-cleanup-pkgs cache --use-singularity --default-resources mem_mb=1000
     - name: Test report
       shell: bash -l {0}

diff --git a/.test/config/samples_testSRA.tsv b/.test/config/samples_testSRA.tsv
@@ -0,0 +1,8 @@
+sample	population	time	depth
+hist1	Hjelmseryd	historical	low
+hist2	Hjelmseryd	historical	low
+#hist3	Hjelmseryd	historical	low
+mod1	Gotafors	modern	high
+mod2	Gotafors	modern	high
+#mod3	Gotafors	modern	high
+SAMN13218652	Gotafors	modern	high
diff --git a/.test/config/units.tsv b/.test/config/units.tsv
@@ -1,8 +1,9 @@
-sample	unit	lib	platform	fq1	fq2	bam	realname
-hist1	BHVN22DSX2.2	hist1	ILLUMINA	data/fastq/hist1.r1.fastq.gz	data/fastq/hist1.r2.fastq.gz		MZLU153040
-hist1	BHVN22DSX2.3	hist1	ILLUMINA	data/fastq/hist1.unit2.r1.fastq.gz	data/fastq/hist1.unit2.r2.fastq.gz		MZLU153040
-hist2	BHVN22DSX2.2	hist2	ILLUMINA	data/fastq/hist2.r1.fastq.gz	data/fastq/hist2.r2.fastq.gz		MZLU153042
-hist3	BHVN22DSX2.2	hist2	ILLUMINA	data/fastq/hist3.r1.fastq.gz	data/fastq/hist3.r2.fastq.gz		MZLU153045
-mod1	AHW5NGDSX2.3	mod1	ILLUMINA	data/fastq/mod1.r1.fastq.gz	data/fastq/mod1.r2.fastq.gz		MZLU107434
-mod2	AHW5NGDSX2.3	mod2	ILLUMINA	data/fastq/mod2.r1.fastq.gz	data/fastq/mod2.r2.fastq.gz	data/bam/mod2.bam	MZLU107435
-mod3	AHW5NGDSX2.3	mod3	ILLUMINA	data/fastq/mod3.r1.fastq.gz	data/fastq/mod3.r2.fastq.gz		MZLU107436
+sample	unit	lib	platform	fq1	fq2	bam	sra	realname
+hist1	BHVN22DSX2.2	hist1	ILLUMINA	data/fastq/hist1.r1.fastq.gz	data/fastq/hist1.r2.fastq.gz			MZLU153040
+hist1	BHVN22DSX2.3	hist1	ILLUMINA	data/fastq/hist1.unit2.r1.fastq.gz	data/fastq/hist1.unit2.r2.fastq.gz			MZLU153040
+hist2	BHVN22DSX2.2	hist2	ILLUMINA	data/fastq/hist2.r1.fastq.gz	data/fastq/hist2.r2.fastq.gz			MZLU153042
+hist3	BHVN22DSX2.2	hist2	ILLUMINA	data/fastq/hist3.r1.fastq.gz	data/fastq/hist3.r2.fastq.gz			MZLU153045
+mod1	AHW5NGDSX2.3	mod1	ILLUMINA	data/fastq/mod1.r1.fastq.gz	data/fastq/mod1.r2.fastq.gz			MZLU107434
+mod2	AHW5NGDSX2.3	mod2	ILLUMINA	data/fastq/mod2.r1.fastq.gz	data/fastq/mod2.r2.fastq.gz	data/bam/mod2.bam		MZLU107435
+mod3	AHW5NGDSX2.3	mod3	ILLUMINA	data/fastq/mod3.r1.fastq.gz	data/fastq/mod3.r2.fastq.gz			MZLU107436
+SAMN13218652	SRR10398077	SAMN13218652	ILLUMINA				SRR10398077	SAMN13218652 from NCBI, for testing NCBI downloading. Is a random, smallish paired end run fastq
diff --git a/README.md b/README.md
@@ -11,18 +11,21 @@ population structure, genetic diversity, genetic differentiation, allele
 frequencies, linkage disequilibrium, and more.
 
 The workflow is designed with two entry points in mind. Users with raw
-sequencing data in FASTQ format can perform raw sequencing data alignment and
-processing, followed up by the population genomic analyses. Alternatively, users
-who have already mapped and processed their reads into BAM files can use these
-to start directly at the population genomic analyses (for instance, if you
-bring BAM files from GenErode, nf-core/eager, or your own custom processing).
+sequencing data in FASTQ format stored locally or as an NCBI/ENA run accession
+can perform raw sequencing data alignment and processing, followed up by the
+population genomic analyses. Alternatively, users who have already mapped and
+processed their reads into BAM files can use these to start directly at the
+population genomic analyses (for instance, if you bring BAM files from
+GenErode, nf-core/eager, or your own custom processing).
 
 ## Features
 
 ### Sequence data processing
 
 If starting with raw sequencing data in FASTQ format, the pipeline will handle
 mapping and processing of the reads, with options speific for historical DNA.
+These steps are available if providing paths to local FASTQ files or SRA
+accessions from e.g. NCBI/ENA.
 
 - Trimming and collapsing of overlapping paired end reads with fastp
 - Mapping of reads using bwa mem

diff --git a/config/README.md b/config/README.md
@@ -33,8 +33,9 @@ only characters permitted in filenames on your system.
 
 All your raw data will be pointed to in `units.tsv`.
 
-Each sample must have five columns filled in the units sheet. The columns are
-tab separated:
+Each row will contain a sample 'unit', this is a unique combination of a
+sample, sequencing run, and library. As such, these columns, as well as a
+fourth for the sequencing platform are required:
 
 - `sample` - The sample name, same as in `samples.tsv`.
 - `unit` - This describes the sequencing platform unit. The expected format is
@@ -46,9 +47,21 @@ tab separated:
   the same value in `lib`, but different in `unit`.
 - `platform` - This is used to fill out the `PL` read group. Put what you'd
   want there. Usually `ILLUMINA` for Illumina platforms.
+
+Additionally, you have three ways to specify sequencing data sources. You can
+provide fastq files available locally on your machine, an SRA run accession to
+automatically download fastq files from NCBI, or a fully processed bam file.
+Only one of these three categories of columns need to be defined. If multiple
+are, the pipeline will prefer bam files > local fastq files > SRA accessions.
+
 - `fq1` and `fq2` - The absolute or relative paths from the working directory
   to the raw fastq files for the sample. Currently the pipeline only supports
-  paired-end sequencing, single end may be added down the line.
+  paired-end sequencing, so both columns are neede. Single end may be added
+  down the line if requested.
+- `sra` - A short read run accession. Since NCBI and ENA mirror each other, the
+  run accession can come from either. Only supports paired-end runs. These are
+  treated as temporary and deleted after trimming, unlike local fastq files,
+  which we never delete.
 - `bam` - If you do not want to map the raw reads, provide a pre-processed
   BAM file path here. Samples with a BAM file may only appear once in the units
   list, so multiple sequencing runs should be merged beforehand. If a BAM file

diff --git a/config/units.tsv b/config/units.tsv
@@ -1 +1 @@
-sample	unit	lib	platform	fq1	fq2	bam
+sample	unit	lib	platform	fq1	fq2	bam	sra
diff --git a/workflow/rules/1.0_preprocessing.smk b/workflow/rules/1.0_preprocessing.smk
@@ -2,6 +2,19 @@
 # samples only) with fastp
 
 
+rule ncbi_download:
+    output:
+        temp("results/downloaded_fastq/{accession}_1.fastq.gz"),
+        temp("results/downloaded_fastq/{accession}_2.fastq.gz"),
+    log:
+        "logs/download_fastq/{accession}.log",
+    params:
+        extra="--skip-technical -x",
+    threads: 6
+    wrapper:
+        "v3.3.6/bio/sra-tools/fasterq-dump"
+
+
 rule fastp_mergedout:
     """Process reads with fastp, collapsing overlapping read pairs"""
     input:

diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk
@@ -144,13 +144,28 @@ if any(config["filter_beds"].values()):
 
 ## Get fastq inputs for fastp
 def get_raw_fastq(wildcards):
-    unit = units.loc[
-        (units["sample"] == wildcards.sample)
-        & (units["unit"] == wildcards.unit)
-        & (units["lib"] == wildcards.lib),
-        ["sample", "fq1", "fq2"],
-    ].set_index("sample")
-    return {"sample": [unit.fq1[0], unit.fq2[0]]}
+    if "fq1" in units:
+        unit = units.loc[
+            (units["sample"] == wildcards.sample)
+            & (units["unit"] == wildcards.unit)
+            & (units["lib"] == wildcards.lib),
+            ["sample", "fq1", "fq2"],
+        ].set_index("sample")
+        if not pd.isna(unit.fq1[0]):
+            return {"sample": [unit.fq1[0], unit.fq2[0]]}
+    if "sra" in units:
+        sra = units[
+            (units["sample"] == wildcards.sample)
+            & (units["unit"] == wildcards.unit)
+            & (units["lib"] == wildcards.lib)
+        ].sra.item()
+        if not pd.isna(sra):
+            return {
+                "sample": [
+                    f"results/downloaded_fastq/{sra}_1.fastq.gz",
+                    f"results/downloaded_fastq/{sra}_2.fastq.gz",
+                ]
+            }
 
 
 ## Get minimum overlap to collapse read pairs per sample
Original file line number	Diff line number	Diff line change
		@@ -1 +1 @@
		sample unit lib platform fq1 fq2 bam
		sample unit lib platform fq1 fq2 bam sra