diff --git a/.github/workflows/main.yaml b/.github/workflows/main.yaml index 2e469b19..ea342c2f 100644 --- a/.github/workflows/main.yaml +++ b/.github/workflows/main.yaml @@ -65,6 +65,7 @@ jobs: mkdir -p temp workdir=$(pwd -P) TMPDIR=$workdir/temp + snakemake --directory .test --configfile .test/config/config.yaml --config samples=config/samples_testSRA.tsv --use-conda --show-failed-logs --cores 2 --conda-cleanup-pkgs cache --use-singularity --default-resources mem_mb=1000 --until fastp_mergedout snakemake --directory .test --configfile .test/config/config.yaml --use-conda --show-failed-logs --cores 2 --conda-cleanup-pkgs cache --use-singularity --default-resources mem_mb=1000 - name: Test report shell: bash -l {0} diff --git a/.test/config/samples_testSRA.tsv b/.test/config/samples_testSRA.tsv new file mode 100644 index 00000000..0751c614 --- /dev/null +++ b/.test/config/samples_testSRA.tsv @@ -0,0 +1,8 @@ +sample population time depth +hist1 Hjelmseryd historical low +hist2 Hjelmseryd historical low +#hist3 Hjelmseryd historical low +mod1 Gotafors modern high +mod2 Gotafors modern high +#mod3 Gotafors modern high +SAMN13218652 Gotafors modern high \ No newline at end of file diff --git a/.test/config/units.tsv b/.test/config/units.tsv index 010a5f8d..121db330 100644 --- a/.test/config/units.tsv +++ b/.test/config/units.tsv @@ -1,8 +1,9 @@ -sample unit lib platform fq1 fq2 bam realname -hist1 BHVN22DSX2.2 hist1 ILLUMINA data/fastq/hist1.r1.fastq.gz data/fastq/hist1.r2.fastq.gz MZLU153040 -hist1 BHVN22DSX2.3 hist1 ILLUMINA data/fastq/hist1.unit2.r1.fastq.gz data/fastq/hist1.unit2.r2.fastq.gz MZLU153040 -hist2 BHVN22DSX2.2 hist2 ILLUMINA data/fastq/hist2.r1.fastq.gz data/fastq/hist2.r2.fastq.gz MZLU153042 -hist3 BHVN22DSX2.2 hist2 ILLUMINA data/fastq/hist3.r1.fastq.gz data/fastq/hist3.r2.fastq.gz MZLU153045 -mod1 AHW5NGDSX2.3 mod1 ILLUMINA data/fastq/mod1.r1.fastq.gz data/fastq/mod1.r2.fastq.gz MZLU107434 -mod2 AHW5NGDSX2.3 mod2 ILLUMINA data/fastq/mod2.r1.fastq.gz data/fastq/mod2.r2.fastq.gz data/bam/mod2.bam MZLU107435 -mod3 AHW5NGDSX2.3 mod3 ILLUMINA data/fastq/mod3.r1.fastq.gz data/fastq/mod3.r2.fastq.gz MZLU107436 \ No newline at end of file +sample unit lib platform fq1 fq2 bam sra realname +hist1 BHVN22DSX2.2 hist1 ILLUMINA data/fastq/hist1.r1.fastq.gz data/fastq/hist1.r2.fastq.gz MZLU153040 +hist1 BHVN22DSX2.3 hist1 ILLUMINA data/fastq/hist1.unit2.r1.fastq.gz data/fastq/hist1.unit2.r2.fastq.gz MZLU153040 +hist2 BHVN22DSX2.2 hist2 ILLUMINA data/fastq/hist2.r1.fastq.gz data/fastq/hist2.r2.fastq.gz MZLU153042 +hist3 BHVN22DSX2.2 hist2 ILLUMINA data/fastq/hist3.r1.fastq.gz data/fastq/hist3.r2.fastq.gz MZLU153045 +mod1 AHW5NGDSX2.3 mod1 ILLUMINA data/fastq/mod1.r1.fastq.gz data/fastq/mod1.r2.fastq.gz MZLU107434 +mod2 AHW5NGDSX2.3 mod2 ILLUMINA data/fastq/mod2.r1.fastq.gz data/fastq/mod2.r2.fastq.gz data/bam/mod2.bam MZLU107435 +mod3 AHW5NGDSX2.3 mod3 ILLUMINA data/fastq/mod3.r1.fastq.gz data/fastq/mod3.r2.fastq.gz MZLU107436 +SAMN13218652 SRR10398077 SAMN13218652 ILLUMINA SRR10398077 SAMN13218652 from NCBI, for testing NCBI downloading. Is a random, smallish paired end run fastq \ No newline at end of file diff --git a/README.md b/README.md index da681309..21588d6e 100644 --- a/README.md +++ b/README.md @@ -11,11 +11,12 @@ population structure, genetic diversity, genetic differentiation, allele frequencies, linkage disequilibrium, and more. The workflow is designed with two entry points in mind. Users with raw -sequencing data in FASTQ format can perform raw sequencing data alignment and -processing, followed up by the population genomic analyses. Alternatively, users -who have already mapped and processed their reads into BAM files can use these -to start directly at the population genomic analyses (for instance, if you -bring BAM files from GenErode, nf-core/eager, or your own custom processing). +sequencing data in FASTQ format stored locally or as an NCBI/ENA run accession +can perform raw sequencing data alignment and processing, followed up by the +population genomic analyses. Alternatively, users who have already mapped and +processed their reads into BAM files can use these to start directly at the +population genomic analyses (for instance, if you bring BAM files from +GenErode, nf-core/eager, or your own custom processing). ## Features @@ -23,6 +24,8 @@ bring BAM files from GenErode, nf-core/eager, or your own custom processing). If starting with raw sequencing data in FASTQ format, the pipeline will handle mapping and processing of the reads, with options speific for historical DNA. +These steps are available if providing paths to local FASTQ files or SRA +accessions from e.g. NCBI/ENA. - Trimming and collapsing of overlapping paired end reads with fastp - Mapping of reads using bwa mem diff --git a/config/README.md b/config/README.md index c2de3d32..2ef16210 100644 --- a/config/README.md +++ b/config/README.md @@ -33,8 +33,9 @@ only characters permitted in filenames on your system. All your raw data will be pointed to in `units.tsv`. -Each sample must have five columns filled in the units sheet. The columns are -tab separated: +Each row will contain a sample 'unit', this is a unique combination of a +sample, sequencing run, and library. As such, these columns, as well as a +fourth for the sequencing platform are required: - `sample` - The sample name, same as in `samples.tsv`. - `unit` - This describes the sequencing platform unit. The expected format is @@ -46,9 +47,21 @@ tab separated: the same value in `lib`, but different in `unit`. - `platform` - This is used to fill out the `PL` read group. Put what you'd want there. Usually `ILLUMINA` for Illumina platforms. + +Additionally, you have three ways to specify sequencing data sources. You can +provide fastq files available locally on your machine, an SRA run accession to +automatically download fastq files from NCBI, or a fully processed bam file. +Only one of these three categories of columns need to be defined. If multiple +are, the pipeline will prefer bam files > local fastq files > SRA accessions. + - `fq1` and `fq2` - The absolute or relative paths from the working directory to the raw fastq files for the sample. Currently the pipeline only supports - paired-end sequencing, single end may be added down the line. + paired-end sequencing, so both columns are neede. Single end may be added + down the line if requested. +- `sra` - A short read run accession. Since NCBI and ENA mirror each other, the + run accession can come from either. Only supports paired-end runs. These are + treated as temporary and deleted after trimming, unlike local fastq files, + which we never delete. - `bam` - If you do not want to map the raw reads, provide a pre-processed BAM file path here. Samples with a BAM file may only appear once in the units list, so multiple sequencing runs should be merged beforehand. If a BAM file diff --git a/config/units.tsv b/config/units.tsv index 19cbe287..0903081a 100644 --- a/config/units.tsv +++ b/config/units.tsv @@ -1 +1 @@ -sample unit lib platform fq1 fq2 bam \ No newline at end of file +sample unit lib platform fq1 fq2 bam sra \ No newline at end of file diff --git a/workflow/rules/1.0_preprocessing.smk b/workflow/rules/1.0_preprocessing.smk index 0d023678..da6e378f 100644 --- a/workflow/rules/1.0_preprocessing.smk +++ b/workflow/rules/1.0_preprocessing.smk @@ -2,6 +2,19 @@ # samples only) with fastp +rule ncbi_download: + output: + temp("results/downloaded_fastq/{accession}_1.fastq.gz"), + temp("results/downloaded_fastq/{accession}_2.fastq.gz"), + log: + "logs/download_fastq/{accession}.log", + params: + extra="--skip-technical -x", + threads: 6 + wrapper: + "v3.3.6/bio/sra-tools/fasterq-dump" + + rule fastp_mergedout: """Process reads with fastp, collapsing overlapping read pairs""" input: diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index 7f8887be..9b34d291 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -144,13 +144,28 @@ if any(config["filter_beds"].values()): ## Get fastq inputs for fastp def get_raw_fastq(wildcards): - unit = units.loc[ - (units["sample"] == wildcards.sample) - & (units["unit"] == wildcards.unit) - & (units["lib"] == wildcards.lib), - ["sample", "fq1", "fq2"], - ].set_index("sample") - return {"sample": [unit.fq1[0], unit.fq2[0]]} + if "fq1" in units: + unit = units.loc[ + (units["sample"] == wildcards.sample) + & (units["unit"] == wildcards.unit) + & (units["lib"] == wildcards.lib), + ["sample", "fq1", "fq2"], + ].set_index("sample") + if not pd.isna(unit.fq1[0]): + return {"sample": [unit.fq1[0], unit.fq2[0]]} + if "sra" in units: + sra = units[ + (units["sample"] == wildcards.sample) + & (units["unit"] == wildcards.unit) + & (units["lib"] == wildcards.lib) + ].sra.item() + if not pd.isna(sra): + return { + "sample": [ + f"results/downloaded_fastq/{sra}_1.fastq.gz", + f"results/downloaded_fastq/{sra}_2.fastq.gz", + ] + } ## Get minimum overlap to collapse read pairs per sample