Skip to content

Commit

Permalink
separated sqlite and download rules
Browse files Browse the repository at this point in the history
  • Loading branch information
ktmeaton committed Sep 20, 2020
1 parent 6ac0dd7 commit 30318da
Show file tree
Hide file tree
Showing 5 changed files with 178 additions and 30 deletions.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ cd plague-phylogeography
```bash
conda install -c conda-forge mamba
mamba env create -f environment.yaml
conda activate plague-phylogeography-0.1.5dev
```

## Contributing
Expand Down
138 changes: 113 additions & 25 deletions Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -14,50 +14,124 @@ import os # Path manipulation

from snakemake.remote.FTP import RemoteProvider as FTPRemoteProvider
FTP = FTPRemoteProvider()
import sqlite3

configfile: "config/snakemake.yaml"

# Overview
# 1) Fetch FTP urls for reference, assemblies, SRA samples from ncbimeta
# 2) Download samples as fna (reference, assembly) and fastq (SRA)
# 3) Get masking coordinates from reference, build SnpEff database
# 4) Pre-process fastq files with eager
# 5) Align fna files with snippy, align bam files with snippy
# -----------------------------------------------------------------------------#
# Help and Usage #
# -----------------------------------------------------------------------------#

rule help:
    """
    Print list of all targets with help.
    """
    run:
        # Introspect the parsed workflow and print one entry per rule.
        # NOTE(review): rule._input/_output/_params are private snakemake
        # attributes — may break across snakemake versions; verify on upgrade.
        for rule in workflow.rules:
            print("rule: ", rule.name )
            print(rule.docstring )
            # Only print sections that are non-empty for this rule
            if rule._input: print("input: ", rule._input)
            if rule._output: print("output: ", rule._output)
            if rule._params: print("params: ", rule._params)
            print("")

# -----------------------------------------------------------------------------#
# Data Download #
# Setup #
# -----------------------------------------------------------------------------#

# Identify the name (prefix) of the reference genome
ref_local_name = os.path.splitext(os.path.splitext(os.path.basename(config["reference_genome_remote_fna"]))[0])[0]

'''
# Identify the url (prefix) of the target assemblies
asm_download_file = os.path.join(config["outdir"], "sqlite_import", "assembly_download.txt")
with open(asm_download_file) as temp_file:
asm_download_url = [line.rstrip() for line in temp_file]
asm_local_name = [os.path.splitext(os.path.splitext(os.path.basename(url))[0])[0] for url in asm_download_url]
'''
# -----------------------------------------------------------------------------#
# Master Target #
# -----------------------------------------------------------------------------#

rule all:
input:
expand("snakeTest/download_reference/{reference}.fna", reference=ref_local_name),
expand("snakeTest/download_assembly/{assembly}.fna", assembly=asm_local_name)
expand("snakeTest/download_reference/{reference}.fna", reference=ref_local_name)

# -----------------------------------------------------------------------------#
# Database Import #
# -----------------------------------------------------------------------------#

# -----------------------------------------------------------------------------#
rule sqlite_import_reference:
"""
Import Reference genome url from database.
"""
params:
sql_command = config["sqlite_select_command_ref"],
input:
db = "{outdir}/ncbimeta_db/" + config["sqlite_db"]
output:
ref_txt = "{outdir}/sqlite_import/download_reference.txt"
run:
conn = sqlite3.connect(input.db)
cur = conn.cursor()
# Reference Genome URLs
ref_url = cur.execute(params.sql_command).fetchone()[0]
ref_fna_file = ref_url + "/" + ref_url.split("/")[9] + "_genomic.fna.gz"
with open(output.ref_txt, "w") as temp_file:
temp_file.write(ref_fna_file)
cur.close()
# -----------------------------------------------------------------------------#
rule sqlite_import_assembly:
"""
Import Assembly genome url from database.
"""
params:
sql_command = config["sqlite_select_command_asm"],
max_assembly = config["max_datasets_assembly"]
input:
db = "{outdir}/ncbimeta_db/" + config["sqlite_db"]
output:
asm_txt = "{outdir}/sqlite_import/download_assembly.txt"
run:
conn = sqlite3.connect(input.db)
cur = conn.cursor()
# Assembled Genome URLs
asm_fna_urls = cur.execute(params.sql_command).fetchall()
asm_fna_list = []
for url_list in asm_fna_urls:
for url in url_list[0].split(";"):
if url:
fna_gz_file = url + "/" + url.split("/")[9] + "_genomic.fna.gz"
asm_fna_list.append(fna_gz_file)
# Filter based on max number of assemblies for analysis
asm_fna_list = asm_fna_list[0:params.max_assembly]
with open(output.asm_txt, "w") as temp_file:
[temp_file.write(url + "\n") for url in asm_fna_list]
cur.close()
# -----------------------------------------------------------------------------#
rule sqlite_import_sra:
"""
Import SRA accessions from database.
"""
params:
sql_command = config["sqlite_select_command_sra"],
organism = config["organism"],
max_sra = config["max_datasets_sra"],
sqlite_eager_script = os.path.join(config["script_dir"],"sqlite_EAGER_tsv.py")
input:
db = "{outdir}/ncbimeta_db/" + config["sqlite_db"]
output:
asm_txt = "{outdir}/sqlite_import/metadata_sra_eager.tsv"
run:
shell("{params.sqlite_eager_script} \
--database {input.db} \
--query \"{params.sql_command}\" \
--organism \"{params.organism}\" \
--max-datasets {params.max_sra} \
--output {wildcards.outdir}/sqlite_import/metadata_sra_eager.tsv \
--fastq-dir {wildcards.outdir}/sra_download/")

# -----------------------------------------------------------------------------#
# Data Download #
# -----------------------------------------------------------------------------#
rule download_fna:
"""
Download fasta files, by seaching for sample name matches.
"""
input:
"{outdir}/sqlite_import/assembly_download.txt",
"{outdir}/sqlite_import/reference_download.txt"
"{outdir}/sqlite_import/{download_dir}.txt"
output:
"{outdir}/{dir}/{sample}.fna"
"{outdir}/{download_dir}/{sample}.fna"
run:
for file in input:
with open(file) as temp_file:
Expand All @@ -66,5 +140,19 @@ rule download_fna:
match = file_parse[0]
shell("wget --quiet -O - {match} | gunzip -c > {output}")

#shell("echo {wildcards.sample} {input} {output}")
#shell("grep '{wildcards.sample}' {input} > {output}")
# -----------------------------------------------------------------------------#
# Help and Usage #
# -----------------------------------------------------------------------------#

rule help:
"""
Print list of all targets with help.
"""
run:
for rule in workflow.rules:
print("rule: ", rule.name )
print(rule.docstring )
if rule._input: print("input: ", rule._input)
if rule._output: print("output: ", rule._output)
if rule._params: print("params: ", rule._params)
print("")
53 changes: 53 additions & 0 deletions Snakefile.bak
Original file line number Diff line number Diff line change
Expand Up @@ -150,3 +150,56 @@ rule eager:
" --bam_mapping_quality_threshold {config[snippy_map_qual]}"
" --bam_discard_unmapped"
" --bam_unmapped_type discard"

# Parse-time computation of local (decompressed) reference file paths.
# Pattern for each format: take the remote URL's basename, strip the
# trailing ".gz", and anchor the result under <outdir>/reference_download/.
# Reference fasta
ref_local_fna_gz = os.path.basename(config["reference_genome_remote_fna"])
ref_local_fna = os.path.splitext(ref_local_fna_gz)[0]
ref_local_fna_out = os.path.join(config["outdir"], "reference_download", ref_local_fna)
# Reference gb
ref_local_gb_gz = os.path.basename(config["reference_genome_remote_gb"])
ref_local_gb = os.path.splitext(ref_local_gb_gz)[0]
ref_local_gb_out = os.path.join(config["outdir"], "reference_download", ref_local_gb)
# Reference gff
ref_local_gff_gz = os.path.basename(config["reference_genome_remote_gff"])
ref_local_gff = os.path.splitext(ref_local_gff_gz)[0]
ref_local_gff_out = os.path.join(config["outdir"], "reference_download", ref_local_gff)

rule reference_download:
    """
    Download the reference genome of interest from the NCBI FTP site.

    Fetches the gzipped fna/gb/gff via snakemake's FTP remote provider
    (keep_local=True retains the downloaded .gz copies) and decompresses
    each into the reference_download output directory.
    """
    input:
        fna_gz = FTP.remote(config["reference_genome_remote_fna"], keep_local=True),
        gb_gz = FTP.remote(config["reference_genome_remote_gb"], keep_local=True),
        gff_gz = FTP.remote(config["reference_genome_remote_gff"], keep_local=True)
    output:
        fna = ref_local_fna_out,
        gb = ref_local_gb_out,
        gff = ref_local_gff_out
    # gunzip -c streams to stdout so the .gz inputs are left in place
    shell:
        "gunzip -c {input.fna_gz} > {output.fna}; "
        "gunzip -c {input.gb_gz} > {output.gb}; "
        "gunzip -c {input.gff_gz} > {output.gff}; "

# -----------------------------------------------------------------------------#
# Assembly Download #
# -----------------------------------------------------------------------------#

# Load the URLs for assembly_download
assembly_download_input = os.path.join(config["outdir"],"sqlite_import/assembly_download.txt")
with open(assembly_download_input) as temp_file:
    assembly_download_urls = [line.rstrip() for line in temp_file]
# Local compressed filenames (e.g. "GCA_xxx_genomic.fna.gz") and their
# decompressed ".fna" targets. The original left a dangling
# "assembly_download_gz = os.path." (syntax error) and never defined
# assembly_download_fna, which rule assembly_download below iterates over.
assembly_download_gz = [os.path.basename(url) for url in assembly_download_urls]
assembly_download_fna = [os.path.splitext(gz)[0] for gz in assembly_download_gz]

rule assembly_download:
"""
Download genomic assembly fasta files using FTP.
"""
input:
txt = assembly_download_input
run:
with open(str(input.txt)) as temp_file:
assembly_download_urls = [line.rstrip() for line in temp_file]
for url,fna in zip(assembly_download_urls, assembly_download_fna):
shell("wget --no-clobber --quiet --output-file {fna} {url}")
15 changes: 10 additions & 5 deletions config/snakemake.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,19 @@

# General
outdir : snakeTest
script_dir : scripts

# Conda Environments
conda_eager_env : "nf-core-eager-2.2.0dev"

# SQLITE Parameters
sqlite_db : "yersinia_pestis_db.sqlite"
sqlite_select_command_asm : "SELECT AssemblyFTPGenbank FROM Master WHERE (BioSampleComment LIKE '%KEEP%Assembly%')"
sqlite_select_command_sra : "SELECT BioSampleAccession,SRARunAccession,SRALibraryLayout,SRAFileURL FROM Master WHERE (BioSampleComment LIKE '%KEEP: EAGER Ancient%')"
sqlite_select_command_ref : "SELECT AssemblyFTPGenbank FROM Master WHERE (BioSampleComment LIKE '%Reference%')"
max_datasets_assembly : 3
max_datasets_sra : 2

# Reference genome download
#reference_genome_fna_ftp : "ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/009/065/GCA_000009065.1_ASM906v1/GCA_000009065.1_ASM906v1_genomic.fna.gz"
#reference_genome_gb_ftp : "ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/009/065/GCA_000009065.1_ASM906v1/GCA_000009065.1_ASM906v1_genomic.gbff.gz"
Expand All @@ -24,13 +33,9 @@ eager_rev : "7b51863957"
eager_clip_readlength : 35
eager_bwaalnn : 0.01
eager_bwaalnl : 16
organism : "Yersinia pestis"

# Snippy Parameters
snippy_ctg_depth : 10
snippy_bam_depth : 3
snippy_map_qual : 30

# Testing
samples:
A: data/samples/A.fastq
B: data/samples/B.fastq
1 change: 1 addition & 0 deletions environment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ channels:
- anaconda
dependencies:
- python=3.7
- sqlite=3.31.1
- snakemake=5.23.0
- samtools=1.10
- bwa=0.7.17
Expand Down

0 comments on commit 30318da

Please sign in to comment.