Skip to content

Commit

Permalink
Remove chromosome definition from config.
Browse files Browse the repository at this point in the history
Add resource definitions to all rules.
Define VEP version and fasta paths more centrally
  • Loading branch information
Nicolai-vKuegelgen committed Aug 26, 2024
1 parent 20382a9 commit c251147
Show file tree
Hide file tree
Showing 12 changed files with 65 additions and 43 deletions.
1 change: 1 addition & 0 deletions stemcnv_check/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,4 @@
from stemcnv_check.version import __version__

STEM_CNV_CHECK = 'stemcnv_check'
VEP_version = 112
6 changes: 3 additions & 3 deletions stemcnv_check/app/make_staticdata.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from snakemake.api import SnakemakeApi
from snakemake.settings.types import ResourceSettings, ConfigSettings, DeploymentSettings, DAGSettings, OutputSettings, DeploymentMethod
from .check_input import check_config
from .. import STEM_CNV_CHECK
from .. import STEM_CNV_CHECK, VEP_version
from ..helpers import config_extract, make_apptainer_args, read_sample_table, get_cache_dir, get_vep_cache_path, load_config
from loguru import logger as logging

Expand Down Expand Up @@ -74,7 +74,7 @@ def create_missing_staticdata(args):
genome_fasta = config['global_settings'][f'{genome_build}_genome_fasta']
if genome_fasta == '__use-vep__':
genome_fasta = os.path.join(vep_cache_path, 'fasta',
'homo_sapiens', f'112_{vep_genome}',
'homo_sapiens', f'{VEP_version}_{vep_genome}',
'Homo_sapiens.GRCh38.dna.toplevel.fa.gz' if vep_genome == 'GRCh38' else
'Homo_sapiens.GRCh37.75.dna.primary_assembly.fa.gz'
)
Expand Down Expand Up @@ -162,7 +162,7 @@ def create_missing_staticdata(args):
# 'configfile': args.config,
# 'target': 'SNP-probe-data',
'use_vep_cache': vep_cache_path,
'global_settings': {f'{genome_build}_genome_fasta': genome_fasta}
# 'global_settings': {f'{genome_build}_genome_fasta': genome_fasta}
}
),
deployment_settings=DeploymentSettings(
Expand Down
9 changes: 0 additions & 9 deletions stemcnv_check/app/run_workflow.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,14 +36,6 @@ def run_stemcnv_check_workflow(args):

# Define / overwrite place-holder values for VEP downloaded data
vep_cache_path = get_vep_cache_path(config['settings']['VEP_annotation']['VEP_cache_path'], cache_path)
hg19_fasta = config['global_settings']['hg19_genome_fasta'].replace(
'__use-vep__',
os.path.join(vep_cache_path, 'fasta', 'homo_sapiens', '112_GRCh37', 'Homo_sapiens.GRCh37.75.dna.primary_assembly.fa.gz')
)
hg38_fasta = config['global_settings']['hg38_genome_fasta'].replace(
'__use-vep__',
os.path.join(vep_cache_path, 'fasta', 'homo_sapiens', '112_GRCh38', 'Homo_sapiens.GRCh38.dna.toplevel.fa.gz')
)

basedir = args.directory if args.directory else os.getcwd()
argv += [
Expand All @@ -54,7 +46,6 @@ def run_stemcnv_check_workflow(args):
f'configfile={args.config}',
f'target={args.target}',
f'use_vep_cache={vep_cache_path}',
f"global_settings='{{hg19_genome_fasta: {hg19_fasta}, hg38_genome_fasta: {hg38_fasta}}}'"
]
#FIXME: use a clearer local vs cluster submission
if args.cluster_profile:
Expand Down
31 changes: 17 additions & 14 deletions stemcnv_check/control_files/allowedvalues_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,22 +7,27 @@
# - filterset, filtersetdefault, sections, sectionsall & insamplesheet are special functions
# - everything else is treated as regex (needs to match the *full* value)
static_data:
bpm_manifest_file: str
csv_manifest_file: str
egt_cluster_file: str
genome_gtf_file: str
penncnv_pfb_file: str
penncnv_GCmodel_file: str
array_density_file: str
array_gaps_file: str
genomeInfo_file: str
bpm_manifest_file: str # Path|file
csv_manifest_file: str # Path|file
egt_cluster_file: str # Path|file
genome_gtf_file: str # Path|file
penncnv_pfb_file: str # Path|file
penncnv_GCmodel_file: str # Path|file
array_density_file: str # Path|file
array_gaps_file: str # Path|file
genomeInfo_file: str # Path|file

genome_version: str_(hg38|GRCh38|hg19|GRCh37)
array_name: str

raw_data_folder: str
data_path: str
log_path: str
raw_data_folder: str # Path|dir
data_path: str # Path|dir|no-exist-ok
log_path: str # Path|dir|no-exist-ok

global_settings:
cache_dir: str # Path|dir|no-exist-ok
hg19_genome_fasta: str # Path|__use-vep__
hg38_genome_fasta: str # Path|__use-vep__

settings:
CNV.calling.tools: list__str__(PennCNV|CBS)
Expand All @@ -32,8 +37,6 @@ settings:
GenCallScore: float_le1_ge0
Position.duplicates: str_(keep|remove|highest-GenCall|highest-GenTrain)

chromosomes: list__str__(chr)?[0-9XY]+

default-filter-set: filtersetnodefault

PennCNV:
Expand Down
1 change: 1 addition & 0 deletions stemcnv_check/envs/vep-annotation.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,5 @@ channels:
- nodefaults
dependencies:
- bcftools
# This version needs to match VEP_version defined in stemcnv_check/__init__.py
- ensembl-vep = 112
10 changes: 3 additions & 7 deletions stemcnv_check/rules/StemCNV-check.smk
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,7 @@ from loguru import logger as logging
import tempfile
import ruamel.yaml as ruamel_yaml
from stemcnv_check import STEM_CNV_CHECK
from stemcnv_check.helpers import (
read_sample_table,
collect_SNP_cluster_ids,
)
from stemcnv_check.helpers import read_sample_table
from stemcnv_check.exceptions import SampleConstraintError, ConfigValueError

SNAKEDIR = str(importlib.resources.files(STEM_CNV_CHECK))
Expand Down Expand Up @@ -62,7 +59,6 @@ wildcard_constraints:

# Never submit these to cluster
localrules:
relink_gencall,
all,


Expand Down Expand Up @@ -170,9 +166,9 @@ rule run_CBS:
mem_mb=get_tool_resource("CBS", "memory"),
partition=get_tool_resource("CBS", "partition"),
params:
# SDundo = config['settings']['CBS']['SDundo'],
# filter=get_tool_filter_settings('CBS'),
# Ensure rerun on changes to settings or sample meta data
settings=config["settings"]["CBS"],
sex_info=lambda wildcards: get_ref_id(wildcards,True),
log:
err=os.path.join(LOGPATH, "CBS", "{sample_id}", "error.log"),
out=os.path.join(LOGPATH, "CBS", "{sample_id}", "out.log"),
Expand Down
16 changes: 13 additions & 3 deletions stemcnv_check/rules/common.smk
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import importlib.resources
import os
from pathlib import Path
from stemcnv_check import STEM_CNV_CHECK
from stemcnv_check import STEM_CNV_CHECK, VEP_version
from stemcnv_check.helpers import config_extract
from stemcnv_check.exceptions import SampleConstraintError

Expand Down Expand Up @@ -94,10 +94,20 @@ def get_genome_fasta(wildcards):
# #FIXME: future
# chip = get_sample_info(wildcards.sample_id)['array_name']
# genome = config['array_definitions'][chip]['genome_version']

if config["genome_version"] in ("hg38", "GRCh38"):
return config["global_settings"]["hg38_genome_fasta"]
out = config["global_settings"]["hg38_genome_fasta"]
else:
return config["global_settings"]["hg19_genome_fasta"]
out = config["global_settings"]["hg19_genome_fasta"]

if out == '__use-vep__':
vep_cache_path = config['use_vep_cache']
if config["genome_version"] in ("hg38", "GRCh38"):
return os.path.join(vep_cache_path, 'fasta', 'homo_sapiens', f'{VEP_version}_GRCh38', 'Homo_sapiens.GRCh38.dna.toplevel.fa.gz')
else:
return os.path.join(vep_cache_path, 'fasta', 'homo_sapiens', f'{VEP_version}_GRCh37', 'Homo_sapiens.GRCh37.75.dna.primary_assembly.fa.gz')
else:
return out


def cnv_vcf_input_function(tool):
Expand Down
3 changes: 3 additions & 0 deletions stemcnv_check/rules/illumina_raw_processing.smk
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
import os

localrules:
relink_gencall,

rule run_gencall:
input:
bpm=config["static_data"]["bpm_manifest_file"],
Expand Down
12 changes: 12 additions & 0 deletions stemcnv_check/rules/penncnv.smk
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
import os

# Never submit these to cluster
localrules:
prep_PennCNV_sexfile,


rule prep_PennCNV_sexfile:
input:
Expand Down Expand Up @@ -30,6 +34,10 @@ rule prep_PennCNV_input:
tsv=temp(os.path.join(DATAPATH, "{sample_id}", "{sample_id}.penncnv.input.tsv")),
log:
os.path.join(LOGPATH, "PennCNV", "{sample_id}", "input.log"),
resources:
runtime=get_tool_resource("PennCNV", "runtime"),
mem_mb=get_tool_resource("PennCNV", "memory"),
partition=get_tool_resource("PennCNV", "partition"),
conda:
"../envs/vembrane.yaml"
params:
Expand Down Expand Up @@ -168,6 +176,10 @@ rule combined_PennCNV_output:
# stats=os.path.join(DATAPATH,"{sample_id}","{sample_id}.CNV_calls.penncnv.stats.tsv")
log:
err=os.path.join(LOGPATH, "PennCNV", "{sample_id}", "combine.error.log"),
resources:
runtime=get_tool_resource("PennCNV", "runtime"),
mem_mb=get_tool_resource("PennCNV", "memory"),
partition=get_tool_resource("PennCNV", "partition"),
conda:
"../envs/general-R.yaml"
script:
Expand Down
6 changes: 5 additions & 1 deletion stemcnv_check/rules/report_generation.smk
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import os
from stemcnv_check.helpers import config_extract
from stemcnv_check.helpers import config_extract, collect_SNP_cluster_ids

def get_report_sample_input(wildcards):
sample_id, ref_id, sex, ref_sex = get_ref_id(wildcards, True)
Expand Down Expand Up @@ -61,6 +61,10 @@ def get_report_sample_input(wildcards):
rule check_latex_installation:
output:
os.path.join(LOGPATH, "report", "_latex_installation_check"),
resources:
runtime=get_tool_resource("default", "runtime"),
mem_mb=get_tool_resource("default", "memory"),
partition=get_tool_resource("default", "partition"),
conda:
"../envs/general-R.yaml"
shell:
Expand Down
6 changes: 3 additions & 3 deletions stemcnv_check/rules/staticdata_creation.smk
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ import importlib.resources
import os
from pathlib import Path
import tempfile
from stemcnv_check import STEM_CNV_CHECK
from stemcnv_check import STEM_CNV_CHECK, VEP_version

DOWNLOAD_DIR = config["TMPDIR"] if "TMPDIR" in config else tempfile.mkdtemp()
GENOME = config["genome"]
Expand Down Expand Up @@ -269,7 +269,7 @@ def get_vep_fasta_path():
)
return os.path.join(config["vep_fasta_path"],
"homo_sapiens",
"112_{genome}",
f"{VEP_version}_{{genome}}",
filename
)

Expand All @@ -288,7 +288,7 @@ rule download_vep_cache:
output:
done=os.path.join(config["vep_cache_path"], ".{genome}.done"),
folder=directory(
os.path.join(config["vep_cache_path"], "homo_sapiens", "112_{genome}")
os.path.join(config["vep_cache_path"], "homo_sapiens", f"{VEP_version}_{{genome}}")
),
conda:
"../envs/vep-annotation.yaml"
Expand Down
7 changes: 4 additions & 3 deletions tests/test_app_check_input.py
Original file line number Diff line number Diff line change
Expand Up @@ -172,8 +172,9 @@ def update_config(testconfig):

# Check for Error on entry outside specifications:
del testconfig['unknown_entry']
testconfig['settings'] = dict()
# - wrong type
testconfig['settings'] = {'chromosomes': '1-22'}
testconfig['settings']['VEP_annotation'] = {'enabled': 'True'}
# - wrong number type (float vs int)
testconfig['settings']['array_attribute_summary'] = {'density.windows': 1.5}
# - value not matching regex
Expand All @@ -184,8 +185,8 @@ def update_config(testconfig):
logrecords = caplog.records[-3:]
assert [rec.levelname for rec in logrecords] == ['ERROR'] * 3
assert [rec.message for rec in logrecords] == [
"The config entry '1-22' for 'settings:chromosomes' is invalid. " +
"Value(s) need to be in a list, and matching this regex: (chr)?[0-9XY]+.",
"The config entry 'True' for 'settings:VEP_annotation:enabled' is invalid. " +
"Value(s) need to be booleans (True/False).",
"The config entry '1.5' for 'settings:array_attribute_summary:density.windows' is invalid. " +
"Value(s) need to be integers (whole numbers).",
"The config entry '('PennCNV', 'GATK')' for 'settings:CNV.calling.tools' is invalid. " +
Expand Down

0 comments on commit c251147

Please sign in to comment.