From 77d15522dad17e5fdf23a64ea6ccb5a30e9186b1 Mon Sep 17 00:00:00 2001 From: Vadym Ivanchuk Date: Thu, 7 Apr 2022 08:29:47 +0200 Subject: [PATCH 1/9] updated extracted QC metrics for WGS --- BALSAMIC/assets/scripts/collect_qc_metrics.py | 15 +++++++---- BALSAMIC/constants/quality_check_reporting.py | 12 ++++++++- .../qc/multiqc_data/multiqc_data.json | 26 +++++++++++++++++++ 3 files changed, 47 insertions(+), 6 deletions(-) mode change 100644 => 100755 BALSAMIC/assets/scripts/collect_qc_metrics.py diff --git a/BALSAMIC/assets/scripts/collect_qc_metrics.py b/BALSAMIC/assets/scripts/collect_qc_metrics.py old mode 100644 new mode 100755 index 46377bd0d..dc5dfa474 --- a/BALSAMIC/assets/scripts/collect_qc_metrics.py +++ b/BALSAMIC/assets/scripts/collect_qc_metrics.py @@ -49,6 +49,7 @@ def collect_qc_metrics( def capture_kit_resolve_type(capture_kit: str): """Resolves the capture_kit type (NoneType or String)""" + if capture_kit == "None": return None else: @@ -67,9 +68,12 @@ def get_multiqc_data_source(multiqc_data: dict, sample: str, tool: str) -> str: A source file that was used to produce a specific metric """ - # Use case: splits multiqc_picard_dups into ['multiqc', 'picard', 'dup'] in order to retrieve the - # ["report_data_sources"]["Picard"]["DuplicationMetrics"] values from multiqc_data.json - subtool_name = tool[:-1].split("_") + if tool == "multiqc_general_stats": + subtool_name = ["multiqc", "FastQC", "all_sections"] + else: + # Use case: splits multiqc_picard_dups into ['multiqc', 'picard', 'dup'] in order to retrieve the + # ["report_data_sources"]["Picard"]["DuplicationMetrics"] values from multiqc_data.json + subtool_name = tool[:-1].split("_") # Nested json fetching for source_tool in multiqc_data["report_data_sources"]: @@ -139,7 +143,8 @@ def extract(data, output_metrics, sample=None, source=None): if isinstance(data, dict): for k in data: - if "umi" not in k: + # Ignore UMI and reverse reads metrics + if "umi" not in k and "R_2" not in str(sample): if 
k in requested_metrics: output_metrics.append( MetricModel( @@ -147,7 +152,7 @@ def extract(data, output_metrics, sample=None, source=None): input=get_multiqc_data_source( multiqc_data, sample, source ), - name=k, + name=k if "FastQC" not in k else "PERCENT_DUPLICATION", step=source, value=data[k], condition=requested_metrics[k]["condition"], diff --git a/BALSAMIC/constants/quality_check_reporting.py b/BALSAMIC/constants/quality_check_reporting.py index dba06df8e..15f634240 100644 --- a/BALSAMIC/constants/quality_check_reporting.py +++ b/BALSAMIC/constants/quality_check_reporting.py @@ -90,5 +90,15 @@ "FOLD_80_BASE_PENALTY": {"condition": {"norm": "lt", "threshold": 1.8}}, }, }, - "wgs": {"FOLD_80_BASE_PENALTY": {"condition": {"norm": "lt", "threshold": 1.8}}}, + "wgs": { + "MEAN_INSERT_SIZE": {"condition": None}, + "MEDIAN_COVERAGE": {"condition": None}, + "FastQC_mqc-generalstats-fastqc-percent_duplicates": {"condition": None}, + "PCT_5X": {"condition": None}, + "PCT_15X": {"condition": None}, + "PCT_30X": {"condition": None}, + "PCT_60X": {"condition": None}, + "PCT_100X": {"condition": None}, + "FOLD_80_BASE_PENALTY": {"condition": {"norm": "lt", "threshold": 1.8}}, + }, } diff --git a/tests/test_data/qc_files/analysis/qc/multiqc_data/multiqc_data.json b/tests/test_data/qc_files/analysis/qc/multiqc_data/multiqc_data.json index e8b15f614..b06d8b626 100755 --- a/tests/test_data/qc_files/analysis/qc/multiqc_data/multiqc_data.json +++ b/tests/test_data/qc_files/analysis/qc/multiqc_data/multiqc_data.json @@ -11,6 +11,14 @@ "DuplicationMetrics": { "concatenated_tumor_XXXXXX_R": "tests/test_data/qc_files/analysis/bam/concatenated_tumor_XXXXXX_R.sorted.mrkdup.txt" } + }, + "FastQC": { + "all_sections": { + "concatenated_tumor_XXXXXX_R_2": "tests/test_data/qc_files/analysis/fastqc/concatenated_tumor_XXXXXX_R_2_fastqc.zip", + "concatenated_normal_XXXXXX_R_1": "tests/test_data/qc_files/analysis/fastqc/concatenated_normal_XXXXXX_R_1_fastqc.zip", + 
"concatenated_normal_XXXXXX_R_2": "tests/test_data/qc_files/analysis/fastqc/concatenated_normal_XXXXXX_R_2_fastqc.zip", + "concatenated_tumor_XXXXXX_R_1": "tests/test_data/qc_files/analysis/fastqc/concatenated_tumor_XXXXXX_R_1_fastqc.zip" + } } }, "report_saved_raw_data": { @@ -76,6 +84,24 @@ "READ_PAIR_DUPLICATES": 18741892.0, "PERCENT_DUPLICATION": 0.391429 } + }, + "multiqc_general_stats": { + "concatenated_tumor_XXXXXX_R_2": { + "FastQC_mqc-generalstats-fastqc-percent_duplicates": 15.03521942842923, + "FastQC_mqc-generalstats-fastqc-total_sequences": 600529762.0 + }, + "concatenated_normal_XXXXXX_R_1": { + "FastQC_mqc-generalstats-fastqc-percent_duplicates": 14.426654287440797, + "FastQC_mqc-generalstats-fastqc-total_sequences": 464581551.0 + }, + "concatenated_normal_XXXXXX_R_2": { + "FastQC_mqc-generalstats-fastqc-percent_duplicates": 14.214689357571501, + "FastQC_mqc-generalstats-fastqc-total_sequences": 464581551.0 + }, + "concatenated_tumor_XXXXXX_R_1": { + "FastQC_mqc-generalstats-fastqc-percent_duplicates": 15.213739762327492, + "FastQC_mqc-generalstats-fastqc-total_sequences": 600529762.0 + } } } } From 86f298c9b5c313a19ef60e7adc23312ea67715fd Mon Sep 17 00:00:00 2001 From: Vadym Ivanchuk Date: Thu, 7 Apr 2022 08:35:05 +0200 Subject: [PATCH 2/9] changelog --- CHANGELOG.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 98841f9a4..67b876a62 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -17,6 +17,7 @@ Added: * Snakemake rule for creating `.cgh` files from `CNVkit` outputs #858 * SVdb to TGA workflow #871 * SVdb merge SV and CNV #871 +* WGS metrics to be extracted and stored to _metrics_deliverables.yaml file #907 Changed: ^^^^^^^^ From b471dd7dd7d9c5f16bcceca377938bd94416fead Mon Sep 17 00:00:00 2001 From: Vadym Ivanchuk Date: Thu, 7 Apr 2022 08:52:54 +0200 Subject: [PATCH 3/9] add validator --- BALSAMIC/assets/scripts/collect_qc_metrics.py | 2 +- BALSAMIC/utils/models.py | 9 +++++++++ 2 files changed, 10 
insertions(+), 1 deletion(-) diff --git a/BALSAMIC/assets/scripts/collect_qc_metrics.py b/BALSAMIC/assets/scripts/collect_qc_metrics.py index dc5dfa474..4243ebb7b 100755 --- a/BALSAMIC/assets/scripts/collect_qc_metrics.py +++ b/BALSAMIC/assets/scripts/collect_qc_metrics.py @@ -152,7 +152,7 @@ def extract(data, output_metrics, sample=None, source=None): input=get_multiqc_data_source( multiqc_data, sample, source ), - name=k if "FastQC" not in k else "PERCENT_DUPLICATION", + name=k, step=source, value=data[k], condition=requested_metrics[k]["condition"], diff --git a/BALSAMIC/utils/models.py b/BALSAMIC/utils/models.py index 9ee9ba8c1..37a38b702 100644 --- a/BALSAMIC/utils/models.py +++ b/BALSAMIC/utils/models.py @@ -739,6 +739,15 @@ class MetricModel(BaseModel): value: Any = ... condition: Optional[MetricConditionModel] = ... + @validator("name") + def check_(cls, value): + """Updates the name if the source is FastQC""" + + value = ( + value if "fastqc-percent_duplicates" not in value else "PERCENT_DUPLICATION" + ) + return value + class MetricValidationModel(BaseModel): """Defines the metric validation model From 45ae6a606801b0c2495d5613325fa3a47dac7803 Mon Sep 17 00:00:00 2001 From: Vadym Ivanchuk Date: Fri, 8 Apr 2022 15:01:59 +0200 Subject: [PATCH 4/9] percent duplication per read --- BALSAMIC/assets/scripts/collect_qc_metrics.py | 2 +- BALSAMIC/utils/models.py | 10 ++++------ 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/BALSAMIC/assets/scripts/collect_qc_metrics.py b/BALSAMIC/assets/scripts/collect_qc_metrics.py index 4243ebb7b..015735f7b 100755 --- a/BALSAMIC/assets/scripts/collect_qc_metrics.py +++ b/BALSAMIC/assets/scripts/collect_qc_metrics.py @@ -144,7 +144,7 @@ def extract(data, output_metrics, sample=None, source=None): if isinstance(data, dict): for k in data: # Ignore UMI and reverse reads metrics - if "umi" not in k and "R_2" not in str(sample): + if "umi" not in k: if k in requested_metrics: output_metrics.append( MetricModel( diff 
--git a/BALSAMIC/utils/models.py b/BALSAMIC/utils/models.py index 37a38b702..5fbd99b73 100644 --- a/BALSAMIC/utils/models.py +++ b/BALSAMIC/utils/models.py @@ -740,13 +740,11 @@ class MetricModel(BaseModel): condition: Optional[MetricConditionModel] = ... @validator("name") - def check_(cls, value): + def validate_name(cls, name, values): """Updates the name if the source is FastQC""" - value = ( - value if "fastqc-percent_duplicates" not in value else "PERCENT_DUPLICATION" - ) - return value + if "fastqc-percent_duplicates" in name: + return "PERCENT_DUPLICATION_R_" + values["input"].split("_")[-2] class MetricValidationModel(BaseModel): @@ -762,7 +760,7 @@ class MetricValidationModel(BaseModel): metrics: List[MetricModel] @validator("metrics", each_item=True) - def check_squares(cls, metric): + def validate_metrics(cls, metric): """Checks if a metric meets its filtering condition""" if metric.condition and not VALID_OPS[metric.condition.norm]( From 358ed1bbcd1aca8e6fbe126a029f2905a6fd8100 Mon Sep 17 00:00:00 2001 From: Vadym Ivanchuk Date: Fri, 8 Apr 2022 15:18:14 +0200 Subject: [PATCH 5/9] add validation return --- BALSAMIC/utils/models.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/BALSAMIC/utils/models.py b/BALSAMIC/utils/models.py index 5fbd99b73..7b4a55d66 100644 --- a/BALSAMIC/utils/models.py +++ b/BALSAMIC/utils/models.py @@ -746,6 +746,8 @@ def validate_name(cls, name, values): if "fastqc-percent_duplicates" in name: return "PERCENT_DUPLICATION_R_" + values["input"].split("_")[-2] + return name + class MetricValidationModel(BaseModel): """Defines the metric validation model From fa69e1f3fea556a93f639a679b66d49270f6d592 Mon Sep 17 00:00:00 2001 From: Vadym Ivanchuk Date: Fri, 8 Apr 2022 16:17:15 +0200 Subject: [PATCH 6/9] typo --- BALSAMIC/utils/models.py | 2 +- tests/scripts/test_collect_qc_metrics.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/BALSAMIC/utils/models.py b/BALSAMIC/utils/models.py index 7b4a55d66..b1642c7b9 
100644 --- a/BALSAMIC/utils/models.py +++ b/BALSAMIC/utils/models.py @@ -744,7 +744,7 @@ def validate_name(cls, name, values): """Updates the name if the source is FastQC""" if "fastqc-percent_duplicates" in name: - return "PERCENT_DUPLICATION_R_" + values["input"].split("_")[-2] + return "PERCENT_DUPLICATION_R" + values["input"].split("_")[-2] return name diff --git a/tests/scripts/test_collect_qc_metrics.py b/tests/scripts/test_collect_qc_metrics.py index 9b2cb7e16..570167b77 100644 --- a/tests/scripts/test_collect_qc_metrics.py +++ b/tests/scripts/test_collect_qc_metrics.py @@ -184,6 +184,8 @@ def test_collect_qc_metrics_wgs(tmp_path, multiqc_data_path, cli_runner): [str(output_path), multiqc_data_path, seq_type, capture_kit], ) + print(output_path) + # THEN check if the YAML is correctly created and there are no errors assert result.exit_code == 0 assert Path(output_path).exists() From 916a8a00b4578818706a7e167d4be2f5179eadd0 Mon Sep 17 00:00:00 2001 From: Vadym Ivanchuk Date: Fri, 8 Apr 2022 16:17:45 +0200 Subject: [PATCH 7/9] remove print --- tests/scripts/test_collect_qc_metrics.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/scripts/test_collect_qc_metrics.py b/tests/scripts/test_collect_qc_metrics.py index 570167b77..9b2cb7e16 100644 --- a/tests/scripts/test_collect_qc_metrics.py +++ b/tests/scripts/test_collect_qc_metrics.py @@ -184,8 +184,6 @@ def test_collect_qc_metrics_wgs(tmp_path, multiqc_data_path, cli_runner): [str(output_path), multiqc_data_path, seq_type, capture_kit], ) - print(output_path) - # THEN check if the YAML is correctly created and there are no errors assert result.exit_code == 0 assert Path(output_path).exists() From 380691444dd24ddf9d7d880e1774d1b907f6718e Mon Sep 17 00:00:00 2001 From: Vadym Ivanchuk Date: Mon, 11 Apr 2022 15:20:28 +0200 Subject: [PATCH 8/9] remove pct_5x --- BALSAMIC/constants/quality_check_reporting.py | 1 - 1 file changed, 1 deletion(-) diff --git a/BALSAMIC/constants/quality_check_reporting.py 
b/BALSAMIC/constants/quality_check_reporting.py index 15f634240..64b6a80ef 100644 --- a/BALSAMIC/constants/quality_check_reporting.py +++ b/BALSAMIC/constants/quality_check_reporting.py @@ -94,7 +94,6 @@ "MEAN_INSERT_SIZE": {"condition": None}, "MEDIAN_COVERAGE": {"condition": None}, "FastQC_mqc-generalstats-fastqc-percent_duplicates": {"condition": None}, - "PCT_5X": {"condition": None}, "PCT_15X": {"condition": None}, "PCT_30X": {"condition": None}, "PCT_60X": {"condition": None}, From a19323a926daf101ed1e246b7fe8459f321d3271 Mon Sep 17 00:00:00 2001 From: Vadym Ivanchuk Date: Mon, 11 Apr 2022 15:56:52 +0200 Subject: [PATCH 9/9] changelog --- CHANGELOG.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 67b876a62..66cbcac3a 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -17,7 +17,7 @@ Added: * Snakemake rule for creating `.cgh` files from `CNVkit` outputs #858 * SVdb to TGA workflow #871 * SVdb merge SV and CNV #871 -* WGS metrics to be extracted and stored to _metrics_deliverables.yaml file #907 +* Additional WGS metrics to be stored in ``_metrics_deliverables.yaml`` #907 Changed: ^^^^^^^^