bugfix: is_unexpected_replicate data missing in QC_Report.xlsx #304

Merged
merged 6 commits on Jul 9, 2024
2 changes: 1 addition & 1 deletion docs/sub_workflows/contamination.rst
@@ -27,6 +27,6 @@ Contamination Sub-workflow
The contamination sub-workflow.
This workflow will estimate contamination using verifyIDintensity on each sample individually.
It requires that you have GTC/IDAT files.
- It first pull B-allele frequencies from the 1000 Genomes VCF file.
+ It first pulls B-allele frequencies from the 1000 Genomes VCF file.
It then estimates contamination for each sample and aggregates these results.
Finally, it also estimates the per sample median IDAT intensity, which is used to filter contamination results in the :ref:`sample-qc`
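To make the B-allele frequency step more concrete, here is a minimal sketch of pulling a population AF field out of the 1000 Genomes VCF with pysam. It is not the workflow's actual `workflow/scripts/bpm2abf.py` code; the file path, the pysam usage, and the returned data structure are illustrative assumptions.

```python
# Hedged sketch only: read the chosen 1000 Genomes INFO field (e.g. "AF") as the
# B-allele frequency for each variant. Not the pipeline's real bpm2abf.py logic.
import pysam


def pull_b_allele_frequencies(vcf_path: str, population: str = "AF") -> dict:
    """Return {(chrom, pos, ref, alt): B-allele frequency} for one population field."""
    freqs = {}
    with pysam.VariantFile(vcf_path) as vcf:
        for record in vcf:
            values = record.info.get(population)  # one value per ALT allele
            if values is None:
                continue
            for alt, af in zip(record.alts, values):
                freqs[(record.chrom, record.pos, record.ref, alt)] = float(af)
    return freqs


if __name__ == "__main__":
    baf = pull_b_allele_frequencies("1000G.vcf.gz", population="AF")  # path is illustrative
    print(f"loaded {len(baf):,} B-allele frequencies")
```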
170 changes: 0 additions & 170 deletions poetryinstall.output

This file was deleted.

2 changes: 1 addition & 1 deletion src/cgr_gwas_qc/models/config/software_params.py
@@ -81,7 +81,7 @@ class SoftwareParams(BaseModel):
)
contam_population: str = Field(
"AF",
description="While population from the 1000 Genomes project to use for B-allele frequencies during contamination testing ."
description="Which population from the 1000 Genomes project to use for B-allele frequencies during contamination testing. "
"Can be one of {AF, EAS_AF, AMR_AF, AFR_AF, EUR_AF, SAS_AF}. "
"``workflow/scripts/bpm2abf.py``",
)
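As a stand-alone illustration of the allowed values listed in the description, a pydantic v1-style validator could restrict `contam_population` as in the sketch below; the real `SoftwareParams` model may validate this differently, and the class name and validator shown here are assumptions beyond what the diff shows.

```python
# Illustrative sketch, not the pipeline's actual validation: restrict
# contam_population to the documented 1000 Genomes population fields.
from pydantic import BaseModel, Field, validator

ALLOWED_POPULATIONS = {"AF", "EAS_AF", "AMR_AF", "AFR_AF", "EUR_AF", "SAS_AF"}


class ContaminationParams(BaseModel):
    contam_population: str = Field(
        "AF",
        description="Which 1000 Genomes population to use for B-allele frequencies.",
    )

    @validator("contam_population")
    def _check_population(cls, value: str) -> str:
        if value not in ALLOWED_POPULATIONS:
            raise ValueError(f"contam_population must be one of {sorted(ALLOWED_POPULATIONS)}")
        return value


print(ContaminationParams(contam_population="EUR_AF").contam_population)  # EUR_AF
```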
2 changes: 1 addition & 1 deletion src/cgr_gwas_qc/reporting/qc_exclusions.py
@@ -59,7 +59,7 @@ def _create_sample_exclusion_counts_table(df: pd.DataFrame) -> pd.DataFrame:
"contam_pass_cr": "Contaminated",
"internal_control_pass": "Internal QC Samples Removed",
"samples_remaining": "Samples Remaining for Analysis", # added below
"dropped_replicate": "Expected Duplicates Removed", # added below
"dropped_replicate": "Expected Replicates Removed", # added below
"is_subject_representative": "Subjects Remaining",
}
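As an illustration of how a label map like this can drive the exclusion counts table, the pandas sketch below counts flagged samples and applies the display names; the flag columns, their polarity, and the counting logic are assumptions for demonstration, not the module's actual implementation.

```python
# Demonstration only: turn boolean QC flags into a human-readable exclusion count
# table using a label map like the one above. Column polarity is an assumption.
import pandas as pd

LABELS = {
    "contam_pass_cr": "Contaminated",
    "internal_control_pass": "Internal QC Samples Removed",
    "dropped_replicate": "Expected Replicates Removed",
}

df = pd.DataFrame(
    {
        "contam_pass_cr": [True, True, False, True],         # False -> contaminated
        "internal_control_pass": [True, False, True, True],  # False -> internal control
        "dropped_replicate": [False, False, True, False],    # True -> dropped replicate
    }
)

counts = pd.Series(
    {
        LABELS["contam_pass_cr"]: int((~df["contam_pass_cr"]).sum()),
        LABELS["internal_control_pass"]: int((~df["internal_control_pass"]).sum()),
        LABELS["dropped_replicate"]: int(df["dropped_replicate"].sum()),
    },
    name="Sample Count",
)
print(counts)
```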

4 changes: 2 additions & 2 deletions src/cgr_gwas_qc/reporting/templates/qc_report.md.j2
@@ -116,7 +116,7 @@ NOTE: Samples that fail the sex concordance check may be eligible for recovery e
Next, we checked if two study subjects were taken from the same individual (i.e., biological replicates or tumor/normal sampling).
We consider subjects with pairwise concordance (proportion IBS2) > {{ config.software_params.dup_concordance_cutoff | toPct | numFormat(0) }}%, for a set of LD-pruned SNPs, as replicates.
In this project, we found {{ subject_qc.unexpected_replicates.num_unexpected_replicates | numFormat }} subjects that were unexpected replicates.
See the "SUBJECT_CONCORDANCE" tab of "{{ excel_file_name }}" for more details.
See the "SAMPLE_CONCORDANCE" tab of "{{ excel_file_name }}" for more details.

NOTE: Subjects with unexpected replicates identified may be eligible for recovery efforts, including confirmation of concordance by STR profiling and confirmation of subject information with the investigator. Final QC reports will have already addressed possible recovery avenues, and thus failures at this step will be included in final analytical exclusions.
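The replicate rule described in this section can be sketched in a few lines of pandas: pairs whose concordance exceeds the cutoff but that belong to different subjects are the unexpected replicates. The column names and the cutoff value below are illustrative assumptions, not the pipeline's exact ones.

```python
# Hedged sketch of the unexpected-replicate rule; column names and cutoff are illustrative.
import pandas as pd

DUP_CONCORDANCE_CUTOFF = 0.95  # the real value comes from config.software_params

pairs = pd.DataFrame(
    {
        "Subject_ID1": ["S001", "S002", "S003"],
        "Subject_ID2": ["S001", "S004", "S003"],
        "concordance": [0.99, 0.97, 0.80],  # proportion IBS2 over LD-pruned SNPs
    }
)

pairs["is_expected_replicate"] = pairs["Subject_ID1"] == pairs["Subject_ID2"]
pairs["is_unexpected_replicate"] = (
    pairs["concordance"] > DUP_CONCORDANCE_CUTOFF
) & ~pairs["is_expected_replicate"]

print(pairs.loc[pairs["is_unexpected_replicate"]])  # -> the S002/S004 pair
```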

@@ -130,7 +130,7 @@ A `PI_HAT` value > 0.2 is considered closely related (1st and 2nd degree).
Using a combination of HGMR and AGMR, each pair is classified as duplicate or identical twin (ID), parent-offspring (PO), full siblings (FS), 2nd degree (D2), 3rd degree (D3), or unrelated (UN) individuals.
[KING](https://people.virginia.edu/~wc9c/KING/) estimates pairwise kinship coefficients and assigns relationships ([described here](https://people.virginia.edu/~wc9c/KING/manual.html#WITHIN)) as duplicate or identical twin (ID), 1st degree (D1), 2nd degree (D2), 3rd degree (D3), and unrelated (UN) individuals.
This global view of relatedness can help identify related subjects across populations.
See the "SUBJECT_CONCORDANCE" tab of "{{ excel_file_name }}" for more details.
See the "SAMPLE_CONCORDANCE" tab of "{{ excel_file_name }}" for more details.
However, we also estimate relatedness and prune relatives on a per population basis (see below).
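For reference, the kinship bins KING reports can be sketched with the commonly cited cut-offs from the KING manual; this helper is only illustrative and is not the pipeline's classification code.

```python
# Illustrative binning of KING kinship coefficients using the manual's usual cut-offs.
def classify_king_kinship(kinship: float) -> str:
    if kinship > 0.354:
        return "ID"  # duplicate / identical twin
    if kinship > 0.177:
        return "D1"  # 1st degree
    if kinship > 0.0884:
        return "D2"  # 2nd degree
    if kinship > 0.0442:
        return "D3"  # 3rd degree
    return "UN"  # unrelated


print([classify_king_kinship(k) for k in (0.45, 0.25, 0.12, 0.06, 0.01)])
# ['ID', 'D1', 'D2', 'D3', 'UN']
```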

## Ancestry Assignment
11 changes: 6 additions & 5 deletions src/cgr_gwas_qc/workflow/scripts/qc_report_table.py
@@ -72,7 +72,7 @@ def main(
"IdatIntensity",
"Expected Replicate",
"Expected Replicate Discordance",
"is_unexpected_replicate",
"Unexpected Replicate",
"Sex Discordant",
"Expected_Sex",
"Predicted_Sex",
@@ -112,7 +112,7 @@ def _sample_qc(sample_sheet_csv: PathLike, sample_qc_csv: PathLike) -> pd.DataFr
"Ancestry2",
"Expected Replicate",
"Expected Replicate Discordance",
"is_unexpected_replicate",
"Unexpected Replicate",
"PLINK_PI_HAT",
"PLINK_concordance",
"PLINK_is_ge_pi_hat",
@@ -177,7 +177,7 @@ def _sample_concordance(sample_qc_csv: PathLike, sample_concordance_csv: PathLik
"Group_By_Subject_ID",
"Sample_ID",
"Case/Control_Status",
"is_unexpected_replicate",
"Unexpected Replicate",
"unexpected_replicate_ids",
"Expected_Sex",
"Predicted_Sex",
@@ -190,11 +190,12 @@ def _sample_concordance(sample_qc_csv: PathLike, sample_concordance_csv: PathLik
]


- def _subject_qc(sample_sheet_csv: PathLike, sample_qc_csv: PathLike) -> pd.DataFrame:
+ def _subject_qc(sample_sheet_csv: PathLike, subject_qc_csv: PathLike) -> pd.DataFrame:
ss = sample_sheet.read(sample_sheet_csv).rename(REPORT_NAME_MAPPER, axis=1)
_additional_columns = [x for x in ss.columns if x not in _SUBJECT_QC_COLUMNS]

return (
- subject_qc_table.read(sample_qc_csv)
+ subject_qc_table.read(subject_qc_csv)
.rename(REPORT_NAME_MAPPER, axis=1)
.merge(ss, on="Sample_ID", suffixes=["", "_DROP"])
.filter(regex="^(?!.*_DROP)")
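A hedged reconstruction of the bug this PR fixes: once `REPORT_NAME_MAPPER` renames the columns, a report column list that still uses the old `is_unexpected_replicate` name no longer matches anything, so the column drops out of QC_Report.xlsx. The tiny sketch below shows that behaviour with pandas; the exact selection mechanism (`reindex` here) is an assumption, not the report code's actual approach.

```python
# Hedged reconstruction, not the actual report code: a stale column name after a
# rename silently yields an empty (all-NaN) column when the list is reindexed.
import pandas as pd

REPORT_NAME_MAPPER = {"is_unexpected_replicate": "Unexpected Replicate"}

qc = pd.DataFrame({"Sample_ID": ["S1", "S2"], "is_unexpected_replicate": [False, True]})
renamed = qc.rename(REPORT_NAME_MAPPER, axis=1)

old_columns = ["Sample_ID", "is_unexpected_replicate"]  # pre-fix column list
new_columns = ["Sample_ID", "Unexpected Replicate"]     # post-fix column list

print(renamed.reindex(columns=old_columns))  # stale name -> all-NaN column
print(renamed.reindex(columns=new_columns))  # fixed name -> the real data
```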