d3b-center · ewafula · Dec 1, 2022 · Nov 3, 2022 · Nov 3, 2022 · Nov 3, 2022
@@ -31,7 +31,7 @@ To run this from the command line, use:
 Rscript -e "rmarkdown::render('01-frequencies-tables-checks.Rmd', clean = TRUE)"
 ```
 _This assumes you are in the analysis module directory of the repository,_
-_OpenPedCan-analysis/analyses/mutation-frequencies-tables-checks._
+_OpenPedCan-analysis/analyses/mtp-tables-qc-checks._
 
 
 ### Load packages
@@ -50,7 +50,7 @@ suppressPackageStartupMessages(library(openxlsx))
 ```{r set output directory}
 # directories for input and output files
 root_dir <- rprojroot::find_root(rprojroot::has_dir(".git"))
-analyses_dir <- file.path(root_dir, "analyses", "mutation-frequencies-table-checks")
+analyses_dir <- file.path(root_dir, "analyses", "mtp-tables-qc-checks")
 results_dir <- file.path(analyses_dir, "results")
 
 # Create results folder if it doesn't exist
@@ -110,7 +110,7 @@ get_num_samples <- function(freq_df) {
 
 # changes in common columns among MPT mutation frequencies  between 
 # current and previous tables
-changes_in_columns <- function(current_tables, previous_table, column_name) {
+changes_in_columns <- function(current_table, previous_table, column_name) {
   # values specific to current table
   specific_to_current <- current_table %>% 
   dplyr::select(Dataset, column_name) %>% 
@@ -266,7 +266,7 @@ all_cohorts_cancer_groups
 #### Check gene symbols
 ```{r gene symbols}
 changes_gene_symbols <- 
-  changes_in_columns(current_tables, previous_table, "Gene_symbol")
+  changes_in_columns(current_table, previous_table, "Gene_symbol")
 
 # write to file
 changes_gene_symbols %>% 
@@ -281,7 +281,7 @@ changes_gene_symbols
 #### Check Ensembl IDs
 ```{r ensembl ids}
 changes_ensembl_ids <- 
-  changes_in_columns(current_tables, previous_table, "targetFromSourceId")
+  changes_in_columns(current_table, previous_table, "targetFromSourceId")
 
 # write to file
 changes_ensembl_ids %>% 
@@ -296,7 +296,7 @@ changes_ensembl_ids
 #### Check cancer groups
 ```{r cancer groups}
 changes_cancer_groups <- 
-  changes_in_columns(current_tables, previous_table, "Disease")
+  changes_in_columns(current_table, previous_table, "Disease")
 
 # write to file
 changes_cancer_groups %>% 
@@ -311,7 +311,7 @@ changes_cancer_groups
 #### Check EFO IDs
 ```{r efo ids}
 changes_efo_ids <- 
-  changes_in_columns(current_tables, previous_table, "diseaseFromSourceMappedId")
+  changes_in_columns(current_table, previous_table, "diseaseFromSourceMappedId")
 
 # write to file
 changes_efo_ids %>% 
@@ -325,7 +325,7 @@ changes_efo_ids
 #### Check MONDO IDs
 ```{r mondo ids}
 changes_mondo_ids <- 
-  changes_in_columns(current_tables, previous_table, "MONDO")
+  changes_in_columns(current_table, previous_table, "MONDO")
 
 # write to file
 changes_mondo_ids %>% 

@@ -0,0 +1,303 @@
+---
+title: "Gene Expression TPM Table Summary and QC Checks"
+output:
+  html_notebook:
+    toc: TRUE
+    toc_float: TRUE
+    toc_depth: 4
+author: Aditya Lahiri and Eric Wafula
+date: 2022-11-07
+params:
+  current_table:
+    label: "current expression tpm table"
+    value: current/long_n_tpm_mean_sd_quantile_gene_wise_zscore.tsv.gz
+    input: file
+  previous_table:
+    label: "previous expression tpm table"
+    value: previous/long_n_tpm_mean_sd_quantile_gene_wise_zscore.tsv.gz
+    input: file
+---
+
+
+
+### Purpose
+Performs summary and QC checks comparing the current and the previous OpenPedCan 
+tpm tables.
+
+### Usage
+
+To run this from the command line, use:
+```
+Rscript -e "rmarkdown::render('02-tpm-tables-checks.Rmd', clean = TRUE)"
+```
+_This assumes you are in the analysis module directory of the repository,_
+_OpenPedCan-analysis/analyses/mtp-tables-qc-checks._
+
+
+
+### Load packages
+```{r load packages}
+# R packages
+suppressPackageStartupMessages(library(tidyverse))
+suppressPackageStartupMessages(library(data.table))
+suppressPackageStartupMessages(library(openxlsx))
+
+# Magrittr pipe
+`%>%` <- dplyr::`%>%`
+```
+
+
+### Set output directory
+```{r set output directory}
+# directories for input and output files
+root_dir <- rprojroot::find_root(rprojroot::has_dir(".git"))
+analyses_dir <- file.path(root_dir, "analyses", "mtp-tables-qc-checks")
+results_dir <- file.path(analyses_dir, "results")
+
+# Create results folder if it doesn't exist
+if (!dir.exists(results_dir)) {
+  dir.create(results_dir)
+}
+```
+
+### Functions
+```{r functions}
+# get number of samples in cohorts
+get_num_samples <- function(tpm_df) {
+  samples <- tpm_df %>% 
+    dplyr::filter(cohort != "All Cohorts") %>%
+    dplyr::select(cohort, Disease,
+                  n_samples) %>%
+    dplyr::distinct() %>% 
+    dplyr::select(-Disease) %>%
+    dplyr::group_by(cohort) %>% 
+    dplyr::summarise(n_samples = 
+                       sum(as.integer(n_samples, na.rm = TRUE)))
+    return(samples)
+}
+
+# changes in common columns among MPT mutation frequencies  between 
+# current and previous tables
+changes_in_columns <- function(current_table, previous_table, column_name) {
+  # values specific to current table
+  specific_to_current <- current_table %>% 
+  dplyr::select(cohort, column_name) %>% 
+  dplyr::distinct() %>% 
+    setdiff(previous_table %>% dplyr::select(cohort, column_name) %>%
+              dplyr::distinct()) %>% 
+    dplyr::rename(current_dataset = cohort)
+  # values specific to previous table
+  specific_to_previous <- previous_table %>% 
+    dplyr::select(cohort, column_name) %>% 
+    dplyr::distinct() %>% 
+    setdiff(current_table %>% dplyr::select(cohort, column_name) %>%
+              dplyr::distinct()) %>% 
+    dplyr::rename(previous_dataset = cohort)
+  # combine differences 
+  changes_in_columns <- specific_to_current %>%
+    dplyr::full_join(specific_to_previous, by = column_name) %>% 
+    dplyr::select(column_name, current_dataset, previous_dataset)
+  return(changes_in_columns)
+}
+```
+
+
+### Read in current and previous tpm files
+```{r load files}
+current_table <- data.table::fread(params$current_table, 
+                                   data.table = FALSE, 
+                                   showProgress = FALSE) %>% 
+  purrr::discard(~all(is.na(.))) 
+
+previous_table <- data.table::fread(params$previous_table, 
+                                    data.table = FALSE, 
+                                    showProgress = FALSE) %>% 
+  purrr::discard(~all(is.na(.))) 
+```
+
+
+### First 50 records from a static cancer group (GMKF Neuroblastoma) 
+Records from a static cancer_group should not change between the current and previous mutation frequencies tables. There should be no changes in computed mutation frequencies. But other fields in the table will likely change if annotations are updated. 
+
+#### Current table
+```{r current table}
+# ordered top 50 GMKF Neuroblastoma records from current tpm table
+gmkf_nbl_current <- current_table %>% 
+  dplyr::filter(cohort == "GMKF", Disease == "Neuroblastoma") %>% 
+  dplyr::arrange() %>% 
+  head(n = 50)
+
+# write to file
+gmkf_nbl_current %>% 
+  readr::write_tsv(file.path(results_dir, "current_group_static_cancer.tsv")) 
+
+# display      
+gmkf_nbl_current   
+```
+
+#### Previous table
+
+```{r previous table}
+# ordered top 50 GMKF Neuroblastoma records from previous tpm table
+gmkf_nbl_previous <- previous_table %>% 
+  dplyr::filter(cohort == "GMKF", Disease == "Neuroblastoma") %>% 
+  dplyr::arrange() %>% 
+  head(n = 50)
+
+# write to file
+gmkf_nbl_previous %>% 
+  readr::write_tsv(file.path(results_dir, "previous_group_static_cancer.tsv"))
+
+# display   
+gmkf_nbl_previous  
+```
+
+
+
+### Number of samples in each cohort 
+```{r number of samples}
+# get number of samples in each cohort
+num_samples <- get_num_samples(current_table) %>% 
+  dplyr::rename(samples_in_current = n_samples) %>% 
+  dplyr::full_join(get_num_samples(previous_table) %>% 
+                    dplyr::rename(samples_in_previous = n_samples), by = "cohort") %>% 
+  dplyr::select(cohort, samples_in_current, samples_in_previous)
+
+# write to file
+num_samples %>% 
+  readr::write_tsv(file.path(results_dir, "number_of_samples.tsv"), na = "NA")
+
+# display   
+num_samples
+```
+
+
+
+### Cancer groups in the "All Cohorts" category
+```{r all cohorts cancer groups}
+# get All Cohorts cancer groups
+all_cohorts_cancer_groups <- current_table %>% 
+  dplyr::select(cohort, Disease) %>% 
+  dplyr::filter(cohort == "All Cohorts") %>% 
+  dplyr::distinct() %>%
+  dplyr::rename(current_cohort = cohort) %>% 
+  dplyr::full_join(previous_table %>% 
+                     dplyr::select(cohort, Disease) %>% 
+                     dplyr::filter(cohort == "All Cohorts") %>% 
+                     dplyr::distinct() %>% 
+                     dplyr::rename(previous_cohort = cohort),
+                   by = "Disease") %>% 
+  dplyr::select(Disease, current_cohort, previous_cohort)
+
+# write to file
+all_cohorts_cancer_groups %>% 
+  readr::write_tsv(file.path(results_dir, 
+                             "all_cohorts_cancer_groups.tsv"), na = "NA")
+
+# display   
+all_cohorts_cancer_groups
+```
+
+
+### Changes in common columns across tpm tables
+
+#### Check gene symbols
+```{r gene symbols}
+changes_gene_symbols <- 
+  changes_in_columns(current_table, previous_table, "Gene_symbol")
+
+# write to file
+changes_gene_symbols %>% 
+  readr::write_tsv(file.path(results_dir, 
+                             "changes_in_gene_symbols.tsv"), na = "NA")
+
+# display   
+changes_gene_symbols
+```
+
+
+
+#### Check Ensembl IDs
+```{r ensembl ids}
+changes_ensembl_ids <- 
+  changes_in_columns(current_table, previous_table, "Gene_Ensembl_ID")
+
+# write to file
+changes_ensembl_ids %>% 
+  readr::write_tsv(file.path(results_dir, 
+                             "changes_in_ensembl_ids.tsv"), na = "NA")
+
+# display   
+changes_ensembl_ids
+```
+
+
+#### Check cancer groups
+```{r cancer groups}
+changes_cancer_groups <- 
+  changes_in_columns(current_table, previous_table, "Disease")
+
+# write to file
+changes_cancer_groups %>% 
+  readr::write_tsv(file.path(results_dir, 
+                             "changes_in_cancer_groups.tsv"), na = "NA")
+
+# display   
+changes_cancer_groups
+```
+
+
+
+#### Check EFO IDs
+```{r efo ids}
+changes_efo_ids <- 
+  changes_in_columns(current_table, previous_table, "EFO")
+
+# write to file
+changes_efo_ids %>% 
+  readr::write_tsv(file.path(results_dir, "changes_in_efo_ids.tsv"), na = "NA")
+
+# display   
+changes_efo_ids
+```
+
+
+#### Check MONDO IDs
+```{r mondo ids}
+changes_mondo_ids <- 
+  changes_in_columns(current_table, previous_table, "MONDO")
+
+# write to file
+changes_mondo_ids %>% 
+  readr::write_tsv(file.path(results_dir, "changes_in_mondo_ids.tsv"), na = "NA")
+
+# display   
+changes_mondo_ids
+```
+
+
+### Generate excel output for all QC checks and summaries
+```{r}
+# read all QC  checks and summaries files 
+qc_files <- list.files(results_dir, pattern = "tsv$")
+qc_files_list <- lapply(qc_files, function(x) {
+  # read each file
+  qc_file_df <- readr::read_tsv(file.path(results_dir, x))
+  return(qc_file_df)
+})
+# get file names
+qc_files_names <- gsub(".tsv", "", qc_files)
+names(qc_files_list) <- qc_files_names
+# write results to excel workbook 
+qc_files_list %>% 
+  openxlsx::write.xlsx(file.path(results_dir, 
+                                 paste0(gsub(".tsv.gz", "", 
+                                             basename(params$current_table)), 
+                                        ".xlsx")), 
+                       overwrite = TRUE, keepNA = TRUE, na.string = "NA")
+```
+
+
+```{r session info}
+sessionInfo()
+```