Skip to content
This repository has been archived by the owner on Jun 21, 2023. It is now read-only.

Sample distribution plots: account for multiple samples from same individual #170

Merged
merged 4 commits into from
Oct 24, 2019
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
58 changes: 34 additions & 24 deletions analyses/sample-distribution-analysis/01-filter-across-types.R
Original file line number Diff line number Diff line change
Expand Up @@ -2,29 +2,29 @@
# of key variables within the dataset.
#
# Chante Bethell for CCDL 2019
#
#
# #### USAGE
# This script is intended to be run via the command line from the top directory
# This script is intended to be run via the command line from the top directory
# of the repository as follows:
#
# Rscript analyses/sample-distribution-analysis/01-filter-across-types.R

# magrittr pipe
`%>%` <- dplyr::`%>%`

# Function to filter based on primary_site
# Function to filter based on primary_site
location_fn <- function(location) {
# Given the name of a primary site, create a vector containing the disease
# Given the name of a primary site, create a vector containing the disease
# types expressed within the primary site.
#
# Note: the disease types found at all instances of the location substring
# will be included
#
# Note: the disease types found at all instances of the location substring
# will be included
#
# Args:
# location: the name of a primary site found within the dataset
#
# Returns:
# disease_type_vector: the vector of disease types expressed at the
# disease_type_vector: the vector of disease types expressed at the
# named primary site
disease_type_vector <- brain_location %>%
dplyr::arrange(dplyr::desc(n)) %>%
Expand All @@ -51,26 +51,34 @@ if (!dir.exists(plots_dir)) {
dir.create(plots_dir)
}

# Read in dataset
df2 <- data.frame(readr::read_tsv(
file.path(root_dir, "data", "pbta-histologies.tsv")
))

# Remove na's
df2 <- df2 %>%
# Read in dataset and remove NAs
histologies_df <-
readr::read_tsv(file.path(root_dir, "data", "pbta-histologies.tsv")) %>%
as.data.frame() %>%
dplyr::filter(!is.na(disease_type_new))

# Filter the histologies file to account for multiple samples from the same
# individual and the fact that multiple experimental strategies are in this
# data.frame

# Retain only tumors for this analysis
histologies_df <- histologies_df %>%
dplyr::filter(sample_type == "Tumor",
composition == "Solid Tissue")

# data.frame with the number of participants for each unique disease type
disease_expression <- df2 %>%
disease_expression <- histologies_df %>%
# some recurrences can have different disease_type_new values
dplyr::distinct(Kids_First_Participant_ID, disease_type_new) %>%
dplyr::group_by(disease_type_new) %>%
dplyr::count(name = "count") %>%
dplyr::arrange(dplyr::desc(count))

# Calculate the total count of the dataset
sum_count <- sum(disease_expression$count)

# Create a percent variable and round to 4 decimal places
# (so values will have 2 decimal places as percentages)
# Create a percent variable, rounding the proportion to 4 decimal places
# (so values will have at most 2 decimal places as percentages)
disease_expression <- disease_expression %>%
dplyr::mutate(percent = paste0((round(count / sum_count, 4) * 100), "%"))

Expand Down Expand Up @@ -108,15 +116,17 @@ ggplot2::ggsave(
height = 10
)

# data.frame with the location where each cancer type in the dataset is
# data.frame with the location where each cancer type in the dataset is
# expressed, sorted to show highest expression
brain_location <- df2 %>%
brain_location <- histologies_df %>%
dplyr::distinct(Kids_First_Participant_ID, disease_type_new,
primary_site) %>%
dplyr::select(disease_type_new, primary_site) %>%
dplyr::group_by(disease_type_new, primary_site) %>%
dplyr::tally() %>%
dplyr::arrange(dplyr::desc(n))

# Make a vector of primary sites
# Make a vector of primary sites
primary_sites_vector <- c(
"Basal Ganglia",
"Brain Stem- Midbrain",
Expand All @@ -143,11 +153,11 @@ primary_sites_vector <- c(
# This step helps us with melting
names(primary_sites_vector) <- primary_sites_vector

# For each string in primary sites vector use location_fn to get the vector of
# disease types it's sorted by
# For each primary site in the vector, use location_fn to get the vector of
# disease types found at that site
cancer_types_list <- lapply(primary_sites_vector, location_fn)

# Count the disease types for each primary site by taking the length of each
# Count the disease types for each primary site by taking the length of each
# element of the list
cancer_types_counts <- lapply(cancer_types_list, length)

Expand Down
28 changes: 16 additions & 12 deletions analyses/sample-distribution-analysis/02-multilayer-plots.R
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
# Chante Bethell for CCDL 2019
#
# #### USAGE
# This script is intended to be run via the command line from the top directory
# This script is intended to be run via the command line from the top directory
# of the repository as follows:
#
# Rscript analyses/sample-distribution-analysis/02-multilayer-plots.R
Expand All @@ -25,15 +25,19 @@ output_dir <- file.path(root_dir, "analyses", "sample-distribution-analysis")
results_dir <- file.path(output_dir, "results")
plots_dir <- file.path(output_dir, "plots")

# Read in dataset
df2 <- readr::read_tsv(file.path(root_dir, "data",
# Read in dataset
histologies_df <- readr::read_tsv(file.path(root_dir, "data",
"pbta-histologies.tsv"))

# Create a colorblind-friendly color vector
color <- colorblindr::palette_OkabeIto

# Create final data.frame prepped for treemap and sunburst functions
final_df <- df2 %>%
final_df <- histologies_df %>%
dplyr::filter(sample_type == "Tumor",
composition == "Solid Tissue") %>%
dplyr::distinct(Kids_First_Participant_ID, broad_histology,
short_histology, disease_type_new) %>%
# Select our 3 columns of interest
dplyr::select(broad_histology, short_histology, disease_type_new) %>%
# Remove any row that has an NA
Expand All @@ -45,7 +49,7 @@ final_df <- df2 %>%
# Place the value 1 in a column named counter for treemap and sunburst plots
dplyr::mutate(counter= c(1)) %>%
# Change the column names
dplyr::rename(level1 = broad_histology,
dplyr::rename(level1 = broad_histology,
level2 = short_histology,
level3 = disease_type_new) %>%
# Reorder the rows according to the 3 levels
Expand All @@ -56,7 +60,7 @@ final_df <- df2 %>%
# Save to tsv file
readr::write_tsv(final_df, file.path(results_dir, "plots_df.tsv"))

# Create a treemap
# Create a treemap
tm <-
treemap::treemap(
final_df,
Expand All @@ -67,19 +71,19 @@ tm <-
)$tm

# Convert the tm data.frame into a d3.js hierarchy object which is needed
# for the sund2b plot
# for the sund2b plot
tmnest <-
d3r::d3_nest(tm[, c("level1", "level2", "level3", "vSize")],
value_cols = c("vSize"))

# Create an interactive treemap
# Create an interactive treemap
interactive_tm <-
d3treeR::d3tree(tm,
rootname = "Cancer Histologies Treemap",
width = 1200,
height = 700)

# Create a sunburst plot
# Create a sunburst plot
sun_plot <-
sunburstR::sunburst(
data = tmnest,
Expand All @@ -89,10 +93,10 @@ sun_plot <-
colors = color
)

# Create an interactive sund2b plot
# Create an interactive sund2b plot
p <- sunburstR::sund2b(tmnest, colors = color, valueField = "vSize")

# Create HTML outputs for the interactive plots
mapview::mapshot(interactive_tm, url = file.path(plots_dir,
# Create HTML outputs for the interactive plots
mapview::mapshot(interactive_tm, url = file.path(plots_dir,
"histology-treemap.html"))
mapview::mapshot(p, url = file.path(plots_dir, "histology-pie.html"))
Binary file not shown.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

146 changes: 73 additions & 73 deletions analyses/sample-distribution-analysis/results/disease_expression.tsv
Original file line number Diff line number Diff line change
@@ -1,74 +1,74 @@
disease_type_new count percent
Low-grade glioma;astrocytoma (WHO grade I/II) 498 24.88%
High-grade glioma;astrocytoma (WHO grade III/IV) 230 11.49%
Medulloblastoma 228 11.39%
Ependymoma 173 8.64%
Brainstem glioma- Diffuse intrinsic pontine glioma 130 6.49%
Ganglioglioma 95 4.75%
Craniopharyngioma 74 3.7%
Atypical Teratoid Rhabdoid Tumor 60 3%
Meningioma 59 2.95%
Dysembryoplastic neuroepithelial tumor 50 2.5%
Neurofibroma;Plexiform 39 1.95%
Schwannoma 37 1.85%
Choroid plexus papilloma 30 1.5%
Supratentorial or Spinal Cord PNET 29 1.45%
Dysplasia;Gliosis 27 1.35%
Ewings Sarcoma 15 0.75%
Teratoma 15 0.75%
Chordoma 12 0.6%
Metastatic secondary tumors 11 0.55%
Pineoblastoma 10 0.5%
Germinoma 8 0.4%
Langerhans Cell histiocytosis 8 0.4%
Adenoma 7 0.35%
Choroid plexus carcinoma 7 0.35%
Glial-neuronal tumor NOS 7 0.35%
Malignant peripheral nerve sheath tumor 7 0.35%
Sarcoma 7 0.35%
Ganglioneuroblastoma 6 0.3%
Hemangioblastoma 6 0.3%
Meningioangiomatosis 6 0.3%
Neuroblastoma 6 0.3%
Neurocytoma 6 0.3%
Subependymal Giant Cell Astrocytoma (SEGA) 6 0.3%
Cyst 5 0.25%
Dermoid Cyst 5 0.25%
Gliomatosis Cerebri 4 0.2%
Non-Langerhans Histiocytosis;JXG 4 0.2%
Oligodendroglioma 4 0.2%
Osteoblastoma 4 0.2%
Rhabdomyosarcoma 4 0.2%
Cavernoma 3 0.15%
Brain arteriovenous malformation 2 0.1%
Chondrosarcoma 2 0.1%
Choroid plexus cyst 2 0.1%
CNS embryonal tumor 2 0.1%
Cortical Tubers 2 0.1%
Dysembryoplastic neuroepithelial tumor (DNET);Ganglioglioma 2 0.1%
Dysplasia;Gliosis;Glial-neuronal tumor NOS 2 0.1%
Embryonal tumor with multilayer rosettes NOS 2 0.1%
Ependymoblastoma 2 0.1%
Fibroma 2 0.1%
Fibromyxoid Tumor 2 0.1%
Ganglioglioma;Low-grade glioma/astrocytoma (WHO grade I/II) 2 0.1%
Ganglioneuroma 2 0.1%
Germinoma;Teratoma 2 0.1%
Hamartoma 2 0.1%
Intraneural perineurioma 2 0.1%
Malignant melanocytic neoplasm 2 0.1%
Metastatic secondary tumors;Neuroblastoma 2 0.1%
Myeloid Sarcoma 2 0.1%
Myofibroblastoma 2 0.1%
Myxoid spindle cell tumor 2 0.1%
NeuroInflammatory systemic disease 2 0.1%
Non-germinomatous germ cell tumor;Teratoma 2 0.1%
Ossifying Fibroma 2 0.1%
Primary CNS lymphoma 2 0.1%
Reactive Connective Tissue 2 0.1%
Reactive Gliosis 2 0.1%
Rosai-Dorfman Disease 2 0.1%
Dysembryoplastic neuroepithelial tumor (DNET);Dysplasia/Gliosis;Ganglioglioma 1 0.05%
Medulloepithelioma 1 0.05%
Papillary glioneuronal tumor 1 0.05%
Prolactinoma 1 0.05%
Low-grade glioma;astrocytoma (WHO grade I/II) 239 24.95%
Medulloblastoma 119 12.42%
High-grade glioma;astrocytoma (WHO grade III/IV) 101 10.54%
Ependymoma 85 8.87%
Ganglioglioma 46 4.8%
Brainstem glioma- Diffuse intrinsic pontine glioma 45 4.7%
Craniopharyngioma 39 4.07%
Atypical Teratoid Rhabdoid Tumor 28 2.92%
Meningioma 27 2.82%
Dysembryoplastic neuroepithelial tumor 25 2.61%
Neurofibroma;Plexiform 19 1.98%
Choroid plexus papilloma 16 1.67%
Schwannoma 16 1.67%
Supratentorial or Spinal Cord PNET 16 1.67%
Dysplasia;Gliosis 14 1.46%
Teratoma 8 0.84%
Ewings Sarcoma 7 0.73%
Metastatic secondary tumors 5 0.52%
Adenoma 4 0.42%
Choroid plexus carcinoma 4 0.42%
Cyst 4 0.42%
Germinoma 4 0.42%
Glial-neuronal tumor NOS 4 0.42%
Langerhans Cell histiocytosis 4 0.42%
Malignant peripheral nerve sheath tumor 4 0.42%
Neuroblastoma 4 0.42%
Pineoblastoma 4 0.42%
Sarcoma 4 0.42%
Chordoma 3 0.31%
Dermoid Cyst 3 0.31%
Meningioangiomatosis 3 0.31%
Neurocytoma 3 0.31%
Subependymal Giant Cell Astrocytoma (SEGA) 3 0.31%
Cavernoma 2 0.21%
Ganglioneuroblastoma 2 0.21%
Gliomatosis Cerebri 2 0.21%
Hemangioblastoma 2 0.21%
Non-Langerhans Histiocytosis;JXG 2 0.21%
Oligodendroglioma 2 0.21%
Osteoblastoma 2 0.21%
Rhabdomyosarcoma 2 0.21%
Brain arteriovenous malformation 1 0.1%
Chondrosarcoma 1 0.1%
Choroid plexus cyst 1 0.1%
CNS embryonal tumor 1 0.1%
Cortical Tubers 1 0.1%
Dysembryoplastic neuroepithelial tumor (DNET);Dysplasia/Gliosis;Ganglioglioma 1 0.1%
Dysembryoplastic neuroepithelial tumor (DNET);Ganglioglioma 1 0.1%
Dysplasia;Gliosis;Glial-neuronal tumor NOS 1 0.1%
Embryonal tumor with multilayer rosettes NOS 1 0.1%
Ependymoblastoma 1 0.1%
Fibroma 1 0.1%
Fibromyxoid Tumor 1 0.1%
Ganglioglioma;Low-grade glioma/astrocytoma (WHO grade I/II) 1 0.1%
Ganglioneuroma 1 0.1%
Germinoma;Teratoma 1 0.1%
Hamartoma 1 0.1%
Intraneural perineurioma 1 0.1%
Malignant melanocytic neoplasm 1 0.1%
Medulloepithelioma 1 0.1%
Metastatic secondary tumors;Neuroblastoma 1 0.1%
Myeloid Sarcoma 1 0.1%
Myofibroblastoma 1 0.1%
Myxoid spindle cell tumor 1 0.1%
NeuroInflammatory systemic disease 1 0.1%
Non-germinomatous germ cell tumor;Teratoma 1 0.1%
Ossifying Fibroma 1 0.1%
Papillary glioneuronal tumor 1 0.1%
Primary CNS lymphoma 1 0.1%
Prolactinoma 1 0.1%
Reactive Connective Tissue 1 0.1%
Reactive Gliosis 1 0.1%
Rosai-Dorfman Disease 1 0.1%
Loading