Skip to content
This repository has been archived by the owner on Jun 21, 2023. It is now read-only.

Commit

Permalink
Sample distribution plots: account for multiple samples from same ind…
Browse files Browse the repository at this point in the history
…ividual (#170)

* Only include tumors; distinct individual disease type pairs

* Only include tumors; distinct histologies/disease type

* Safer bash while we're at it
  • Loading branch information
jaclyn-taroni authored Oct 24, 2019
1 parent ff04105 commit b38dc64
Show file tree
Hide file tree
Showing 9 changed files with 1,095 additions and 2,121 deletions.
58 changes: 34 additions & 24 deletions analyses/sample-distribution-analysis/01-filter-across-types.R
Original file line number Diff line number Diff line change
Expand Up @@ -2,29 +2,29 @@
# of key variables within the dataset.
#
# Chante Bethell for CCDL 2019
#
#
# #### USAGE
# This script is intended to be run via the command line from the top directory
# This script is intended to be run via the command line from the top directory
# of the repository as follows:
#
# Rscript analyses/sample-distribution-analysis/01-filter-across-types.R

# magrittr pipe
`%>%` <- dplyr::`%>%`

# Function to filter based on primary_site
# Function to filter based on primary_site
location_fn <- function(location) {
# Given the name of a primary site, create a vector containing the disease
# Given the name of a primary site, create a vector containing the disease
# types expressed within the primary site.
#
# Note: the disease types found at all instances of the location substring
# will be included
#
# Note: the disease types found at all instances of the location substring
# will be included
#
# Args:
# location: the name of a primary site found within the dataset
#
# Returns:
# disease_type_vector: the vector of disease types expressed at the
# disease_type_vector: the vector of disease types expressed at the
# named primary site
disease_type_vector <- brain_location %>%
dplyr::arrange(dplyr::desc(n)) %>%
Expand All @@ -51,26 +51,34 @@ if (!dir.exists(plots_dir)) {
dir.create(plots_dir)
}

# Read in dataset
df2 <- data.frame(readr::read_tsv(
file.path(root_dir, "data", "pbta-histologies.tsv")
))

# Remove na's
df2 <- df2 %>%
# Read in dataset and remove NAs
histologies_df <-
readr::read_tsv(file.path(root_dir, "data", "pbta-histologies.tsv")) %>%
as.data.frame() %>%
dplyr::filter(!is.na(disease_type_new))

# Filter the histologies file to account for multiple samples from the same
# individual and the fact that multiple experimental strategies are in this
# data.frame

# Retain only tumors for this analysis
histologies_df <- histologies_df %>%
dplyr::filter(sample_type == "Tumor",
composition == "Solid Tissue")

# data.frame with the count of each unique cancer type expression
disease_expression <- df2 %>%
disease_expression <- histologies_df %>%
# some recurrences can have different disease_type_new values
dplyr::distinct(Kids_First_Participant_ID, disease_type_new) %>%
dplyr::group_by(disease_type_new) %>%
dplyr::count(name = "count") %>%
dplyr::arrange(dplyr::desc(count))

# Calculate the total count of the dataset
sum_count <- sum(disease_expression$count)

# Create a percent variable and round to 4 decimal places
# (so values will have 2 decimal places as percentages)
# Create a percent variable and round to 4 decimal places
# (so values will have 2 decimal places as percentages)
disease_expression <- disease_expression %>%
dplyr::mutate(percent = paste0((round(count / sum_count, 4) * 100), "%"))

Expand Down Expand Up @@ -108,15 +116,17 @@ ggplot2::ggsave(
height = 10
)

# data.frame with the location where each cancer type in the dataset is
# data.frame with the location where each cancer type in the dataset is
# expressed, sorted to show highest expression
brain_location <- df2 %>%
brain_location <- histologies_df %>%
dplyr::distinct(Kids_First_Participant_ID, disease_type_new,
primary_site) %>%
dplyr::select(disease_type_new, primary_site) %>%
dplyr::group_by(disease_type_new, primary_site) %>%
dplyr::tally() %>%
dplyr::arrange(dplyr::desc(n))

# Make a vector of primary sites
# Make a vector of primary sites
primary_sites_vector <- c(
"Basal Ganglia",
"Brain Stem- Midbrain",
Expand All @@ -143,11 +153,11 @@ primary_sites_vector <- c(
# This step helps us with melting
names(primary_sites_vector) <- primary_sites_vector

# For each string in primary sites vector use location_fn to get the vector of
# disease types it's sorted by
# For each primary site in the vector, use location_fn to get the vector of
# disease types found at that site
cancer_types_list <- lapply(primary_sites_vector, location_fn)

# Count the disease types for each primary site by taking the length of each
# Count the disease types for each primary site by taking the length of each
# element of the list
cancer_types_counts <- lapply(cancer_types_list, length)

Expand Down
28 changes: 16 additions & 12 deletions analyses/sample-distribution-analysis/02-multilayer-plots.R
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
# Chante Bethell for CCDL 2019
#
# #### USAGE
# This script is intended to be run via the command line from the top directory
# This script is intended to be run via the command line from the top directory
# of the repository as follows:
#
# Rscript analyses/sample-distribution-analysis/02-multilayer-plots.R
Expand All @@ -25,15 +25,19 @@ output_dir <- file.path(root_dir, "analyses", "sample-distribution-analysis")
results_dir <- file.path(output_dir, "results")
plots_dir <- file.path(output_dir, "plots")

# Read in dataset
df2 <- readr::read_tsv(file.path(root_dir, "data",
# Read in dataset
histologies_df <- readr::read_tsv(file.path(root_dir, "data",
"pbta-histologies.tsv"))

# Create a colorblind-friendly color vector
color <- colorblindr::palette_OkabeIto

# Create final data.frame prepped for treemap and sunburst functions
final_df <- df2 %>%
final_df <- histologies_df %>%
dplyr::filter(sample_type == "Tumor",
composition == "Solid Tissue") %>%
dplyr::distinct(Kids_First_Participant_ID, broad_histology,
short_histology, disease_type_new) %>%
# Select our 3 columns of interest
dplyr::select(broad_histology, short_histology, disease_type_new) %>%
# Remove any row that has an NA
Expand All @@ -45,7 +49,7 @@ final_df <- df2 %>%
# Place the value 1 in a column named counter for treemap and sunburst plots
dplyr::mutate(counter= c(1)) %>%
# Change the column names
dplyr::rename(level1 = broad_histology,
dplyr::rename(level1 = broad_histology,
level2 = short_histology,
level3 = disease_type_new) %>%
# Reorder the rows according to the 3 levels
Expand All @@ -56,7 +60,7 @@ final_df <- df2 %>%
# Save to tsv file
readr::write_tsv(final_df, file.path(results_dir, "plots_df.tsv"))

# Create a treemap
# Create a treemap
tm <-
treemap::treemap(
final_df,
Expand All @@ -67,19 +71,19 @@ tm <-
)$tm

# Convert the tm data.frame into a d3.js hierarchy object which is needed
# for the sund2b plot
# for the sund2b plot
tmnest <-
d3r::d3_nest(tm[, c("level1", "level2", "level3", "vSize")],
value_cols = c("vSize"))

# Create an interactive treemap
# Create an interactive treemap
interactive_tm <-
d3treeR::d3tree(tm,
rootname = "Cancer Histologies Treemap",
width = 1200,
height = 700)

# Create a sunburst plot
# Create a sunburst plot
sun_plot <-
sunburstR::sunburst(
data = tmnest,
Expand All @@ -89,10 +93,10 @@ sun_plot <-
colors = color
)

# Create an interactive sund2b plot
# Create an interactive sund2b plot
p <- sunburstR::sund2b(tmnest, colors = color, valueField = "vSize")

# Create HTML outputs for the interactive plots
mapview::mapshot(interactive_tm, url = file.path(plots_dir,
# Create HTML outputs for the interactive plots
mapview::mapshot(interactive_tm, url = file.path(plots_dir,
"histology-treemap.html"))
mapview::mapshot(p, url = file.path(plots_dir, "histology-pie.html"))
Binary file not shown.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

146 changes: 73 additions & 73 deletions analyses/sample-distribution-analysis/results/disease_expression.tsv
Original file line number Diff line number Diff line change
@@ -1,74 +1,74 @@
disease_type_new count percent
Low-grade glioma;astrocytoma (WHO grade I/II) 498 24.88%
High-grade glioma;astrocytoma (WHO grade III/IV) 230 11.49%
Medulloblastoma 228 11.39%
Ependymoma 173 8.64%
Brainstem glioma- Diffuse intrinsic pontine glioma 130 6.49%
Ganglioglioma 95 4.75%
Craniopharyngioma 74 3.7%
Atypical Teratoid Rhabdoid Tumor 60 3%
Meningioma 59 2.95%
Dysembryoplastic neuroepithelial tumor 50 2.5%
Neurofibroma;Plexiform 39 1.95%
Schwannoma 37 1.85%
Choroid plexus papilloma 30 1.5%
Supratentorial or Spinal Cord PNET 29 1.45%
Dysplasia;Gliosis 27 1.35%
Ewings Sarcoma 15 0.75%
Teratoma 15 0.75%
Chordoma 12 0.6%
Metastatic secondary tumors 11 0.55%
Pineoblastoma 10 0.5%
Germinoma 8 0.4%
Langerhans Cell histiocytosis 8 0.4%
Adenoma 7 0.35%
Choroid plexus carcinoma 7 0.35%
Glial-neuronal tumor NOS 7 0.35%
Malignant peripheral nerve sheath tumor 7 0.35%
Sarcoma 7 0.35%
Ganglioneuroblastoma 6 0.3%
Hemangioblastoma 6 0.3%
Meningioangiomatosis 6 0.3%
Neuroblastoma 6 0.3%
Neurocytoma 6 0.3%
Subependymal Giant Cell Astrocytoma (SEGA) 6 0.3%
Cyst 5 0.25%
Dermoid Cyst 5 0.25%
Gliomatosis Cerebri 4 0.2%
Non-Langerhans Histiocytosis;JXG 4 0.2%
Oligodendroglioma 4 0.2%
Osteoblastoma 4 0.2%
Rhabdomyosarcoma 4 0.2%
Cavernoma 3 0.15%
Brain arteriovenous malformation 2 0.1%
Chondrosarcoma 2 0.1%
Choroid plexus cyst 2 0.1%
CNS embryonal tumor 2 0.1%
Cortical Tubers 2 0.1%
Dysembryoplastic neuroepithelial tumor (DNET);Ganglioglioma 2 0.1%
Dysplasia;Gliosis;Glial-neuronal tumor NOS 2 0.1%
Embryonal tumor with multilayer rosettes NOS 2 0.1%
Ependymoblastoma 2 0.1%
Fibroma 2 0.1%
Fibromyxoid Tumor 2 0.1%
Ganglioglioma;Low-grade glioma/astrocytoma (WHO grade I/II) 2 0.1%
Ganglioneuroma 2 0.1%
Germinoma;Teratoma 2 0.1%
Hamartoma 2 0.1%
Intraneural perineurioma 2 0.1%
Malignant melanocytic neoplasm 2 0.1%
Metastatic secondary tumors;Neuroblastoma 2 0.1%
Myeloid Sarcoma 2 0.1%
Myofibroblastoma 2 0.1%
Myxoid spindle cell tumor 2 0.1%
NeuroInflammatory systemic disease 2 0.1%
Non-germinomatous germ cell tumor;Teratoma 2 0.1%
Ossifying Fibroma 2 0.1%
Primary CNS lymphoma 2 0.1%
Reactive Connective Tissue 2 0.1%
Reactive Gliosis 2 0.1%
Rosai-Dorfman Disease 2 0.1%
Dysembryoplastic neuroepithelial tumor (DNET);Dysplasia/Gliosis;Ganglioglioma 1 0.05%
Medulloepithelioma 1 0.05%
Papillary glioneuronal tumor 1 0.05%
Prolactinoma 1 0.05%
Low-grade glioma;astrocytoma (WHO grade I/II) 239 24.95%
Medulloblastoma 119 12.42%
High-grade glioma;astrocytoma (WHO grade III/IV) 101 10.54%
Ependymoma 85 8.87%
Ganglioglioma 46 4.8%
Brainstem glioma- Diffuse intrinsic pontine glioma 45 4.7%
Craniopharyngioma 39 4.07%
Atypical Teratoid Rhabdoid Tumor 28 2.92%
Meningioma 27 2.82%
Dysembryoplastic neuroepithelial tumor 25 2.61%
Neurofibroma;Plexiform 19 1.98%
Choroid plexus papilloma 16 1.67%
Schwannoma 16 1.67%
Supratentorial or Spinal Cord PNET 16 1.67%
Dysplasia;Gliosis 14 1.46%
Teratoma 8 0.84%
Ewings Sarcoma 7 0.73%
Metastatic secondary tumors 5 0.52%
Adenoma 4 0.42%
Choroid plexus carcinoma 4 0.42%
Cyst 4 0.42%
Germinoma 4 0.42%
Glial-neuronal tumor NOS 4 0.42%
Langerhans Cell histiocytosis 4 0.42%
Malignant peripheral nerve sheath tumor 4 0.42%
Neuroblastoma 4 0.42%
Pineoblastoma 4 0.42%
Sarcoma 4 0.42%
Chordoma 3 0.31%
Dermoid Cyst 3 0.31%
Meningioangiomatosis 3 0.31%
Neurocytoma 3 0.31%
Subependymal Giant Cell Astrocytoma (SEGA) 3 0.31%
Cavernoma 2 0.21%
Ganglioneuroblastoma 2 0.21%
Gliomatosis Cerebri 2 0.21%
Hemangioblastoma 2 0.21%
Non-Langerhans Histiocytosis;JXG 2 0.21%
Oligodendroglioma 2 0.21%
Osteoblastoma 2 0.21%
Rhabdomyosarcoma 2 0.21%
Brain arteriovenous malformation 1 0.1%
Chondrosarcoma 1 0.1%
Choroid plexus cyst 1 0.1%
CNS embryonal tumor 1 0.1%
Cortical Tubers 1 0.1%
Dysembryoplastic neuroepithelial tumor (DNET);Dysplasia/Gliosis;Ganglioglioma 1 0.1%
Dysembryoplastic neuroepithelial tumor (DNET);Ganglioglioma 1 0.1%
Dysplasia;Gliosis;Glial-neuronal tumor NOS 1 0.1%
Embryonal tumor with multilayer rosettes NOS 1 0.1%
Ependymoblastoma 1 0.1%
Fibroma 1 0.1%
Fibromyxoid Tumor 1 0.1%
Ganglioglioma;Low-grade glioma/astrocytoma (WHO grade I/II) 1 0.1%
Ganglioneuroma 1 0.1%
Germinoma;Teratoma 1 0.1%
Hamartoma 1 0.1%
Intraneural perineurioma 1 0.1%
Malignant melanocytic neoplasm 1 0.1%
Medulloepithelioma 1 0.1%
Metastatic secondary tumors;Neuroblastoma 1 0.1%
Myeloid Sarcoma 1 0.1%
Myofibroblastoma 1 0.1%
Myxoid spindle cell tumor 1 0.1%
NeuroInflammatory systemic disease 1 0.1%
Non-germinomatous germ cell tumor;Teratoma 1 0.1%
Ossifying Fibroma 1 0.1%
Papillary glioneuronal tumor 1 0.1%
Primary CNS lymphoma 1 0.1%
Prolactinoma 1 0.1%
Reactive Connective Tissue 1 0.1%
Reactive Gliosis 1 0.1%
Rosai-Dorfman Disease 1 0.1%
Loading

0 comments on commit b38dc64

Please sign in to comment.