Skip to content

Commit

Permalink
Merge pull request #524 from d3b-center/pineo-v13
Browse files Browse the repository at this point in the history
(14/N) Pineoblastoma v13
  • Loading branch information
zzgeng committed Jan 2, 2024
2 parents d1622ca + 32fb140 commit d0b717c
Show file tree
Hide file tree
Showing 190 changed files with 142,729 additions and 111,664 deletions.
176 changes: 123 additions & 53 deletions analyses/create-subset-files/01-get_biospecimen_identifiers.R

Large diffs are not rendered by default.

78 changes: 13 additions & 65 deletions analyses/create-subset-files/02-subset_files.R
Original file line number Diff line number Diff line change
Expand Up @@ -16,40 +16,8 @@ suppressWarnings(
)
suppressPackageStartupMessages(library(optparse))
suppressPackageStartupMessages(library(data.table))
suppressPackageStartupMessages(library(arrow))
suppressPackageStartupMessages(options(readr.show_col_types = FALSE))

write_maf_file <- function(maf_df, file_name, version_string) {
  # Given a data.frame that contains the fields for a MAF file, write a gzipped
  # MAF file whose first line is the version information provided in
  # version_string, followed by the tab-separated contents of maf_df
  # (including a header row).
  #
  # Note: if file_name exists, it will be overwritten
  #
  # Args:
  #   maf_df: A data.frame that contains the MAF info.
  #   file_name: Output file name, including the full path. A trailing `.gz`
  #              extension is optional and is removed before writing.
  #   version_string: the version string that will be written to the first line
  #                   of the file at file_name
  #
  # Returns: intended to be used to write files only

  # Drop only a literal, trailing `.gz` extension: the uncompressed file is
  # written first and gzipped at the end with R.utils::gzip, so the extension
  # is not needed here. The pattern is escaped and anchored so that a path
  # component that merely contains "<any character>gz" (e.g. a directory name)
  # is left untouched; sub() returns file_name unchanged when nothing matches.
  file_name <- sub("\\.gz$", "", file_name)

  # write the version string to the top of the file (creates/overwrites it)
  writeLines(version_string, con = file_name)

  # append the tabular data of maf_df, with column names, below the version
  # string
  readr::write_tsv(maf_df, path = file_name, append = TRUE, col_names = TRUE)

  # now gzip the file, replacing any previously gzipped output
  R.utils::gzip(file_name, overwrite = TRUE)
}

subset_files <- function(filename, biospecimen_ids, output_directory) {
# given the full path to a file to be subset and the list of biospecimen ids
# to use for subsetting, write a file of the same name to the output directory
Expand All @@ -75,30 +43,6 @@ subset_files <- function(filename, biospecimen_ids, output_directory) {
# filtering strategy depends on the file type, mostly because how the sample
# IDs change based on the file type -- that's why this logic is required
if (grepl("snv", filename)) {
# if (grepl("hotspots", filename)) {
# snv_file <- data.table::fread(filename,
# skip = 1, # skip version string
# data.table = FALSE,
# showProgress = FALSE)
# # we need to obtain the version string from the first line of the MAF file
# version_string <- readLines(filename, n = 1)
# # filter + write to file with custom function
# snv_file %>%
# dplyr::filter(Tumor_Sample_Barcode %in% biospecimen_ids) %>%
# write_maf_file(file_name = output_file,
# version_string = version_string)
# snv_file %>%
# dplyr::filter(Tumor_Sample_Barcode %in% biospecimen_ids) %>%
# readr::write_tsv(output_file)
# } else {
# # in a column 'Tumor_Sample_Barcode'
# snv_file <- data.table::fread(filename, data.table = FALSE,
# showProgress = FALSE)
# snv_file %>%
# dplyr::filter(Tumor_Sample_Barcode %in% biospecimen_ids) %>%
# readr::write_tsv(output_file)
# }
# in a column 'Tumor_Sample_Barcode'
snv_file <- data.table::fread(filename, data.table = FALSE,
showProgress = FALSE)
snv_file %>%
Expand Down Expand Up @@ -133,7 +77,7 @@ subset_files <- function(filename, biospecimen_ids, output_directory) {
fusion_file %>%
dplyr::filter(Sample %in% biospecimen_ids |
# this is required for the fusion-summary module and TP53 module
grepl("RELA|MN1|EWSR1|FGFR1--TACC1|MYB--QKI|BRAF|TP53--TRPS1|TP53--PSMG4", FusionName)) %>%
grepl("ZFTA|MN1|EWSR1|FGFR1--TACC1|MYB--QKI|BRAF|TP53--TRPS1|TP53--PSMG4", FusionName)) %>%
readr::write_tsv(output_file)
} else if (grepl("dgd", filename)) {
fusion_file %>%
Expand Down Expand Up @@ -168,9 +112,14 @@ subset_files <- function(filename, biospecimen_ids, output_directory) {
expression_file %>% dplyr::select(transcript_id, gene_symbol,
!!!rlang::quos(any_of(biospecimen_ids))) %>%
readr::write_rds(output_file)
} else if (grepl("methyl", filename)) {
expression_file %>% dplyr::select(Probe_ID,
!!!rlang::quos(any_of(biospecimen_ids))) %>%
# } else if (grepl("methyl", filename)) {
# expression_file %>% dplyr::select(Probe_ID,
# !!!rlang::quos(any_of(biospecimen_ids))) %>%
# readr::write_rds(output_file)
} else if (grepl("gtex", filename)) {
expression_file <- readr::read_rds(filename)
biospecimen_ids <- intersect(colnames(expression_file), biospecimen_ids)
expression_file %>% dplyr::select(!!!rlang::quos(any_of(biospecimen_ids))) %>%
readr::write_rds(output_file)
} else {
expression_file %>% dplyr::select(!!!rlang::quos(any_of(biospecimen_ids))) %>%
Expand All @@ -182,12 +131,11 @@ subset_files <- function(filename, biospecimen_ids, output_directory) {
independent_file %>%
dplyr::filter(Kids_First_Biospecimen_ID %in% biospecimen_ids) %>%
readr::write_tsv(output_file)
} else if (grepl("splice-events-rmats", filename)) {
# } else if (grepl("splice-events-rmats", filename)) {
# in a column 'sample_id'
rmats_file <- arrow::read_tsv_arrow(filename)
rmats_file %>%
dplyr::filter(sample_id %in% biospecimen_ids) %>%
readr::write_tsv(output_file)
# rmats_file <- vroom::vroom(filename) %>%
# dplyr::filter(sample_id %in% biospecimen_ids) %>%
# readr::write_tsv(output_file)
} else {
# error-handling
stop("File type unrecognized by 'subset_files'")
Expand Down
6 changes: 4 additions & 2 deletions analyses/create-subset-files/README.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
## Steps for creating subset files for CI
## Steps for creating subset files for GitHub Actions CI

1. Update to the most recent release of the data by running `bash download-data.sh` in the root directory of the repository.
2. Run the shell script to generate subset files (from the root directory of the repository):
Expand All @@ -21,6 +21,8 @@ Non-matched samples are also added to each file (10% of `--num_matched`), which
Some files are copied over in their entirety (e.g., BED files).
See `create_subset_files.sh` for more information.

Note: `splice-events-rmats.tsv.gz` and all `methyl*` files are skipped in v13 because they are large and no modules currently use them routinely.

#### Special considerations

Certain analysis modules have required modifications to the subset file creation steps beyond randomly selecting participants.
Expand Down Expand Up @@ -55,6 +57,6 @@ Running the following from the root directory of the repository
SKIP_SUBSETTING=1 ./analyses/create-subset-files/create_subset_files.sh
```

will skip the subsetting file steps that are implemented in R and only copy files that are included in full (e.g., `pbta-histologies.tsv`) and generate a new `md5sum.txt`.
will skip the subsetting file steps that are implemented in R and only copy files that are included in full (e.g., `histologies.tsv`) and generate a new `md5sum.txt`.
This is intended to be used when the only files that need to be updated are those that are copied over without being reduced in size in any way.

Binary file modified analyses/create-subset-files/biospecimen_ids_for_subset.RDS
Binary file not shown.
12 changes: 10 additions & 2 deletions analyses/create-subset-files/create_subset_files.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ set -o pipefail

# Set defaults for release and biospecimen file name
BIOSPECIMEN_FILE=${BIOSPECIMEN_FILE:-biospecimen_ids_for_subset.RDS}
RELEASE=${RELEASE:-v12}
RELEASE=${RELEASE:-v13}
NUM_MATCHED=${NUM_MATCHED:-15}

# This option controls whether or not the two larger MAF files are skipped as
Expand Down Expand Up @@ -41,7 +41,6 @@ fi
# download the Illumina methylation annotations file if it does not exist in data
# from the data release s3 bucket
URL="https://d3b-openaccess-us-east-1-prd-pbta.s3.amazonaws.com/open-targets"
RELEASE="v12"
PROBES="infinium.gencode.v39.probe.annotations.tsv.gz"
if [ -f "${DATA_DIRECTORY}/${PROBES}" ]; then
echo "${PROBES} exists, skip downloading"
Expand Down Expand Up @@ -101,6 +100,15 @@ cp $FULL_DIRECTORY/cnv-consensus-gistic.zip $SUBSET_DIRECTORY
# all bed files
cp $FULL_DIRECTORY/*.bed $SUBSET_DIRECTORY

# DGD fusion file
cp $FULL_DIRECTORY/fusion-dgd.tsv.gz $SUBSET_DIRECTORY

# All proteomic files
cp $FULL_DIRECTORY/*protein* $SUBSET_DIRECTORY

# Full tumor only MAF (for now, it is small)
cp $FULL_DIRECTORY/snv-mutect2-tumor-only-plus-hotspots.maf.tsv.gz $SUBSET_DIRECTORY

# if the md5sum.txt file already exists, get rid of it
cd $SUBSET_DIRECTORY
rm -f md5sum.txt
Expand Down
36 changes: 18 additions & 18 deletions analyses/efo-mondo-mapping/results/efo-mondo-map-prefill.tsv
Original file line number Diff line number Diff line change
Expand Up @@ -4,73 +4,70 @@ Acute Myeloid Leukemia EFO_0000222 MONDO_0018874 NCIT_C3171
Adamantinomatous Craniopharyngioma EFO_1000069 MONDO_0002787 NCIT_C4726
Adenocarcinoma EFO_0000228 MONDO_0004970 NCIT_C2852
Adrenocortical Carcinoma EFO_1000796 MONDO_0006639 NCIT_C9325
Anaplastic Large Cell Lymphoma EFO_0003032 MONDO_0020325 NCIT_C3720
Alveolar soft part sarcoma NA NA NA
Angiosarcoma EFO_0003968 MONDO_0016982 NCIT_C3088
Astroblastoma MONDO_0016707 MONDO_0016707 NCIT_C4324
Astrocytoma EFO_0000272 MONDO_0019781 NCIT_C6958
Atypical Teratoid Rhabdoid Tumor EFO_1002008 MONDO_0020560 NCIT_C6906
Atypical choroid plexus papilloma MONDO_0002684 MONDO_0002684 NCIT_C53686
B Acute Lymphoblastic Leukemia/Lymphoma EFO_0000094 MONDO_0004967 NCIT_C8644
Bladder Urothelial Carcinoma EFO_0006544 MONDO_0005611 NCIT_C39851
Breast Invasive Carcinoma EFO_1000307 MONDO_0006256 NCIT_C9245
Burkitt Leukemia/Lymphoma EFO_0000309 MONDO_0007243 NCIT_C2912
CIC-DUX4 Sarcoma EFO_0000691 MONDO_0005089 NCIT_C165663
CIC-rearranged sarcoma NA NA NA
CNS Burkitt's lymphoma EFO_0000309 MONDO_0007243 NCIT_C2912
CNS Embryonal tumor EFO_0005784 MONDO_0018843 NCIT_C5398
CNS Melanoma EFO_0002617 MONDO_0005191 NCIT_C133504
CNS neuroblastoma EFO_0000621 MONDO_0006130 NCIT_C4826
CNS tumor with BCOR internal tandem duplication NA NA NA
Cavernoma EFO_1000151 MONDO_0003155 NCIT_C3086
Central neurocytoma EFO_1000856 MONDO_0019134 NCIT_C3791
Cervical Squamous Cell Carcinoma and Endocervical Adenocarcinoma EFO_1000162 MONDO_0006143 NCIT_C157526
Cholangiocarcinoma EFO_0005221 MONDO_0019087 NCIT_C4436
Chondromyxoid fibroma EFO_0000332 MONDO_0018447 NCIT_C3830
Chordoma Orphanet_178 MONDO_0008978 NCIT_C2947
Choroid plexus carcinoma MONDO_0016718 MONDO_0016718 NCIT_C4715
Choroid plexus papilloma EFO_1000177 MONDO_0009837 NCIT_C3698
Choroid plexus tumor EFO_0007206 MONDO_0016717 NCIT_C4533
Chromophobe renal cell carcinoma EFO_0000335 MONDO_0017885 NCIT_C4146
Chronic Myelogenous Leukemia EFO_0000339 MONDO_0011996 NCIT_C3174
Clear cell sarcoma of the kidney EFO_0000350 MONDO_0005006 NCIT_C4264
Colon Adenocarcinoma EFO_1001949 MONDO_0002271 NCIT_C4349
Colon Carcinoma NA NA NA
Congenital malignant brain tumor NA NA NA
Craniopharyngioma EFO_1000209 MONDO_0002787 NCIT_C2964
Cutaneous Melanoma EFO_0000389 MONDO_0005012 NCIT_C3510
Desmoid-type fibromatosis EFO_0009907 Orphanet_873 NCIT_C9182
Desmoplastic infantile astrocytoma and ganglioglioma MONDO_0016731 MONDO_0016731 NCIT_C4747
Diffuse fibrillary astrocytoma MONDO_0016688 MONDO_0016688 NCIT_C4322
Diffuse hemispheric glioma MONDO_0016680 MONDO_0016680 NA
Diffuse intrinsic pontine glioma EFO_1000026 MONDO_0006033 NCIT_C94764
Diffuse leptomeningeal glioneuronal tumor MONDO_0016745 MONDO_0016745 NCIT_C129424
Diffuse midline glioma EFO_1000026 MONDO_0006033 NCIT_C129309
Dysembryoplastic neuroepithelial tumor EFO_0005551 MONDO_0005505 NCIT_C9505
Dysgerminoma MONDO_0003002 MONDO_0003002 NCIT_C2996
EBV-Positive Diffuse Large B-Cell Lymphoma NA NA NA
Embryonal tumor with multilayer rosettes MONDO_0016715 MONDO_0016715 NCIT_C129499
Ependymoma EFO_1000028 MONDO_0016698 NCIT_C3017
Epstein-Barr virus-related tumor MONDO_0017342 MONDO_0017342 NA
Esophageal Carcinoma EFO_0002916 MONDO_0019086 NCIT_C3513
Ewing sarcoma EFO_0000174 MONDO_0012817 NCIT_C4817
Extraventricular neurocytoma MONDO_0016727 MONDO_0016727 NCIT_C92555
Fibromyxoid lesion MONDO_0037745 MONDO_0037745 NCIT_C66760
Follicular Variant Thyroid Gland Papillary Carcinoma NA NA NA
Ganglioglioma EFO_0003094 MONDO_0016733 NCIT_C3788
Ganglioneuroblastoma EFO_0000502 MONDO_0005035 NCIT_C3790
Ganglioneuroma EFO_0000500 MONDO_0005033 NCIT_C3049
Germ Cell Tumor EFO_0000514 MONDO_0005040 NCIT_C3708
Germinoma MONDO_0020580 MONDO_0020580 NCIT_C121618
Glial-neuronal tumor MONDO_0016729 MONDO_0016729 NCIT_C4747
Glial-neuronal tumor NOS MONDO_0016729 MONDO_0016729 NCIT_C4747
Glioblastoma MONDO_0018177 MONDO_0018177 NCIT_C30587
Glioblastoma Multiforme EFO_0000519 MONDO_0018177 NCIT_C3058
Head and Neck Squamous Cell Carcinoma EFO_0000181 MONDO_0010150 NCIT_C34447
Hemangioblastoma MONDO_0016748 MONDO_0016748 NCIT_C3801
Hepatoblastoma EFO_1000292 MONDO_0018666 NCIT_C3728
Hepatocellular Carcinoma EFO_0000182 MONDO_0007256 NCIT_C3099
Hepatocellular neoplasm NOS NA NA NA
High-grade glioma MONDO_0100342 MONDO_0100342 NCIT_C4822
High-grade neuroepithelial tumor NA NA NA
Histiocytic tumor MONDO_0020081 MONDO_0020081 NCIT_C9294
Hodgkin's lymphoma EFO_0000183 MONDO_0004952 NCIT_C9357
Infant-type hemispheric glioma EFO_0005543 MONDO_0014695 NCIT_C185471
Infantile Fibrosarcoma MONDO_0002678 MONDO_0002678 NCIT_C4244
Infantile hemispheric glioma NA NA NA
Inflammatory Myofibroblastic Tumor MONDO_0015798 MONDO_0015798 NCIT_C6481
Intrahepatic Cholangiocarcinoma EFO_1001961 MONDO_0003210 NCIT_C35417
Intraneural perineuroma MONDO_0015032 MONDO_0015032 NCIT_C6911
Juvenile xanthogranuloma EFO_1000311 MONDO_0015534 NCIT_C3451
Langerhans Cell histiocytosis EFO_1000318 MONDO_0018310 NCIT_C3107
Expand All @@ -87,22 +84,22 @@ Mesenchymal tumor EFO_1000473 MONDO_0003512 NCIT_C7059
Mesothelioma EFO_0000588 MONDO_0005065 NCIT_C3234
Metastatic secondary tumors EFO_0009812 MONDO_0024883 NCIT_C4968
Mixed germ cell tumor MONDO_0015864 MONDO_0015864 NCIT_C4290
Myeloid Leukemia Associated with Down Syndrome NA NA NA
Myeloid Sarcoma NA NA NA
Neuroblastoma EFO_0000621 MONDO_0005072 NCIT_C3270
Neuroepithelial tumor with PATZ1 fusion NA NA NA
Neurofibroma/Plexiform EFO_0000658 MONDO_0003304 NCIT_C3797
Non-Hodgkin Lymphoma EFO_0005952 MONDO_0018908 NCIT_C3211
Non-germinomatous germ cell tumor MONDO_0020580 MONDO_0020580 NCIT_C121619
Oligodendroglioma EFO_0000632 MONDO_0016695 NCIT_C3288
Osteosarcoma EFO_0000637 MONDO_0009807 NCIT_C9145
Other tumor NA NA NA
Ovarian Serous Cystadenocarcinoma EFO_1000043 MONDO_0006046 NCIT_C7978
Pancreatic Adenocarcinoma EFO_1000044 MONDO_0006047 NCIT_C8294
Perineuroma MONDO_0019404 MONDO_0019404 NCIT_C4973
Pancreatoblastoma NA NA NA
Papillary Carcinoma NA NA NA
Pheochromocytoma and Paraganglioma EFO_0020005 MONDO_0035540 NA
Pilocytic astrocytoma Orphanet_251612 MONDO_0016691 NCIT_C4047
Pineoblastoma EFO_1000475 MONDO_0016722 NCIT_C9344
Pineocytoma EFO_1000476 MONDO_0016723 NCIT_C6966
Pleomorphic xanthoastrocytoma MONDO_0016690 MONDO_0016690 NCIT_C4323
Primary intracranial sarcoma NA NA NA
Primary mediastinal large B cell lymphoma MONDO_0004021 MONDO_0020323 NCIT_C9280
Prostate Adenocarcinoma EFO_0000673 MONDO_0005082 NCIT_C2919
Rectum Adenocarcinoma EFO_0005631 MONDO_0002169 NCIT_C9383
Expand All @@ -115,15 +112,18 @@ Rosai-Dorfman disease MONDO_0006412 MONDO_0006412 NCIT_C36075
Rosette-forming glioneuronal tumor MONDO_0016736 MONDO_0016736 NCIT_C129431
Sarcoma EFO_0000691 MONDO_0005089 NCIT_C9118
Schwannoma EFO_0000693 MONDO_0002546 NCIT_C3269
Small Cell Carcinoma NA NA NA
Spindle cell neoplasm NA NA NA
Stomach Adenocarcinoma EFO_0000503 MONDO_0005036 NCIT_C4004
Subependymal Giant Cell Astrocytoma MONDO_0016693 MONDO_0016693 NCIT_C3696
T Acute Lymphoblastic Leukemia/Lymphoma EFO_0000209 MONDO_0004963 NCIT_C3183
Teratoma MONDO_0002601 MONDO_0002601 NCIT_C3403
Testicular Germ Cell Tumor EFO_1000566 MONDO_0010108 NCIT_C8591
Thymoma EFO_1000581 MONDO_0006456 NCIT_C3411
Thyroid Carcinoma EFO_0002892 MONDO_0015075 NCIT_C4815
Thyroid Gland Follicular Carcinoma EFO_0000501 MONDO_0005034 NCIT_C8054
Thyroid Gland Papillary Carcinoma EFO_0000641 MONDO_0005075 NCIT_C4035
Thyroid gland neoplasm NA NA NA
Type I Pleuropulmonary Blastoma NA NA NA
Uterine Carcinosarcoma EFO_1000613 MONDO_0006485 NCIT_C42700
Uterine Corpus Endometrial Carcinoma EFO_0007532 MONDO_0000553 NCIT_C159413
Uveal Melanoma EFO_1000616 MONDO_0006486 NCIT_C7712
Expand Down
553 changes: 295 additions & 258 deletions analyses/independent-samples/00-repeated-samples.nb.html

Large diffs are not rendered by default.

1,531 changes: 930 additions & 601 deletions analyses/independent-samples/03-qc-independent-samples.nb.html

Large diffs are not rendered by default.

Loading

0 comments on commit d0b717c

Please sign in to comment.