forked from AlexsLemonade/OpenPBTA-analysis
-
Notifications
You must be signed in to change notification settings - Fork 14
/
01-add-ploidy-cnvkit.Rmd
142 lines (112 loc) · 5.55 KB
/
01-add-ploidy-cnvkit.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
---
title: "Add ploidy column, status to CNVkit output"
output: html_notebook
author: J. Taroni for ALSF CCDL
date: 2019
---
The `histologies.tsv` file contains a `tumor_ploidy` column, which is tumor ploidy as inferred by ControlFreeC.
The copy number information should be interpreted in the light of this information (see: [current version of Data Formats section of README](https://github.com/AlexsLemonade/OpenPBTA-analysis/tree/390f1e08e481da5ec0b2c62d886d5fd298bbf017#data-formats)).
This notebook adds ploidy information to the CNVkit results and adds a status column that defines gain and loss broadly.
```{r}
library(dplyr)
```
### Read in data
```{r}
cnvkit_file <- file.path("..", "..", "data", "cnv-cnvkit.seg.gz")
cnvkit_df <- readr::read_tsv(cnvkit_file)
```
```{r}
histologies_file <- file.path("..", "..", "data", "histologies-base.tsv")
histologies_df <- readr::read_tsv(histologies_file, guess_max = 100000)
```
# Edits for https://github.com/PediatricOpenTargets/ticket-tracker/issues/177
```{r}
# Filter to WXS tumor with no germline sex estimate
no_estimate <- histologies_df %>%
filter(sample_type == "Tumor" & experimental_strategy == "WXS" &
is.na(germline_sex_estimate)) %>%
select(-germline_sex_estimate)
# Extract participant ID with no germline sex estimate
no_estimate_participants <- no_estimate %>%
pull(Kids_First_Participant_ID) %>%
unique()
# Find out matching WGS
# Use Kids_First_Participant_ID for filtering samples that have WGS, when WXS was performed
no_estimate_participants_match_wgs <- histologies_df %>%
filter(Kids_First_Participant_ID %in% no_estimate_participants &
experimental_strategy == "WGS" & sample_type == "Tumor") %>%
select(Kids_First_Participant_ID, germline_sex_estimate)
# Merge data that was fixed with WGS germline sex estimate
no_estimate_fixed_through_wgs <- left_join(no_estimate, no_estimate_participants_match_wgs
, by = "Kids_First_Participant_ID") %>%
filter(!is.na(germline_sex_estimate))
# Identify data that was not fixed due to NA
no_estimate_still_na <- left_join(no_estimate, no_estimate_participants_match_wgs
, by = "Kids_First_Participant_ID") %>%
filter(is.na(germline_sex_estimate))
# Fix NA data using gender
no_estimate_still_na_fixed <- no_estimate_still_na %>%
mutate(germline_sex_estimate = case_when(
reported_gender == "Female" ~ "Female",
reported_gender == "Male" ~ "Male",
TRUE ~ as.character(NA))
)
# Define column type to avoid error
no_estimate_fixed_through_wgs$germline_sex_estimate <- as.character(no_estimate_fixed_through_wgs$germline_sex_estimate)
# Merge back fixed_wgs, fixed_na, and exist_germline_estimate data
histologies_df_fixed <- bind_rows(no_estimate_fixed_through_wgs, no_estimate_still_na_fixed)
# remove samples in histologies_df_fixed from histologies_df
histologies_df_rm <- histologies_df %>%
anti_join(histologies_df_fixed, by = "Kids_First_Biospecimen_ID")
# Define column type to avoid error
histologies_df_rm$germline_sex_estimate <- as.character(histologies_df_rm$germline_sex_estimate)
# add new rows back
histologies_df_final <- histologies_df_rm %>%
bind_rows(histologies_df_fixed)
```
### Add inferred ploidy information to CNVkit results
```{r}
add_ploidy_df <- histologies_df_final %>%
select(Kids_First_Biospecimen_ID, tumor_ploidy, germline_sex_estimate) %>%
inner_join(cnvkit_df, by = c("Kids_First_Biospecimen_ID" = "ID")) %>%
select(-tumor_ploidy, -germline_sex_estimate, everything())
```
### Add status column
This is intended to mirror the information contained in the ControlFreeC output.
```{r}
add_autosomes_df <- add_ploidy_df %>%
# x and y chromosomes must be handled differently
filter(!(chrom %in% c("chrX", "chrY"))) %>%
mutate(status = case_when(
# when the copy number is less than inferred ploidy, mark this as a loss
copy.num < tumor_ploidy ~ "loss",
# if copy number is higher than ploidy, mark as a gain
copy.num > tumor_ploidy ~ "gain",
copy.num == tumor_ploidy ~ "neutral"
))
# this logic is consistent with what is observed in the controlfreec file
# specifically, in samples where germline sex estimate = Female, X chromosomes
# appear to be treated the same way as autosomes
add_xy_df <- add_ploidy_df %>%
filter(chrom %in% c("chrX", "chrY")) %>%
mutate(status = case_when(
germline_sex_estimate == "Male" & copy.num < (0.5 * tumor_ploidy) ~ "loss",
germline_sex_estimate == "Male" & copy.num > (0.5 * tumor_ploidy) ~ "gain",
germline_sex_estimate == "Male" & copy.num == (0.5 * tumor_ploidy) ~ "neutral",
# there are some instances where there are chromosome Y segments are in
# samples where the germline_sex_estimate is Female
chrom == "chrY" & germline_sex_estimate == "Female" & copy.num > 0 ~ NA_character_,
chrom == "chrY" & germline_sex_estimate == "Female" & copy.num == 0 ~ "neutral",
# mirroring controlfreec, X treated same as autosomes
chrom == "chrX" & germline_sex_estimate == "Female" & copy.num < tumor_ploidy ~ "loss",
chrom == "chrX" & germline_sex_estimate == "Female" & copy.num > tumor_ploidy ~ "gain",
chrom == "chrX" & germline_sex_estimate == "Female" & copy.num == tumor_ploidy ~ "neutral"
))
add_status_df <- dplyr::bind_rows(add_autosomes_df, add_xy_df) %>%
dplyr::select(-germline_sex_estimate)
```
### Write to `scratch`
```{r}
output_file <- file.path("results", "cnvkit_with_status.tsv")
readr::write_tsv(add_status_df, output_file)
```