-
Notifications
You must be signed in to change notification settings - Fork 0
/
reformatthyroid.R
90 lines (73 loc) · 3.44 KB
/
reformatthyroid.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
# thyroid
# data obtained from: http://gdac.broadinstitute.org/
# miRNAseq
miRNAseq <- read.csv('THCA.mirnaseq__illuminahiseq_mirnaseq__bcgsc_ca__Level_3__miR_gene_expression__data.data.txt',sep='\t')
row.names(miRNAseq) <- miRNAseq[,1]
miRNAseq <- miRNAseq[,-1]
df <- miRNAseq
cols <- colSums(mapply('==', 'read_count', df))
new.df <- df[,which(cols == 0)]
cols <- colSums(mapply('==', 'cross-mapped', new.df))
new.df <- new.df[,which(cols == 0)]
cols <- colSums(mapply('==', 'N', new.df))
new.df <- new.df[,which(cols == 0)]
new.df <- new.df[-1,]
myvec <- colnames(new.df)
n <- 3
colnames(new.df) <- vapply(strsplit(myvec, '\\.'), function(x) paste(x[seq.int(n)], collapse='.'), character(1L))
miRNAseq <- new.df
miRNAseq[] <- lapply(miRNAseq, function(x) as.numeric(as.character(x)))
# filter for variance
miRNAseq <- log2(miRNAseq+2)
CV <- apply(miRNAseq,1,sd)/(rowMeans(miRNAseq))
names <- names(CV)[CV>as.numeric(quantile(CV)[4])]
miRNAseq <- subset(miRNAseq, row.names(miRNAseq) %in% names)
# mRNAseq
#RNAseq2 <- fread('PRAD.rnaseqv2__illuminahiseq_rnaseqv2__unc_edu__Level_3__RSEM_genes__data.data.txt',sep='\t')
#RNAseq <- RNAseq2
RNAseq <- read.csv('THCA.rnaseqv2__illuminahiseq_rnaseqv2__unc_edu__Level_3__RSEM_genes_normalized__data.data.txt',sep='\t')
row.names(RNAseq) <- RNAseq[,1]
RNAseq <- RNAseq[,-1]
df <- RNAseq
cols <- colSums(mapply('==', 'scaled_estimate', df))
new.df <- df[,which(cols == 0)]
cols <- colSums(mapply('==', 'transcript_id', new.df))
new.df <- new.df[,which(cols == 0)]
RNAseq <- new.df[-1,]
myvec <- colnames(RNAseq)
n <- 3
colnames(RNAseq) <- vapply(strsplit(myvec, '\\.'), function(x) paste(x[seq.int(n)], collapse='.'), character(1L))
RNAseq[] <- lapply(RNAseq, function(x) as.numeric(as.character(x)))
RNAseq <- RNAseq[complete.cases(RNAseq), ]
# filter for variance
RNAseq <- log2(RNAseq+2)
CV <- apply(RNAseq,1,sd)/(rowMeans(RNAseq))
names <- names(CV)[CV>as.numeric(quantile(CV)[4])]
RNAseq <- subset(RNAseq, row.names(RNAseq) %in% names)
# protein
protein <- read.csv('THCA.protein_exp__mda_rppa_core__mdanderson_org__Level_3__protein_normalization__data.data.txt',sep='\t')
protein <- protein[-1,]
row.names(protein) <- protein[,1]
protein <- protein[,-1]
myvec <- colnames(protein)
n <- 3
colnames(protein) <- vapply(strsplit(myvec, '\\.'), function(x) paste(x[seq.int(n)], collapse='.'), character(1L))
protein[] <- lapply(protein, function(x) as.numeric(as.character(x)))
protein <- protein[complete.cases(protein), ]
CV <- apply(protein,1,sd)/(rowMeans(protein))
names <- names(CV)[CV>as.numeric(quantile(CV)[4])]
protein <- subset(protein, row.names(protein) %in% names)
# clinical
clinical <- read.csv('THCA.clin.merged.picked.txt',sep='\t')
clinical <- data.frame(t(clinical))
colnames(clinical) <- as.character(unname(unlist(clinical[1,])))
clinical <- clinical[-1,]
row.names(clinical) <- toupper(row.names(clinical))
clinical$days_to_death <- as.numeric(as.character(clinical$days_to_death))
clinical$days_to_last_followup <- as.numeric(as.character(clinical$days_to_last_followup))
clinical$days_to_death[is.na(clinical$days_to_death)] <- clinical$days_to_last_followup[is.na(clinical$days_to_death)]
clinical <- clinical[c('vital_status','days_to_death')]
colnames(clinical) <- c('Death','Time')
# make a list of data and write the object
thyroid <- list(RNAseq=RNAseq,miRNAseq=miRNAseq,protein=protein,clinical=clinical)
rm(list=setdiff(ls(), "thyroid"))