-
Notifications
You must be signed in to change notification settings - Fork 8
/
3_calc_PEER_adj_exp.R
executable file
·159 lines (120 loc) · 5.45 KB
/
3_calc_PEER_adj_exp.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
####by Heather E. Wheeler 20140602####
#Modifications by NWK 01/08/2015 ####
library(peer)
library(preprocessCore)
library(GenABEL)
args <- commandArgs(trailingOnly=T)
date <- Sys.Date()
if(length(args)!=4){
print("ERROR!:")
print(length(args))
stop()
}
###############################################
### Directories & Variables
tissue <- args[1]
my.dir <- args[2]
project.name <- args[3]
out.dir <- args[4]
Nk <- 15 ##number of peer factors to calculate, recommend 25% of sample size, but no more than 100, GTEx included 15 in pilot analyses
tis <- gsub(" ","",tissue)
#tissue <- "Nerve - Tibial" ###check GTEx_Analysis_2014-06-13.SampleTissue.annot for available tissues###
setwd(my.dir)
filenames <- paste0(project.name,c(".SampleTissue.annot",
".RNA-seq.ID.list",
".RNA-seq.GENE.list",
".protein-coding.RDS",
".fam",
".20genotPCs.txt",
".protein-coding.anno.RDS"))
if(!all(file.exists(filenames))){
print("Unable to find files:")
print(sum(!file.exists(filenames)))
print(paste(filenames[!file.exists(filenames)]))
stop()
}
names(filenames) <- c("SampleTissue","RNA-ID","RNA-Genelist","RNA-matrix","fam","PC","ANNO.RDS")
print("Reading SampleTissue")
sam <- read.table(filenames["SampleTissue"],header=T,sep="\t",stringsAsFactors=F,quote='',comment.char='')
sample <- subset(sam,SMTSD == tissue) ### pull sample list of chosen tissue###
expdata <- t(readRDS(filenames["RNA-matrix"]))
expanno <- readRDS(filenames["ANNO.RDS"])
rownames(expanno) <- expanno$id
expdata <- expdata[intersect(rownames(expdata),sample$SAMPID),] ###pull expression data for chosen tissue###
expdf <- data.frame(id=substr(rownames(expdata),1,10),expdata,stringsAsFactors=F)
print("Reading fam")
fam <- read.table(filenames["fam"],stringsAsFactors=F)
gtsamplelist <- fam$V1
gtsamplelist <- substr(gtsamplelist,1,10) ###to match with exp data###
rownames(fam) <- gtsamplelist
samplelist <- intersect(rownames(fam),expdf$id)
nsample <- length(samplelist)
expdf <- aggregate(.~id,data=expdf,FUN=mean)
rownames(expdf) <- expdf$id
expdata <- data.matrix(expdf[,-1])
expdata <- expdata[samplelist,]
expdata <- expdata[,apply(expdata,2,function(x)mean(x)>0)]
###get first 3 PCs from genos, as in GTEx
print("Reading PCs")
pc.matrix<-read.table(filenames["PC"],header=T,stringsAsFactors=F)
rownames(pc.matrix) <- substr(pc.matrix[,1],1,10)
pc.matrix <- data.matrix(pc.matrix[,3:5])
pc.matrix <- pc.matrix[samplelist,]
###pull gender, used as cov in GTEx
fam <- fam[samplelist,]
gender <- fam$V5
names(gender)<-rownames(fam)
###quantile normalize and transform to standard normal exp.w.geno matrix, as in GTEx###
t.expdata <- t(expdata)
rowtable<-function(x) length(table(x))>2 ##function to determine if more than 2 exp levels per gene
nonbin<-apply(t.expdata,1,rowtable) ##apply to matrix
t.expdata <- t.expdata[nonbin,] ##remove binary genes from matrix
qn.t.expdata <- normalize.quantiles(t.expdata) ##quantile normalize
nonbin <- apply(qn.t.expdata,1,rowtable)
qn.t.expdata <- qn.t.expdata[nonbin,]
rn.qn.t.expdata <- apply(qn.t.expdata,1,rntransform) ##rank transform to normality & transposes, not sure why?##
###Now we can create the model object, ### from https://github.com/PMBio/peer/wiki/Tutorial
model = PEER()
###set the observed data,
PEER_setPhenoMean(model,data.matrix(rn.qn.t.expdata))
dim(PEER_getPhenoMean(model))
###(NULL response means no error here), say we want to infer K=20 hidden confounders,
PEER_setNk(model,Nk)
PEER_getNk(model)
####and perform the inference. ###for Nk=20 and GTEx-NT, it took 323 iterations, for Nk=15 and GTEx-NT, it took 37 iterations
PEER_update(model)
factors = PEER_getX(model)
rownames(factors) <- rownames(expdata)
#Change to given tissue directory (Make it if it doesn't exist)
if(!file.exists(tis)){
dir.create(tis,showWarnings=T)
}
setwd(tis)
filename <- paste0(tis,".",Nk,".PEER.factors.",date,".txt")
write.table(factors,file =filename, quote=F)
weights = PEER_getW(model)
precision = PEER_getAlpha(model)
residuals = PEER_getResiduals(model)
##png(file= paste0(tis,".",Nk,".PEER.factors.plotmodel.",date,".png"))
##PEER_plotModel(model)
##dev.off()
adj.exp.matrix<-matrix(NA,nrow=nrow(rn.qn.t.expdata),ncol=ncol(rn.qn.t.expdata))
for(i in 1:ncol(rn.qn.t.expdata)){
res <- summary(lm(rn.qn.t.expdata[,i] ~ factors + pc.matrix + gender, na.action=na.exclude))
resid <- residuals(res)
adj.exp.matrix[,i] <- resid
}
colnames(adj.exp.matrix) <- rownames(t.expdata)
rownames(adj.exp.matrix) <- colnames(t.expdata)
outfiles <- paste0(out.dir,project.name,".",tis,".exp.adj.",Nk,c("PEERfactors.3PCs.gender.IDxGENE",
"PEERfactors.3PCs.gender.GENE.list",
"PEERfactors.3PCs.gender.ID.list",
"PEERfactors.3PCs.gender.ANNO."))
names(outfiles) <- c("IDxGENE",
"GENE.list",
"ID.list",
"ANNO.RDS")
write.table(adj.exp.matrix, file= outfiles["IDxGENE"], quote=F, row.names=F, col.names=F)
saveRDS(adj.exp.matrix,file=paste0(outfiles["IDxGENE"],".RDS"))
write(colnames(adj.exp.matrix), file = outfiles["GENE.list"], ncolumns=1)
write(rownames(adj.exp.matrix), file = outfiles["ID.list"], ncolumns=1)