scRNA_workspace_v3.1.R
# Seurat v3.1 pipeline
library(Seurat)
library(scImpute)
library(dplyr)
library(magrittr)
library(pheatmap)
library(RColorBrewer)
library(scales)
library(xlsx)
# import UMAP
# IMPORTANT: Have to set path before loading reticulate package
Sys.setenv(PATH = paste("C:/Users/Stephanie/Anaconda3/envs/r-reticulate/Library/bin", Sys.getenv("PATH"), sep = ";"))
Sys.setenv(RETICULATE_PYTHON = "C:/Users/Stephanie/Anaconda3/envs/r-reticulate/python.exe")
library(reticulate)
use_condaenv("r-reticulate")
py_config() # check to make sure that Numpy is found
import("umap")
#py_install("umap-learn") # Only run if haven't installed umap python package
# Run through the analysis once first; if you then see batch effects between groups,
# correct for them (see the optional integration sketch after the merge step below)
# PREPROCESSING
# using filtered as example
KOFIL.data <- Read10X("KO/KO/filtered_gene_bc_matrices/mm10/")
WTFIL.data <- Read10X("WT/WT/filtered_gene_bc_matrices/mm10/")
# Prefix the cell names with a group label if you have more than one group
colnames(KOFIL.data) <- paste("KO_", colnames(KOFIL.data), sep = '')
colnames(WTFIL.data) <- paste("WT_", colnames(WTFIL.data), sep = '')
# # Dropout correction
# # THIS NEEDS A LOT OF MEMORY TO RUN!
# # Will take some time to run
# combined.data <- cbind(KOFIL.data, WTFIL.data) # merge data
# saveRDS(combined.data, file = 'raw_count_combined_KPC.rds')
# count_path <- "./raw_count_combined_KPC.rds"
# infile="rds"
# outfile="rds"
# drop_thre=0.5 # threshold to determine dropout values
# ncores=1 # if working on Windows, set to 1
# out_dir="./"
# combined_scimpute <- scimpute(count_path, infile, outfile, out_dir,
# labeled = FALSE, drop_thre, ncores = ncores,
# type = "count", Kcluster = 5)
# scimpute_count <- readRDS("scimpute_count.rds")
# STANDARD WAY (without scImpute dropout imputation)
# Create KO and WT Seurat objects
KOFIL <- CreateSeuratObject(counts = KOFIL.data, project = 'KOFIL', min.cells = 3, min.features = 100)
WTFIL <- CreateSeuratObject(counts = WTFIL.data, project = 'WTFIL', min.cells = 3, min.features = 100)
# Add any meta.data needed
KOFIL@meta.data$group <- "KO"
WTFIL@meta.data$group <- "WT"
# If you have more than one object, merge them into a single Seurat object
# add.cell.ids prefixes the cell names to distinguish groups (optional here,
# since the column names were already prefixed with KO_/WT_ above)
merge <- merge(x = KOFIL, y = WTFIL, add.cell.ids = c('KO', 'WT'))
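# OPTIONAL: if the first pass shows clear batch effects between KO and WT, one alternative
# to a plain merge is Seurat v3 integration. This is only a minimal sketch (not part of the
# original pipeline); dims = 1:30 and nfeatures = 2000 are assumed defaults, tune as needed.
# obj.list <- list(KOFIL, WTFIL)
# obj.list <- lapply(obj.list, function(x) {
#   x <- NormalizeData(x)
#   FindVariableFeatures(x, selection.method = "vst", nfeatures = 2000)
# })
# anchors <- FindIntegrationAnchors(object.list = obj.list, dims = 1:30)
# merge <- IntegrateData(anchorset = anchors, dims = 1:30)
# DefaultAssay(merge) <- "integrated" # downstream steps then run on the integrated assay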
# Cell cycle genes (note: Seurat's cc.genes list uses human symbols; for mm10 data
# the genes may need to be converted to mouse symbols before scoring)
s_genes <- cc.genes$s.genes
g2m_genes <- cc.genes$g2m.genes
# # Create Seurat object
# merge <- CreateSeuratObject(counts = scimpute_count, project = 'MERGED', min.cells = 3, min.features = 100)
# # Add groups to meta.data
# merge$group <- Idents(object = merge)
# IMPORTANT
# Changing between meta.data for identities
Idents(object = merge) <- 'group'
# Check active identity
levels(merge)
# QC
# get the percent of reads mapping to mitochondrial genes
merge[["percent.mt"]] <- PercentageFeatureSet(object = merge, pattern = "^mt-")
# Look at nFeature, nCount, percent.mt
VlnPlot(object = merge, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3,
pt.size = .5)
# Another way to look at nFeature, nCount, percent.mt
plot1 <- FeatureScatter(object = merge, feature1 = "nCount_RNA", feature2 = "percent.mt")
plot2 <- FeatureScatter(object = merge, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
CombinePlots(plots = list(plot1, plot2))
# Filter cells on counts and mitochondrial content
# keeping cells with nCount_RNA < 50000 and percent.mt < 25%
merge <- subset(x = merge, subset = nCount_RNA < 50000 & percent.mt < 25)
# Normalize merged data
merge <- NormalizeData(object = merge, normalization.method = "LogNormalize",
scale.factor = 10000)
# Can use code below if want default parameters (same thing as above)
# merge <- NormalizeData(object = merge)
# find subset of features that exhibit high cell-to-cell variation in the dataset
# vst --> loess.span, clip.max, nfeatures
merge <- FindVariableFeatures(object = merge, selection.method = 'vst', loess.span = .3, nfeatures = 2000)
# change how many top variable genes to keep with nfeatures
top10 <- head(x = VariableFeatures(object = merge), 10)
plot1 <- VariableFeaturePlot(object = merge)
plot2 <- LabelPoints(plot = plot1, points = top10)
CombinePlots(plots = list(plot1, plot2))
# ignore warning for now
# mean.var.plot --> mean.function, dispersion.function, num.bin, mean/dispersion.cutoff
# merge <- FindVariableFeatures(object = merge, selection.method = 'mean.var.plot',
#                               mean.cutoff = c(0.0125,Inf), dispersion.cutoff = c(0.5,Inf))
# Scale the data
# if the features parameter is left blank, only the variable genes are scaled
# if not all genes are scaled, DoHeatmap may not work for genes outside that set
# OPTION 1:
# all.genes <- rownames(merge)
# merge <- ScaleData(object = merge, features = all.genes,
#                    vars.to.regress = c("nCount_RNA", "percent.mt"))
# OPTION 2:
# merge <- ScaleData(object = merge,
#                    vars.to.regress = c("nCount_RNA", "percent.mt"))
# OPTION 3: w/ cell cycling correction
all.genes <- rownames(merge)
# use if you want all cell cycle signal regressed out (run this ScaleData call OR the one below, not both)
merge <- CellCycleScoring(object = merge, s.features = s_genes, g2m.features = g2m_genes, set.ident = TRUE)
merge <- ScaleData(object = merge, features = all.genes,
                   vars.to.regress = c("nCount_RNA", "percent.mt", "S.Score", "G2M.Score"))
# use if you want only part of the cell cycle signal regressed out (the cycling vs. non-cycling distinction is preserved)
merge$CC.Difference <- merge$S.Score - merge$G2M.Score
merge <- ScaleData(object = merge, features = all.genes,
                   vars.to.regress = c("nCount_RNA", "percent.mt", "CC.Difference"))
# Dimensional Reduction: PCA
# Can control the number of pcs that you want to calculate w/ npcs
merge <- RunPCA(object = merge, features = VariableFeatures(object = merge),
nfeatures.print = 5)
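# e.g., to set the number of PCs explicitly (50 is already the default in Seurat v3;
# shown only to illustrate the npcs argument mentioned above):
# merge <- RunPCA(object = merge, features = VariableFeatures(object = merge),
#                 npcs = 50, nfeatures.print = 5)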
# Choosing how many PCs for clustering
# visualizing pcs
ElbowPlot(object = merge, ndims = 50)
# JackStraw if you want
# merge <- JackStraw(object = merge, num.replicate = 100)
# merge <- ScoreJackStraw(object = merge, dims = 1:20)
# JackStrawPlot(object = merge, dims = 1:15)
# Calculate the cumulative variance explained; keep enough PCs to pass 90% (see the helper sketch below)
stdev <- merge[['pca']]@stdev
var <- (merge[['pca']]@stdev)^2
sum(var[1:31])/sum(var)
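# Small helper (not in the original script): find the smallest number of PCs whose
# cumulative variance passes 90%, instead of checking sums like 1:31 by hand
cum_var <- cumsum(var) / sum(var)
n_pcs <- which(cum_var > 0.90)[1]
n_pcs # the original run settled on 31 dims below; this just automates the same check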
# CLUSTERING
merge <- FindNeighbors(object = merge, dims = 1:31, k.param = 20)
merge <- FindClusters(object = merge, resolution = 0.5)
head(x = Idents(object = merge), 5)
# VISUALIZATION: TSNE/UMAP
# TSNE example
merge <- RunTSNE(merge, dims = 1:31)
DimPlot(object = merge, reduction = "tsne")
# UMAP example
merge <- RunUMAP(merge, dims = 1:31)
DimPlot(object = merge, reduction = 'umap')
# LABELING CLUSTERS
# Visualizing expression
FeaturePlot(merge, features = c("Krt18", "Cdh11", "Ptprc", "Hba-a2", "Cdh5"))
DotPlot(merge, features = c("Krt18", "Cdh11", "Ptprc", "Hba-a2", "Cdh5"))
# differential expression
# markers for cluster 1
cluster1.markers <- FindMarkers(object = merge, ident.1 = 1, min.pct = 0.25)
head(x = cluster1.markers, n = 5)
# markers for all clusters
merge.markers <- FindAllMarkers(object = merge, only.pos = TRUE, min.pct = 0.25,
logfc.threshold = 0.25)
merge.markers %>% group_by(cluster) %>% top_n(n = 2, wt = avg_logFC)
# Putting new labels on clusters
new.cluster.ids <- c("A", "B", "C", "D", "E", "F","G", "H", "I", "J", "K", "L", "M", "N",
"O")
names(x = new.cluster.ids) <- levels(x = merge)
merge <- RenameIdents(object = merge, new.cluster.ids)
DimPlot(object = merge, reduction = "umap", label = TRUE, pt.size = 0.5) + NoLegend()
merge$labeled_clusters <- Idents(object = merge) # save identity to meta.data
# DON'T MOVE ON TO LABELING UNTIL YOU'VE FILTERED ENOUGH
# otherwise, go back and repeat from the filtering step
# VISUALIZATIONS
# Get top n genes for each cluster
top10 <- merge.markers %>% group_by(cluster) %>% top_n(n = 10, wt = avg_logFC)
# DoHeatmap
DoHeatmap(object = merge, features = top10$gene, size = 2) + NoLegend()
# pheatmap
# make sure there are no repeated genes when fetching data (hence unique())
cells <- FetchData(object = merge, vars = c(unique(top10$gene), 'labeled_clusters'),
slot = 'scale.data')
cells <- cells[order(cells$labeled_clusters),]
anno <- data.frame(cluster = cells$labeled_clusters)
rownames(anno) <- rownames(cells)
cells <- apply(cells[,1:147], MARGIN = 2, FUN = function(x) (squish(x, range = c(-2.5,2.5))))
pheatmap(mat = t(cells[,1:147]), show_colnames = F,
         cluster_rows = F, cluster_cols = F,
         annotation_col = anno, fontsize = 7,
         color = colorRampPalette(colors = c('#8A2BE2','#000000','#FFFF00'))(250))
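# The hard-coded 147 above is the number of unique genes fetched in this particular run.
# A slightly more robust sketch (assumption: same 'top10' object as above) derives that
# count instead, so the indexing keeps working when the gene list changes:
n_genes <- length(unique(top10$gene))
# ...then use cells[, 1:n_genes] in place of cells[, 1:147] in the two calls above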
# Subset specific cell type: myeloid
Idents(merge) <- 'labeled_clusters'
myeloid <- subset(merge, subset = labeled_clusters == 'Myeloid')
# --------------
# 8/20/20
# new snippet (assumes an existing KMT2D.combined object with sample IDs in the 'ID'
# metadata column): create a metadata column based on the current idents
levels(KMT2D.combined)
# LOW - 1262, 1261, 1296, 1141, 3210
# HIGH - 1299, 1314, 1302, 1246, 1229
old_ids <- levels(KMT2D.combined)
new_ids <- c('low', 'low', 'low', 'low', 'low', 'high', 'high', 'high', 'high', 'high')
KMT2D.combined$test <- plyr::mapvalues(KMT2D.combined$ID, from = old_ids, to = new_ids)
Idents(KMT2D.combined) <- 'test'
levels(KMT2D.combined)
table(KMT2D.combined$test, KMT2D.combined$ID)