-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsnATACseq_unconstrained_integration.R
172 lines (134 loc) · 6.66 KB
/
snATACseq_unconstrained_integration.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
#--------------------------------------------------------------------------------------
#
# ArchR - Unconstrained integration
#
#--------------------------------------------------------------------------------------
## Resources ------------------------------------------------------------------------
# ArchR manual - https://www.archrproject.com/index.html
# ArchR GitHiub - https://github.com/GreenleafLab/ArchR
# Summarized Expriment - https://bioconductor.org/packages/release/bioc/vignettes/SummarizedExperiment/inst/doc/SummarizedExperiment.html
# Harmony - Github - https://github.com/immunogenomics/harmony
# Granges - https://kasperdanielhansen.github.io/genbioconductor/html/GenomicRanges_GRanges_Usage.html
## Info ------------------------------------------------------------------------------
# snATAC-seq - run unconstrained and contrained integration
# Here we map the RNA-seq cluster IDs to our ATAC-seq clusters. It is a 2-step process.
# First unconstrained integration is run which is a broad pass at mapping the RNA-seq
# cluster IDs to the ATAC-seq clusters. Then the constrained integration is run, this
# time by adding supervised cell groupings as IDed in the unconstrained analysis
# i.e. InNs (InN-1, InN-2) and ExNs (ExN-1, ExN-2) etc. are clumped. This enables more
# accurate cell mapping between the modalites.
## Load Packages --------------------------------------------------------------------
library(ArchR)
library(pheatmap)
library(tidyverse)
library(rmarkdown)
library(BSgenome.Hsapiens.UCSC.hg38)
library(ComplexHeatmap)
library(clustree)
library(cowplot)
library(argparser)
library(Seurat)
## Parse region / set region variable -------------------------------------------------
cat('\nParsing args ... \n')
p <- arg_parser("\nRead brain region and output directory for snATACseq QC ... \n")
p <- add_argument(p, "region", help = "No brain region specified")
p <- add_argument(p, "data_dir", help = "No input data directory specified")
p <- add_argument(p, "archR_out_dir", help = "No ArchR output directory specified")
p <- add_argument(p, "markdown_file", help = "No markdown file path specified")
p <- add_argument(p, "report_dir", help = "No report output directory specified")
p <- add_argument(p, "report_file", help = "No report filename specified")
p <- add_argument(p, "pre_or_post_clust_QC", help = "State if running pre- or post- clust QC")
args <- parse_args(p)
print(args)
## Define global variables -----------------------------------------------------------
cat('\nDefining variables ... \n')
REGION <- args$region
DATA_DIR <- args$data_dir
OUT_DIR <- args$archR_out_dir
MARKDOWN_FILE <- args$markdown_file
REPORT_DIR <- args$report_dir
REPORT_FILE <- args$report_file
CLUST_QC_STATUS <- args$pre_or_post_clust_QC
addArchRThreads(threads = 8) # Set Hawk to 32 cores so 0.75 of total
addArchRGenome("hg38")
## Load ArchR project -------------------------------------------------------------------
cat(paste0('\nLoading ArchR snATACseq project for ', REGION, ' ... \n'))
archR <- loadArchRProject(path = OUT_DIR)
## Integrating snATACseq with snRNAseq - Cptr 8 -----------------------------------------
# Load Seurat RNA data
cat(paste0('\nLoading Seurat snRNAseq data for ', REGION, ' ... \n'))
if (REGION == 'FC') {
cat(paste0('\nLoading Seurat object for and region specific variables for', REGION, ' ... \n'))
seurat.obj <- readRDS("../resources/R_objects/seurat.pfc.final.rds")
seurat.obj$cellIDs <- gsub('FC-', '', seurat.obj$cellIDs)
} else {
cat(paste0('\nLoading Seurat object for and region specific variables for', REGION, ' ... \n'))
seurat.obj <- readRDS("../resources/R_objects/seurat.wge.final.rds")
seurat.obj$cellIDs <- gsub('GE-', '', seurat.obj$cellIDs)
}
# Specify if script is being run pre-or post cluster QC
if (CLUST_QC_STATUS == 'PRE') {
reducedDim_ID <- 'IterativeLSI'
clust_ID <- 'Clusters'
UMAP_ID <- 'UMAP'
} else {
reducedDim_ID <- 'IterativeLSI_reclust'
clust_ID <- 'Clusters_reclust'
UMAP_ID <- 'UMAP_reclust'
}
# Run unconstrained integration
cat('\nRunning unconstrained integration ... \n')
archR.2 <- addGeneIntegrationMatrix(
ArchRProj = archR,
useMatrix = "GeneScoreMatrix",
matrixName = "GeneIntegrationMatrix",
reducedDims = reducedDim_ID,
seRNA = seurat.obj,
addToArrow = FALSE,
groupRNA = "cellIDs",
nameCell = "predictedCell_Un",
nameGroup = "predictedGroup_Un",
nameScore = "predictedScore_Un"
)
## Unconstrained integration - reporting ---------------------------------------------
# Confusion matrix - unconstrained cell mappings
cat('\nCreating confusion matrix ... \n')
cM_geneExp <- as.matrix(confusionMatrix(unname(unlist(getCellColData(archR.2)[clust_ID])),
archR.2$predictedGroup_Un))
clust_CM_geneExp <- pheatmap::pheatmap(
mat = as.matrix(cM_geneExp),
color = paletteContinuous("whiteBlue"),
border_color = "black", display_numbers = TRUE, number_format = "%.0f",
cluster_rows = F, # Needed for row order https://stackoverflow.com/questions/59306714
treeheight_col = 0,
treeheight_row = 0,
number_color = 'black'
)
# Get df of top cellID matches from RNA for each ATAC cluster
preClust <- colnames(cM_geneExp)[apply(cM_geneExp, 1 , which.max)]
integration_df <- t(as.data.frame(cbind(preClust, rownames(cM_geneExp)))) #Assignments
rownames(integration_df) <- c("RNA", "ATAC")
colnames(integration_df) <- NULL
# Plot RNA and ATAC UMAPs for comparison
cat('\nCreating UMAP ... \n')
clusters_UMAP <- plotEmbedding(ArchRProj = archR.2, colorBy = "cellColData",
name = clust_ID, embedding = UMAP_ID) +
NoLegend() + ggtitle('Clusters')
# Prepare cell groupings for constrained integration
# Only cell-types in preClust need to be included
cM_unconstrained <- as.matrix(confusionMatrix(unname(unlist(getCellColData(archR.2)[clust_ID])),
archR.2$predictedGroup_Un))
preClust <- colnames(cM_unconstrained)[apply(cM_unconstrained, 1 , which.max)]
cM_unconstrained2 <- cbind(preClust, rownames(cM_unconstrained))
unique(unique(archR.2$predictedGroup_Un))
## Save ArchR project ----------------------------------------------------------------
cat('\nSaving project ... \n')
saveArchRProject(ArchRProj = archR.2,
outputDirectory = OUT_DIR,
load = FALSE)
## Create markdown doc ---------------------------------------------------------------
cat('\nCreating markdown report ... \n')
render(MARKDOWN_FILE, output_file = REPORT_FILE, output_dir = REPORT_DIR)
cat('\nDONE.\n')
#--------------------------------------------------------------------------------------
#--------------------------------------------------------------------------------------