PsychAD_r0/PsychAD_r0_analysis.Rmd

---
title: "Analysis of Alzheimer's Disease in [PsychAD](https://adknowledgeportal.synapse.org/Explore/Projects/DetailsPage?Grant%20Number=R01AG067025)"
subtitle: 'Public Release 0'
author: "Developed by [Gabriel Hoffman](http://gabrielhoffman.github.io/)"
date: "Run on `r Sys.time()`"
output: 
  html_document:
    toc: true
    smart: true
---


```{r setup, include=FALSE}
# Global knitr chunk defaults: hide code, warnings and messages in the
# rendered report, write every figure as both PNG and PDF, and cache results.
knitr::opts_chunk$set(
  echo = FALSE,
  warning = FALSE,
  message = FALSE,
  error = FALSE,
  tidy = FALSE,
  dev = c("png", "pdf"),
  package.startup.message = FALSE,
  cache = TRUE,
  cache.lazy = FALSE
)
```

# Load libraries
```{r load.packages, cache=FALSE}
suppressPackageStartupMessages({
library(SingleCellExperiment)
library(zellkonverter)
library(dreamlet)
library(zenith)
library(DelayedArray)
library(GSEABase)
library(muscat)
library(cowplot)
library(scater)
library(tidyverse)
library(kableExtra)
library(qvalue)
library(scattermore)
library(RColorBrewer)
library(corrplot)
library(viridis)
library(dplyr)
library(org.Hs.eg.db)
library(EnrichmentBrowser)
library(ggrepel)
})
```


```{r load.data}
# Path to the public release 0 AnnData (.h5ad) file
folder <- "/sc/arion/projects/psychAD/NPS-AD/public_release_0/"
file <- paste0(folder, "PsychAD_r0_Dec_28_2022.h5ad")

# Read the file as a SingleCellExperiment (requesting HDF5-backed storage)
sce <- readH5AD(file, use_hdf5 = TRUE, verbose = TRUE)

# Rename the first assay to "counts"
assayNames(sce)[1] <- "counts"

# Diagnosis as a factor with level order Control, AD
sce$Dx <- factor(sce$Dx_AD, c("Control", "AD"))
```

Public release 0 includes `r length(table(sce$Channel))` samples, `r length(table(sce$round_num))` rounds, `r length(table(sce$poolID))` [10X](https://www.10xgenomics.com/products/single-cell-gene-expression) pools, `r length(table(sce$SubID))` donors and `r format(ncol(sce), big.mark=',')` cells passing QC.


```{r ctorder}
# Display order of cell types (subclasses) used for all per-cell-type plots
ctorder <- c(
  "EN_L2_3_IT", "EN_L3_5_IT_1", "EN_L3_5_IT_2", "EN_L3_5_IT_3",
  "EN_L5_6_NP", "EN_L6_CT", "EN_L6_IT", "EN_NF",
  "IN_ADARB2", "IN_LAMP5", "IN_PVALB", "IN_PVALB_CHC", "IN_SST", "IN_VIP",
  "Oligo", "OPC", "Astro", "Micro_PVM",
  "CD8_T", "PC", "VLMC", "Endo"
)
```


# Summarize data
## UMAP
```{r umap}
# UMAP embedding colored by cell class, legend on the right, no text labels
plotProjection(
  sce, "X_umap", "class",
  legend.position = "right",
  text = FALSE
) +
  ggtitle("class")
```

```{r umap3}
# UMAP embedding colored by subclass, with subclasses ordered as in ctorder
plotProjection(sce, "X_umap", "subclass", order = ctorder) +
  ggtitle("subclass")
```

```{r umap9}
# UMAP embedding colored by subtype
plotProjection(sce, "X_umap", "subtype") +
  ggtitle("subtype")
```


## Summarize cell counts
```{r summarize.cell.counts, fig.width=5}
# Histogram of the number of cells per Channel
colData(sce)$Channel %>%
  table() %>%
  hist(
    main = paste0(
      "Cell counts per Channel: mean=", format(mean(.), digits = 2),
      ", median = ", format(median(.), digits = 2)
    )
  )

# Histogram of the number of cells per SubID
colData(sce)$SubID %>%
  table() %>%
  hist(
    main = paste0(
      "Cell counts per SubID: mean=", format(mean(.), digits = 2),
      ", median = ", format(median(.), digits = 2)
    )
  )

# Distribution, per subclass, of the number of cells observed in each Channel
colData(sce) %>%
  xtabs(~ Channel + subclass, .) %>%
  as_tibble() %>%
  pivot_longer(cols = Channel) %>%
  mutate(subclass = factor(subclass, ctorder)) %>%
  ggplot(aes(subclass, n, fill = subclass)) +
  geom_violin(color = NA) +
  geom_boxplot(width = .1, outlier.size = .1) +
  theme_classic() +
  theme(
    aspect.ratio = 1,
    plot.title = element_text(hjust = 0.5),
    legend.position = "none"
  ) +
  scale_y_log10() +
  coord_flip() +
  ylab("Number of cells observed per Channel") +
  xlab("")

# Same summary per Subject; the plot object is kept for the combined figure
fig.cells_subject <- colData(sce) %>%
  xtabs(~ SubID + subclass, .) %>%
  as_tibble() %>%
  pivot_longer(cols = SubID) %>%
  mutate(subclass = factor(subclass, ctorder)) %>%
  ggplot(aes(subclass, n, fill = subclass)) +
  geom_violin(color = NA) +
  geom_boxplot(width = .1, outlier.size = .1) +
  theme_classic() +
  theme(
    aspect.ratio = 1,
    plot.title = element_text(hjust = 0.5),
    legend.position = "none"
  ) +
  scale_y_log10() +
  coord_flip() +
  ylab("Number of cells observed per Subject") +
  xlab("")
fig.cells_subject
```


```{r combineData, message=TRUE} 
# Aggregate the "counts" assay to pseudobulk: one assay per subclass,
# one column per Channel, computed with 6 parallel workers
pb <- aggregateToPseudoBulk(
  sce,
  assay = "counts",
  cluster_id = "subclass",
  sample_id = "Channel",
  BPPARAM = SnowParam(6)
)
```

```{r write.pb, eval=FALSE}
# Save the pseudobulk object next to the source data
folder <- "/sc/arion/projects/psychAD/NPS-AD/public_release_0/"
file <- paste0(folder, "PsychAD_r0_Dec_28_2022_pseudobulk.RDS")

saveRDS(pb, file)
```

# Summarize demographics
```{r demo}
library(gridExtra)
library(grid)
library(cowplot)

# One row per distinct (SubID, Dx, Sex, Age) combination
df <- colData(pb) %>%
  as_tibble() %>%
  select(SubID, Dx, Sex, Age) %>%
  distinct()

# Diagnosis by sex contingency table
xtabs(~ Dx + Sex, df)

# Age distribution by sex, split by diagnosis
figB <- ggplot(df, aes(Sex, Age, fill = Dx)) +
  geom_boxplot() +
  theme_classic() +
  theme(aspect.ratio = 1) +
  scale_fill_manual("Diagnosis", values = c("dodgerblue", "red3"))

# Show the contingency table and the boxplot side by side
t1 <- tableGrob(xtabs(~ Dx + Sex, df), theme = ttheme_minimal())
fig <- plot_grid(t1, figB, ncol = 2, labels = LETTERS)
fig
```

## Summarize read counts
```{r summarize.read.counts}
# Total read counts per pseudobulk column, computed for each assay of pb
df_counts <- lapply(assayNames(pb), function(ct) {
  counts <- assay(pb, ct)

  data.frame(
    celltype = ct,
    ID = colnames(counts),
    readCounts = colSums(counts)
  )
})
df_counts <- do.call(rbind, df_counts)
df_counts$celltype <- factor(df_counts$celltype, ctorder)

ggplot(df_counts, aes(celltype, readCounts, fill = celltype)) +
  geom_violin(color = NA) +
  geom_boxplot(width = .1, outlier.size = .1) +
  theme_classic() +
  theme(
    aspect.ratio = 1,
    plot.title = element_text(hjust = 0.5),
    legend.position = "none"
  ) +
  scale_y_log10() +
  coord_flip() +
  ylab("Number of reads observed for each Channel") +
  xlab("") +
  ggtitle("Reads per cell cluster for each Channel")

# Cell counts from pb in long format: one row per (ID, celltype)
df_rate <- cellCounts(pb) %>%
  as.data.frame() %>%
  mutate(ID = rownames(.)) %>%
  pivot_longer(cols = -ID, values_to = "ncells", names_to = "celltype") %>%
  mutate(celltype = factor(celltype, ctorder))

# Reads per cell: read counts divided by the number of cells
inner_join(df_counts, df_rate, by = c("celltype", "ID")) %>%
  ggplot(aes(celltype, readCounts / ncells, fill = celltype)) +
  geom_violin(color = NA) +
  geom_boxplot(width = .1, outlier.size = .1) +
  theme_classic() +
  theme(
    aspect.ratio = 1,
    plot.title = element_text(hjust = 0.5),
    legend.position = "none"
  ) +
  scale_y_log10() +
  coord_flip() +
  ylab("Reads per cell") +
  xlab("") +
  ggtitle("Reads per cell for each Channel")
```

## Cell type specificity
```{r cellTypeSpecificity}
# Cell type specificity computed from the pseudobulk object,
# shown as violins with cell types ordered as in ctorder
df <- cellTypeSpecificity(pb)

plotViolin(df, assays = ctorder)
```

### Show cell markers
```{r cellMarkers}
library(org.Hs.eg.db)
# Marker lists are listed in:
# /sc/arion/projects/CommonMind/leed62/app/leetools/pegasus/human_brain_cell_markers.json
# /sc/arion/projects/CommonMind/leed62/app/leetools/pegasus/human_brain_immune_cell_markers.json
# /sc/arion/projects/CommonMind/leed62/app/leetools/pegasus/human_immune_cell_markers.json
# Marker genes; duplicates are removed with unique() at plotting time
genes <- c(
  "RBFOX3", "MEG3", "SLC17A7",
  "RBFOX3", "MEG3", "GAD1", "GAD2", "GRIK1",
  "SST",
  "PVALB",
  "VIP",
  "SLC1A3", "GFAP", "APOE", "SLC1A2", "SLC14A1",
  "PLP1", "MAG", "MBP",
  "PDGFRA", "VCAN",
  "FLT1", "CLDN5",
  "PDGFRB",
  "TGFBR1", "DOCK8", "CD74", "CSF1R", "MS4A6A", "PLXDC2"
)

# Map gene symbols to ENSEMBL identifiers
df_genes <- AnnotationDbi::select(org.Hs.eg.db, genes, "ENSEMBL", "SYMBOL")

# Subset of the specificity table relabeled with gene symbols.
# NOTE(review): df_sub is not used by the plot below, which reads df directly
# -- confirm whether it is still needed.
df_sub <- df[rownames(df) %in% df_genes$ENSEMBL, ]
idx <- match(rownames(df_sub), df_genes$ENSEMBL)
rownames(df_sub) <- df_genes$SYMBOL[idx]

plotPercentBars(df, genes = unique(genes), assays = ctorder)
```

```{r heatmap, fig.height=7, fig.width=7}
# Heatmap of the specificity table for the marker genes, one column per cell type
dreamlet::plotHeatmap(
  df,
  genes = unique(genes),
  assays = ctorder
)
```


```{r Dx.summary, eval=FALSE}
# Number of donors per diagnosis.
# The diagnosis column is named `Dx` (capital D) both in colData(pb) and in
# the data.frame built here; `$` matching is case-sensitive, so `dx` would
# return NULL and produce an empty table.
df <- unique(data.frame(Dx = pb$Dx, Donor = pb$SubID))

sort(table(df$Dx), decreasing = TRUE) %>%
  kbl() %>%
  kable_styling(full_width = FALSE)
```

# Process data: log2 CPM + voom precision weights
```{r voom}
# Normalize each cell type and compute voom precision weights.
# SubID, poolID, Sex and Dx enter as random effects; scaled Age is fixed.
form <- ~ (1 | SubID) + (1 | poolID) + (1 | Sex) + scale(Age) + (1 | Dx)

res.proc <- processAssays(
  pb,
  form,
  min.cells = 5,
  min.counts = 10,
  BPPARAM = SnowParam(12)
)
```

```{r voom.plot, fig.height=15, fig.width=10, cache=TRUE} 
# Voom plot for every cell type, 4 panels per row
plotVoom(res.proc, ncol = 4, assays = ctorder) +
  theme_classic()
```


# Variance Partitioning Analysis
```{r varPart}
# Variance partitioning with the same terms as the voom step
form <- ~ (1 | SubID) + (1 | poolID) + (1 | Sex) + scale(Age) + (1 | Dx)

res.vp <- fitVarPart(
  res.proc,
  form,
  BPPARAM = SnowParam(6)
)
```

```{r vp.plot, fig.height=15, fig.width=10}
# Give variance-partition columns display names
# (the poolID assignment leaves that name unchanged)
colnames(res.vp)[colnames(res.vp) == "scale.Age."] <- "Age"
colnames(res.vp)[colnames(res.vp) == "poolID"] <- "poolID"
colnames(res.vp)[colnames(res.vp) == "SubID"] <- "Subject"

# Colors: Set1 palette entries followed by grey, named by the sorted
# columns after dropping the first two
col.vp <- c(
  brewer.pal(ncol(res.vp) - 3, "Set1"),
  "grey85"
)
names(col.vp) <- colnames(sortCols(res.vp))[-seq(2)]

plotVarPart(
  sortCols(res.vp),
  label.angle = 45,
  ncol = 4,
  assays = ctorder,
  col = col.vp
)
```


```{r vp.percent, fig.height=6, fig.width=10}
# Show variance fractions at the gene-level for each cell type
# genes = c("KCNMA1", 'RASGRF2', "CECR2", "PTPRG", 'NPNT', 'LINC01844', 'SPRY4-AS1', "PDE10A", "XIST", 'C1D')

# Exploratory query kept for reference: top gene per variance component
# res.vp %>%
#  as_tibble %>%
#  filter(assay=="Micro_PVM") %>%
#  summarize(Dx = gene[which.max(Dx)],
#    poolID = gene[which.max(poolID)],
#    Sex = gene[which.max(Sex)],
#     Age = gene[which.max(Age)],
#    Subject = gene[which.max(Subject)],
#    Residuals = gene[which.max(Residuals)])

# Genes highlighted in the bar plot
genes <- c("KCNMA1", "CECR2", "PTPRG", "NXPE1", "XIST", "NDUFB6")

# res.vp %>%
#  as_tibble %>%
#  filter(assay=="Microglia") %>%
#  summarize(gene[order(Batch, decreasing=TRUE)][1:5])

# Micro_PVM rows for the selected genes, arranged in the order given in `genes`
df.vp.sub <- sortCols(res.vp)[
  (res.vp$gene %in% genes) & (res.vp$assay == "Micro_PVM"),
]
df.vp.sub$gene <- factor(df.vp.sub$gene, rev(genes))
df.vp.sub <- df.vp.sub[order(df.vp.sub$gene, decreasing = TRUE), ]

fig.percent <- plotPercentBars(df.vp.sub, assays = ctorder, col = col.vp) +
  theme(aspect.ratio = 1)
fig.percent
```

## Batch effects for each cell type
```{r batch.cell}
# Distribution of variance explained by 10X pool, per cell type
res.vp %>%
  as_tibble() %>%
  mutate(assay = factor(assay, ctorder)) %>%
  ggplot(aes(100 * poolID, assay)) +
  geom_violin(fill = col.vp["poolID"], scale = "area") +
  geom_boxplot(width = .1, outlier.color = col.vp["poolID"]) +
  theme_classic() +
  theme(aspect.ratio = 1, plot.title = element_text(hjust = 0.5)) +
  xlab("Variance explained (%)") +
  ggtitle("10X pool")

# Count (N) and percentage of genes with poolID fraction above 5%, per cell type
res.vp %>%
  as_tibble() %>%
  mutate(assay = factor(assay, ctorder)) %>%
  group_by(assay) %>%
  summarize(
    N = sum(poolID > 0.05),
    percent = 100 * sum(poolID > 0.05) / length(poolID)
  ) %>%
  pivot_longer(cols = c("N", "percent")) %>%
  ggplot(aes(value, assay, fill = assay)) +
  geom_bar(stat = "identity") +
  theme_classic() +
  theme(aspect.ratio = 1, legend.position = "none") +
  facet_wrap(~name, scales = "free_x") +
  scale_x_continuous(expand = c(0, 0))
```

### Batch effect for NXPE1
```{r NXPE1}
# Variance explained by poolID for NXPE1 in each cell type
res.vp %>%
  as_tibble() %>%
  mutate(assay = factor(assay, ctorder)) %>%
  filter(gene == "NXPE1") %>%
  ggplot(aes(100 * poolID, assay, fill = assay)) +
  geom_bar(stat = "identity") +
  theme_classic() +
  theme(aspect.ratio = 1, legend.position = "none") +
  scale_x_continuous(expand = c(0, 0), limits = c(0, 100)) +
  xlab("Variance explained (%)")

# Processed Micro_PVM data, reduced to poolID and the gene of interest
df <- extractData(res.proc, "Micro_PVM")

gene <- "NXPE1"
df <- df[, c("poolID", gene)]
colnames(df)[colnames(df) == gene] <- "expr"

# Pools sorted by their mean expression
df_ord <- df %>%
  group_by(poolID) %>%
  summarize(mean = mean(expr)) %>%
  arrange(mean)

# Expression per pool, with pools in the order computed above
df %>%
  mutate(poolID = factor(poolID, df_ord$poolID)) %>%
  ggplot(aes(poolID, expr)) +
  geom_boxplot() +
  theme_classic() +
  theme(
    aspect.ratio = 1,
    plot.title = element_text(hjust = 0.5),
    legend.position = "none",
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank()
  ) +
  ggtitle(paste0("Micro_PVM: ", gene)) +
  ylab(bquote(log[2] ~ CPM))
```

```{r batch.examples, eval=FALSE}

# Number of genes per cell type with more than 5% of variance explained by poolID
res.vp %>%
  as_tibble %>%
  mutate(assay = factor(assay, ctorder)) %>%
  group_by(assay) %>%
  filter(poolID > 0.05) %>%
  summarize(N = length(unique(gene))) %>%
  ggplot(aes(N, assay, fill = assay)) +
    geom_bar(stat = "identity") +
    theme_classic() +
    theme(aspect.ratio = 1, legend.position = "none") +
    xlab("# genes with poolID > 5%") +
    scale_x_continuous(limits = c(0, NA), expand = c(0, 0))

# Micro_PVM genes passing the same 5% threshold
genes <- res.vp %>%
  as_tibble %>%
  filter(poolID > 0.05, assay == "Micro_PVM") %>%
  select(gene)
genes <- genes$gene

# Processed Micro_PVM data restricted to poolID and those genes
df <- extractData(res.proc, "Micro_PVM")
df <- df[, c("poolID", genes)]

# One boxplot per gene, for at most the first 60 genes.
# head() is used instead of [1:60] so that fewer than 60 available genes
# does not produce NA column names (which would fail when indexing df).
figList <- lapply(head(colnames(df)[-1], 60), function(geneid) {

  df.tmp <- df[, c("poolID", geneid)]
  colnames(df.tmp)[colnames(df.tmp) == geneid] <- "expr"

  # Fraction of variance explained by poolID for this gene
  frac <- res.vp %>%
    as_tibble %>%
    filter(assay == "Micro_PVM", gene == geneid) %>%
    select(poolID) %>%
    as.numeric

  # Pools sorted by their mean expression
  df_ord <- df.tmp %>%
    group_by(poolID) %>%
    summarize(mean = mean(expr)) %>%
    arrange(mean)

  df.tmp %>%
    mutate(poolID = factor(poolID, df_ord$poolID)) %>%
    ggplot(aes(poolID, expr)) +
      geom_boxplot() +
      theme_classic() +
      theme(aspect.ratio = 1, plot.title = element_text(hjust = 0.5), legend.position = "none", axis.text.x = element_blank(), axis.ticks.x = element_blank()) +
      ylab('') + xlab('') +
      ggtitle(paste0(geneid, ": ", format(100 * frac, digits = 3), "%"))
})

file <- "~/www/test.pdf"
fig <- plot_grid(plotlist = figList, align = "hv", axis = "tblr", ncol = 4)
ggsave(file, fig, height = 49)

# estimate number of clusters
library(cluster)

# For each gene, take the cluster count with the largest value in the
# third column of the clusGap table (k-means, up to 5 clusters)
res <- lapply(colnames(df)[-1], function(gene) {
  res <- clusGap(df[, gene, drop = FALSE], FUNcluster = kmeans, K.max = 5, B = 3)
  data.frame(gene, Nclust = which.max(res$Tab[, 3]))
})
res <- do.call(rbind, res)

table(res$Nclust)

```


# Correlation within and between donors
```{r within_btw, fig.height=10, fig.width=10}
# Shared helper functions, including eval_within_across_donor()
source("/sc/arion/projects/CommonMind/hoffman/NPS-AD/work/nps_ad/common_analyses.R")

# Evaluate within- vs across-donor comparison, grouping by SubID
fig <- eval_within_across_donor(res.proc, "SubID")

# Drop minor grid lines from the returned plot
fig +
  theme(
    panel.grid.minor = element_blank(),
    panel.grid.minor.y = element_blank()
  )
```

# Differential expression: dreamlet analysis
```{r dreamlet}
# Differential expression model: Dx is now a fixed effect to be tested;
# SubID, poolID and Sex stay random effects and scaled Age stays fixed
form <- ~ (1 | SubID) + (1 | poolID) + (1 | Sex) + scale(Age) + Dx

fit <- dreamlet(
  res.proc,
  form,
  BPPARAM = SnowParam(6)
)
```

## Volcano
```{r volcano, fig.height=15, fig.width=10}
# Volcano plot of the DxAD coefficient for every cell type, 4 panels per row
plotVolcano(fit, "DxAD", ncol = 4, assays = ctorder) +
  theme_classic()
```

## Highlight genes
```{r genehatmap, fig.height=4, fig.width=7, cache=FALSE}
# Genes to highlight across cell types
genes <- c(
  "PTPRG", "DUSP10", "NPNT", "DPYD", "VGF",
  "SPRY4-AS1", "PDE10A", "NCAM2", "PCDH7"
)

# Heatmap of the DxAD coefficient; x-axis labels get one hue per cell type
fig.gene_heatmap <- plotGeneHeatmap(
  fit,
  coef = "DxAD",
  genes = genes,
  assays = ctorder
) +
  theme(
    axis.text.x = element_text(
      colour = variancePartition::ggColorHue(length(ctorder))
    )
  )

fig.gene_heatmap
```

```{r plotForest, fig.height=10, fig.width=12, cache=FALSE}
# Forest plot of the DxAD coefficient across cell types, one panel per gene
figList <- lapply(genes, function(g) {
  p <- plotForest(fit, coef = "DxAD", gene = g, assays = ctorder) +
    theme(aspect.ratio = 1, legend.position = "none")

  # Fixed limits for two genes; NULL (no limits) for all others
  lims <- switch(g,
    PTPRG = c(-.6, 1.8),
    DPYD = c(-.2, 1)
  )

  p + coord_flip(ylim = lims)
})
names(figList) <- genes
plot_grid(plotlist = figList, ncol = 3)
```

```{r boxplot, fig.height=10, fig.width=12, cache=FALSE}
# Gene -> cell type in which that gene's expression is plotted
gene_CT <- c(
  "PTPRG" = "Micro_PVM",
  "NPNT" = "Astro",
  "DPYD" = "Micro_PVM",
  "SPRY4-AS1" = "EN_L3_5_IT_3",
  "PDE10A" = "EN_L6_IT",
  "NCAM2" = "EN_L3_5_IT_2",
  "PCDH7" = "IN_LAMP5"
)

# Boxplot of expression by diagnosis for each gene / cell type pair
figList <- lapply(names(gene_CT), function(g) {
  df_expr <- extractData(res.proc, gene_CT[g])

  # Keep diagnosis and the gene, renaming the gene column to "expr"
  df_expr <- df_expr[, c("Dx", g)]
  colnames(df_expr)[colnames(df_expr) == g] <- "expr"

  ggplot(df_expr, aes(Dx, expr, fill = Dx)) +
    geom_boxplot() +
    theme_classic() +
    theme(
      aspect.ratio = 1,
      plot.title = element_text(hjust = 0.5),
      legend.position = "none"
    ) +
    ggtitle(paste0(gene_CT[g], ": ", g)) +
    ylab(bquote(log[2] ~ CPM)) +
    scale_fill_manual(values = c("grey50", "red3"))
})
names(figList) <- names(gene_CT)

plot_grid(plotlist = figList, ncol = 3)
```


### Summarize differential expression 
```{r plot.pi, fig.height=4, fig.width=12}
# Per cell type: number of genes tested, number with adjusted p < 0.05,
# and pi1 = 1 - pi0 estimated by qvalue() from the raw p-values
df_de <- fit %>%
  topTable(coef = "DxAD", number = Inf) %>%
  as_tibble() %>%
  group_by(assay) %>%
  summarise(
    nGenes = length(adj.P.Val),
    nDE = sum(adj.P.Val < 0.05),
    pi1 = 1 - qvalue(P.Value)$pi0
  ) %>%
  mutate(assay = factor(assay, ctorder))

# Panel A: number of genes per cell type
ymax <- 1.05 * max(df_de$nGenes)
fig1 <- ggplot(df_de, aes(nGenes, assay, fill = assay)) +
  geom_bar(stat = "identity") +
  theme_classic() +
  theme(aspect.ratio = 1, legend.position = "none") +
  scale_x_continuous(limits = c(0, ymax), expand = c(0, 0)) +
  xlab("# genes expressed") +
  ylab("Cell type")

# Panel B: number of genes passing FDR; the axis spans at least 0-100
ymax <- max(1.05 * max(df_de$nDE), 100)
fig2 <- ggplot(df_de, aes(nDE, assay, fill = assay)) +
  geom_bar(stat = "identity") +
  theme_classic() +
  theme(
    aspect.ratio = 1,
    legend.position = "none",
    axis.text.y = element_blank()
  ) +
  scale_x_continuous(limits = c(0, ymax), expand = c(0, 0)) +
  xlab("# genes with FDR < 5%") +
  ylab("")

# Panel C: pi1 per cell type
fig3 <- ggplot(df_de, aes(pi1, assay, fill = assay)) +
  geom_bar(stat = "identity") +
  theme_classic() +
  theme(
    aspect.ratio = 1,
    legend.position = "none",
    axis.text.y = element_blank()
  ) +
  scale_x_continuous(limits = c(0, 1), expand = c(0, 0)) +
  xlab(bquote(pi[1])) +
  ylab("")

plot_grid(
  fig1, fig2, fig3,
  labels = LETTERS[1:3],
  nrow = 1,
  axis = "tblr",
  align = "hv"
)
```    

```{r frac.de}
# One dot per cell type: color = % of genes passing FDR, size = # genes
fig_de_count <- ggplot(df_de, aes(2, assay)) +
  geom_tile(fill = "white") +
  geom_point(aes(color = nDE / nGenes * 100, size = nGenes)) +
  scale_color_viridis(name = "% genes\npassing FDR", lim = c(0, NA)) +
  scale_size_continuous(
    name = "# genes",
    breaks = c(1, 4, 8, 12, 16) * 1000
  ) +
  theme_classic() +
  theme(
    aspect.ratio = as.numeric(nrow(df_de)),
    axis.ticks.x = element_blank(),
    axis.text.x = element_blank(),
    axis.text.y = element_text(
      colour = variancePartition::ggColorHue(nrow(df_de))
    )
  ) +
  xlab("") +
  ylab("")
fig_de_count
```


### Interpreting variance partitioning analysis
Observe that the fraction of variation across batches correlates with the GC content of the corresponding gene.  Also, genes with higher counts per million show higher variation across donors, since more counts correspond to a reduction in the 'shot noise' caused by low counts.

```{r gc.content, fig.height=5, fig.width=12}
library(broom)
source("/sc/arion/projects/CommonMind/hoffman/NPS-AD/work/nps_ad/common_analyses.R")

# read GC content
file <- "/sc/arion/projects/CommonMind/hoffman/scRNAseq_data/misc/GRCh38.104_gc_content.tsv.gz"
df_GC <- read.table(file, header = TRUE)

# Variance fractions joined with per-gene GC content (matched on gene symbol)
df_vp <- merge(as.data.frame(res.vp), df_GC, by.x = "gene", by.y = "SYMBOL")

# Panel A: per cell type, Spearman correlation between the poolID variance
# fraction and GC content, with "#" marking FDR < 5%.
# The assay column is converted to a factor only AFTER the join: joining a
# factor key with a character key yields a character column, which would
# discard the ctorder ordering and leave this panel ordered differently from
# panel B (whose y-axis labels are hidden).
fig_batch_GC <- df_vp %>%
  as_tibble %>%
  group_by(assay) %>%
  summarize(cor.se(poolID, GC, method = "spearman")) %>%
  left_join(
    df_vp %>%
      group_by(assay) %>%
      summarize(tidy(cor.test(poolID, GC, method = "spearman"))) %>%
      select(assay, p.value, estimate),
    by = "assay"
  ) %>%
  mutate(
    assay = factor(assay, ctorder),
    FDR = p.adjust(p.value, "fdr")
  ) %>%
  ggplot(aes(assay, rho, fill = assay, label = ifelse(FDR < 0.05, "#", ''))) +
    geom_errorbar(aes(ymin = rho - 1.96 * rho.se, ymax = rho + 1.96 * rho.se), width = 0) +
    geom_bar(stat = "identity") +
    geom_text(aes(y = rho + 3 * rho.se)) +
    theme_classic() +
    theme(aspect.ratio = 1,
      legend.position = "none") +
    xlab("Cell type") +
    ylab("Spearman correlation between batch % and GC content") +
    coord_flip()

# Full differential expression table for the DxAD coefficient, written to disk
tab <- topTable(fit, coef = 'DxAD', number = Inf)

file <- "./topTable_PsychAD_r0.tsv"
write.table(tab, file, quote = FALSE, sep = '\t', row.names = FALSE)

# Variance fractions joined with the differential expression results
df_vp <- merge(as.data.frame(res.vp), tab, by.x = c("assay", "gene"), by.y = c("assay", "ID"))

# Panel B: per cell type, Spearman correlation between the Subject variance
# fraction and average expression
fig2 <- df_vp %>%
  as_tibble %>%
  group_by(assay) %>%
  summarize(cor.se(Subject, AveExpr, method = "spearman")) %>%
  mutate(assay = factor(assay, ctorder)) %>%
  ggplot(aes(assay, rho, fill = assay)) +
    geom_bar(stat = "identity") +
    geom_errorbar(aes(ymin = rho - 1.96 * rho.se, ymax = rho + 1.96 * rho.se), width = 0) +
    theme_classic() +
    theme(aspect.ratio = 1,
      axis.text.y = element_blank(),
      axis.ticks.y = element_blank(),
      legend.position = "none") +
    xlab("Cell type") +
    ylab("Spearman correlation between Donor % and CPM") +
    scale_y_continuous(limits = c(0, .7), expand = c(0, 0)) +
    coord_flip()

plot_grid(fig_batch_GC, fig2, labels = LETTERS[1:2], nrow = 1, align = "hv", axis = "tblr")
```


```{r vp.ncells}
# Total number of cells per cell type, summed over subjects
df <- colData(sce) %>%
  xtabs(~ SubID + subclass, .) %>%
  as_tibble() %>%
  pivot_longer(cols = SubID) %>%
  mutate(assay = subclass) %>%
  group_by(assay) %>%
  summarize(sum.cells = sum(n))

# Mean per cell type of the summed Dx + Sex + Subject + Age variance fractions
df2 <- df_vp %>%
  as_tibble() %>%
  mutate(Donor = Dx + Sex + Subject + Age) %>%
  group_by(assay) %>%
  summarize(mean.vp = mean(Donor))

# Combine with the per-assay details reported for res.proc
df3 <- inner_join(df, df2, by = "assay") %>%
  inner_join(details(res.proc) %>% as_tibble()) %>%
  mutate(assay = factor(assay, ctorder))

# Regress the mean variance fraction on log10(cells per retained sample);
# pv is the p-value of the slope
fit_n.cells <- lm(mean.vp ~ log10(sum.cells / n_retain), df3) # , weights=df3$n_retain)
# summary(fit_n.cells)
pv <- coef(summary(fit_n.cells))[2, 4]
pv <- format(pv, digits = 2)

fig.vp_ncells <- ggplot(
  df3,
  aes(sum.cells / n_retain, mean.vp, color = assay, size = n_retain, label = assay)
) +
  geom_abline(
    intercept = coef(fit_n.cells)[1],
    slope = coef(fit_n.cells)[2],
    color = "grey30"
  ) +
  geom_point() +
  scale_x_log10() +
  theme_classic() +
  theme(aspect.ratio = 1) +
  scale_size(name = "# Subjects", breaks = c(50, 100, 250, 500)) +
  xlab("Mean # cells per Subject") +
  ylab("Mean % variance across Subject") +
  geom_text_repel(
    size = 3,
    box.padding = .5,
    min.segment.length = 1,
    max.overlaps = 15
  ) +
  guides(color = "none") +
  annotate("text", x = 40, y = .7, label = bquote(p == .(pv)))
fig.vp_ncells
```

```{r de.ncells}
# Per cell type: cell totals joined with the differential expression summary
# and the per-assay details reported for res.proc
df3 <- inner_join(df, df_de, by = 'assay') %>%
        inner_join(details(res.proc) %>% as_tibble) %>%
        mutate(assay = factor(assay, ctorder))

# Number of DE genes vs log10(cells per retained sample)
fit_n.cells <- lm(nDE ~ log10(sum.cells / n_retain), df3)
pv <- coef(summary(fit_n.cells))[2, 4]
pv <- format(pv, digits = 2)

fig.nDE_ncells <- ggplot(df3, aes(sum.cells / n_retain, nDE, color = assay, size = n_retain, label = assay)) +
  geom_abline(intercept = coef(fit_n.cells)[1], slope = coef(fit_n.cells)[2], color = "grey30") +
  geom_point() +
  scale_x_log10() +
  theme_classic() +
  theme(aspect.ratio = 1) +
  scale_size(name = "# Subjects", breaks = c(50, 100, 250, 500)) +
  xlab("Mean # nuclei per Subject") +
  ylab("# differentially expressed genes") +
  geom_text_repel(size = 3, box.padding = .5, min.segment.length = 1, max.overlaps = 15) + guides(color = "none")

# Number of expressed genes vs log10(cells per retained sample)
fit_n.genes <- lm(nGenes ~ log10(sum.cells / n_retain), df3)

# The trend line must come from fit_n.genes (the nGenes model); using the
# nDE model's coefficients would draw a line unrelated to this panel's y-axis.
fig.nGenes <- ggplot(df3, aes(sum.cells / n_retain, nGenes, color = assay, size = n_retain, label = assay)) +
  geom_abline(intercept = coef(fit_n.genes)[1], slope = coef(fit_n.genes)[2], color = "grey30") +
  geom_point() +
  scale_x_log10() +
  theme_classic() +
  theme(aspect.ratio = 1) +
  scale_size(name = "# Subjects", breaks = c(50, 100, 250, 500)) +
  xlab("Mean # nuclei per Subject") +
  ylab("# expressed genes") +
  geom_text_repel(size = 3, box.padding = .5, min.segment.length = 1, max.overlaps = 15) + guides(color = "none")

fig <- plot_grid(fig.nGenes, fig.nDE_ncells)
fig
```


## Gene set analysis
```{r zenith}
# Load Gene Ontology database 
go.gs = get_GeneOntology(to="SYMBOL")

# remove set with > 1000 genes
go.gs = go.gs[sapply(go.gs, function(x) length(geneIds(x))) < 1000]

# zenith analysis 
res.gsa = zenith_gsa(fit, go.gs, coefs="DxAD")
```

```{r zenith.heatmap.1, fig.width=10, fig.height=25, cache=FALSE} 
# Strip the leading GO identifier (e.g. "GO0022625: ") from gene set names
res.gsa$Geneset <- gsub("^GO\\S+ ", "", res.gsa$Geneset)

# Order cell types as in ctorder
res.gsa$assay <- factor(res.gsa$assay, ctorder)

plotZenithResults(res.gsa, 10, 3) +
  theme(legend.position = "bottom")
```

```{r zenith.heatmap.2, fig.width=10, fig.height=50, cache=FALSE} 
# Gene sets reaching FDR < 5% in at least one row of the results
gs <- unique(res.gsa$Geneset[res.gsa$FDR < 0.05])

# Restrict the results to those gene sets
df <- res.gsa[res.gsa$Geneset %in% gs, ]

# Plot them all: Inf disables the cap on the highest/lowest t-statistics shown
plotZenithResults(df, Inf, Inf)
```

```{r  fig.panels, eval=FALSE}


```


# Combine plots
```{r combine, eval=FALSE}
# Plot UMAP
fig_UMAP <- plotProjection(sce, "X_umap", "subclass", order = ctorder)

# Mean variance trend
fig_mv <- plotVoom(res.proc[['Micro_PVM']]) +
            theme_classic() +
            theme(aspect.ratio = 1)

# Variance partitioning
fig_vp <- plotVarPart(sortCols(res.vp), label.angle = 45, ncol = 4, assays = "Micro_PVM")

# variance partitioning and eQTLs
# Batch effects and gene sets
# Gene set analysis
#
# axis.text.y=element_blank()
# Compact version of the batch-vs-GC panel with a narrower value range
fig_batch_GC_mod <- fig_batch_GC +
  theme(axis.ticks.y = element_blank()) +
  coord_flip(ylim = c(0, .1), expand = FALSE) +
  ylab('Corr of batch effect\nand GC content') +
  xlab('')


file <- '~/www/test.pdf'
ggsave(file = file, fig_batch_GC_mod)


# Volcano plot for Micro_PVM. The differential expression result in this
# file is `fit` and the tested coefficient is "DxAD", matching the other
# plotVolcano() calls; `res.dl` is not defined anywhere in this file.
fig <- plotVolcano(fit, coef = "DxAD", assays = "Micro_PVM")
file <- '~/www/test.pdf'
ggsave(file = file, fig)


# Text placeholders for panels that are not generated here
figA <- ggplot() + annotate("text", x = 0, y = 0, label = "Diagram") + theme_void()
figG <- ggplot() + annotate("text", x = 0, y = 0, label = "Differ, expressed genes") + theme_void()

# fig.cells_subject
fig <- plot_grid(figA, fig_UMAP, fig_mv, fig_vp, fig.percent + theme(legend.position = "bottom"), fig_batch_GC_mod, fig.vp_ncells, figG, labels = LETTERS, ncol = 4, axis = "tblr", align = "hv")


fig.voom <- plotVoom(res.proc[["Micro_PVM"]])
fig.vp <- plotVarPart(sortCols(res.vp), label.angle = 45, ncol = 4, assays = "Micro_PVM", col = col.vp)

# axis.text.y=element_blank()
fig_batch_GC_mod <- fig_batch_GC +
  theme(axis.ticks.y = element_blank()) +
  coord_flip(ylim = c(0, .1), expand = FALSE) +
  ylab('Corr of batch effect\nand GC content') +
  xlab('')

# Second layout: two empty placeholders followed by the generated panels
fig <- plot_grid(ggplot(), ggplot(), fig.voom, fig.vp,
  fig.percent + theme(legend.position = "none"),
  fig_batch_GC_mod,
  fig.vp_ncells,
  labels = LETTERS, ncol = 4)


ggsave(fig, file = "~/www/test.pdf", width = 9, height = 7)


# Last figure
#############
# Volcano plot for Micro_PVM only
fig.volcano <- plotVolcano(fit[["Micro_PVM"]], "DxAD", ncol = 4) +
  theme_classic() +
  theme(aspect.ratio = 1, legend.position = "none")

# Forest plots across cell types for two highlighted genes
fig.PTPRG <- plotForest(fit, coef = "DxAD", gene = "PTPRG", assays = ctorder) +
  theme(aspect.ratio = 1, legend.position = "none") +
  coord_flip(ylim = c(-.6, 1.8))

fig.PDE10A <- plotForest(
  fit,
  coef = "DxAD",
  gene = "PDE10A",
  assays = ctorder,
  ylim = c(-1.3, 0.5)
) +
  theme(aspect.ratio = 1)

# Text placeholders for the gene set panels
figE <- ggplot() +
  annotate("text", x = 0, y = 0, label = "Geneset by cell type\n2x wide") +
  theme_void()
figF <- ggplot() +
  annotate("text", x = 0, y = 0, label = "Geneset by cell type\n2x wide") +
  theme_void()

fig <- plot_grid(
  fig.volcano, fig.PTPRG, figList[["PTPRG"]], fig.PDE10A, figF,
  labels = LETTERS, ncol = 3
) # , axis="tblr" , align="hv")

file <- "~/www/test.pdf"
ggsave(file = file, fig, width = 12)


# Gene sets
###########

# get genesets with FDR < 5%
gs <- unique(res.gsa$Geneset[res.gsa$FDR < 0.05])

# Hand-picked gene sets to display; this replaces the FDR-based list above.
# "actin filament-based process" is written with an ASCII hyphen, as in the
# other entries: the previous Unicode minus sign (U+2212) could never match
# a hyphenated name in res.gsa$Geneset.
gs <- c("cholesterol biosynthetic process", "regulation of epidermal growth factor receptor signaling pathway", 'release of sequestered calcium ion into cytosol by endoplasmic reticulum',
'neurotransmitter receptor activity involved in regulation of postsynaptic membrane potential',
'ionotropic glutamate receptor activity',
'postsynaptic membrane',
'glutamate receptor activity',
'neuronal action potential',
'presynapse organization',
'regulation of synapse assembly',
'tau protein binding',
'synaptic vesicle endocytosis',
'PDZ domain binding',
'ephrin receptor signaling pathway',
'sodium channel complex',
'actin filament-based process',
'ERBB signaling pathway',
'neural nucleus development',
'antigen processing and presentation of peptide antigen',
'regulation of p38MAPK cascade',
'synaptic cleft',
'neuron projection fasciculation',
'neuropeptide signaling pathway',
'regulation of microtubule polymerization',
'regulation of mRNA stability',
'glucocorticoid receptor signaling pathway',
'regulation of histone phosphorylation',
'filopodium assembly',
'GTPase activator activity',
'oxidative phosphorylation',
"negative regulation of autophagy",
"protein dephosphorylation",
"protein polymerization",
"actin filament")


# Look up the exact gene set names matching a few search terms
grep("protein dephosphorylation", res.gsa$Geneset, value = TRUE) %>% unique
grep("protein polymer", res.gsa$Geneset, value = TRUE) %>% unique
grep("actin filam", res.gsa$Geneset, value = TRUE) %>% unique


# keep only results of these genesets
df <- res.gsa[res.gsa$Geneset %in% gs, ]

# plot results, but with no limit based on the highest/lowest t-statistic
fig <- plotZenithResults(df, Inf, Inf, zmax = 6)


file <- '~/www/test.pdf'
ggsave(file = file, fig, width = 12, height = 40)


### Highlight specific genesets
# gs1 = "peptidyl-tyrosine dephosphorylation"

gs1 <- "GO1900744: regulation of p38MAPK cascade"
grep(gs1, names(go.gs))

# Heatmap of the DxAD coefficient in Micro_PVM for the genes of that set
fig <- plotGeneHeatmap(
  fit,
  coef = "DxAD",
  genes = sort(geneIds(go.gs[[gs1]])),
  transpose = TRUE,
  assays = "Micro_PVM",
  zmax = 6
) +
  ggtitle(gs1) +
  theme(
    legend.position = "bottom",
    axis.text.x = element_text(size = 10, angle = 90, vjust = 0.5),
    axis.text.y = element_text(size = 10)
  ) +
  xlab("") +
  ylab("")

ggsave(file = file, fig)


# Indices of the gene sets whose gene IDs match DUSP10
idx <- which(sapply(go.gs, function(gset) length(grep("DUSP10", geneIds(gset))) > 0))

go.gs[names(go.gs)[idx]]

# Results for those gene sets (names compared without the GO identifier prefix)
df <- res.gsa[res.gsa$Geneset %in% gsub("^GO\\S+ ", "", names(go.gs)[idx]), ]

df[df$assay == "Micro_PVM", ] %>% head()


# Re-run the gene set analysis on that subset only
res <- zenith_gsa(fit, go.gs[names(go.gs)[idx]], coefs = "DxAD")


# PTPRG
# "GO0016791: phosphatase activity"
# "GO0010975: regulation of neuron projection development"

# Differential expression result for PTPRG in Micro_PVM
tab <- topTable(fit, coef = "DxAD", number = Inf)
tab[(tab$ID == "PTPRG") & (tab$assay == "Micro_PVM"), ]

# Top-ranked row of the Micro_PVM results
topTable(fit[["Micro_PVM"]], coef = "DxAD", number = 1)


```


# Session Info
<details>
```{r sessioninfo, cache=FALSE}
# Print R session details for this run (never cached, so always current)
sessionInfo()
```
</details>