From a99c40922795137527b657d3e1b3aac4db0e87ae Mon Sep 17 00:00:00 2001
From: Artur Szalata <arturs6668@gmail.com>
Date: Thu, 9 May 2024 11:06:11 +0200
Subject: [PATCH] use common filtered genes across cell types

---
 src/task/process_dataset/run_limma/script.R | 40 ++++++++++++++-------
 1 file changed, 28 insertions(+), 12 deletions(-)

diff --git a/src/task/process_dataset/run_limma/script.R b/src/task/process_dataset/run_limma/script.R
index a00f93e7..69d01db8 100755
--- a/src/task/process_dataset/run_limma/script.R
+++ b/src/task/process_dataset/run_limma/script.R
@@ -7,7 +7,7 @@ library(edgeR)
 
 ## VIASH START
 par <- list(
-  input = "resources/neurips-2023-data/small_pseudobulk.h5ad",
+  input = "resources/neurips-2023-data/pseudobulk.h5ad",
   de_sig_cutoff = 0.05,
   control_compound = "Dimethyl Sulfoxide",
   # for public data
@@ -46,11 +46,10 @@ limma_trafo <- function(value) {
   gsub("[^[:alnum:]]", "_", value)
 }
 
-# run glmQLFit per cell type
-ql_fits <- list()
+filtered_gene_lists <- list()
 
 for (cell_type in cell_types) {
-  cat("Running glmQLFit for cell type: ", cell_type, "\n")
+  cat("Filtering genes for cell type:", cell_type, "\n")
 
   # subset data by cell type and split
   obs_filt <- (adata$obs$cell_type == cell_type) & (adata$obs$split %in% par$input_splits)
@@ -58,18 +57,37 @@ for (cell_type in cell_types) {
 
   # prepare count data
   counts <- Matrix::t(cell_type_adata$X)
-  counts <- as.matrix(counts)
 
   d <- DGEList(counts)
+  design <- model.matrix(~ 0 + sm_name + donor_id, data = cell_type_adata$obs)
+  keep <- filterByExpr(d, design)
+  genes_to_keep <- rownames(d)[keep]
+
+  # Store the filtered gene list
+  filtered_gene_lists[[cell_type]] <- genes_to_keep
+}
+
+# Calculate the intersection of genes across all cell types
+common_genes <- Reduce(intersect, filtered_gene_lists)
+cat("Number of common genes:", length(common_genes), "\n")
+
+
+# run glmQLFit per cell type
+ql_fits <- list()
 
-  # Filter lowly expressed genes
-  keep <- filterByExpr(d, design = model.matrix(~ 0 + sm_name + donor_id, data = cell_type_adata$obs))
-  d <- d[keep,, keep.lib.sizes=FALSE]
+for (cell_type in cell_types) {
+  cat("Running glmQLFit for cell type:", cell_type, "using common genes\n")
 
+  # subset data by cell type and split again
+  obs_filt <- (adata$obs$cell_type == cell_type) & (adata$obs$split %in% par$input_splits)
+  cell_type_adata <- adata[obs_filt, ]
 
+  # prepare count data again, this time filtering to common genes
+  counts <- Matrix::t(cell_type_adata$X)
+  counts <- as.matrix(counts)
+  d <- DGEList(counts[common_genes, , drop = FALSE])
   d <- calcNormFactors(d)
 
-  # create design matrix
   design <- model.matrix(~ 0 + sm_name + donor_id, data = cell_type_adata$obs %>% mutate_all(limma_trafo))
 
   # Estimate dispersions
@@ -102,6 +120,7 @@ de_results <- bind_rows(lapply(seq_len(nrow(new_obs)), function(row_i) {
 
   # Convert TopTags object to a data frame
   test_results_df <- as.data.frame(topTags(test_results, n = Inf))
+  test_results_df$gene <- rownames(test_results_df)
 
   # Add cell type and sm_name columns
   test_results_df <- test_results_df %>%
@@ -110,9 +129,6 @@ de_results <- bind_rows(lapply(seq_len(nrow(new_obs)), function(row_i) {
   return(test_results_df)
 }))
 
-# Assuming the first column of de_results needs to be set as 'gene'
-de_results <- rownames_to_column(de_results, var = "gene")
-
 # Transform DE results and prepare for writing
 de_results_final <- de_results %>%
   mutate(