From 76a6d183bc6d0a73d50e8c6e36ffd08f0b2d6c3a Mon Sep 17 00:00:00 2001
From: Dave Weaver <dweav94@gmail.com>
Date: Tue, 19 Jan 2021 14:26:13 -0500
Subject: [PATCH] first working version!!!!

---
 R/compute_crosstalk.R | 41 +++++++++++++++++++++++++++++++++++++++--
 R/create_null.R       | 21 ++++++---------------
 R/ppi_ingest.R        |  3 +++
 man/bootstrap_null.Rd | 39 ++++++++++++++++++++++++++++++++++-----
 man/prep_stringdb.Rd  |  8 +++++++-
 5 files changed, 89 insertions(+), 23 deletions(-)

diff --git a/R/compute_crosstalk.R b/R/compute_crosstalk.R
index fe5445c..d9526e9 100644
--- a/R/compute_crosstalk.R
+++ b/R/compute_crosstalk.R
@@ -9,10 +9,47 @@
 #'
 #' @export
 
-compute_crosstalk <- function(seed_proteins, ppi = "stringdb", n = 10000,
+compute_crosstalk <- function(seed_proteins, ppi = "stringdb", n = 1000,
                               gamma=0.6, eps = 1e-10, tmax = 1000,
                               norm = TRUE, set_seed,
                               cache, seed_name = NULL,
-                              ncores = 1)  {
+                              ncores = 1, significance_level = 0.95,
+                              p_adjust = "bonferroni")  {
+  if(ppi == "biogrid") {
+    g <- prep_biogrid(cache = cache)
+  } else if (ppi == "stringdb") {
+    g <- prep_stringdb(cache = cache)
+  } else {
+    stop("ppi must be either 'biogrid' or 'stringdb'")
+  }
+
+  w <- igraph::as_adjacency_matrix(g) #sparse adjacency matrix.
+
+  #Compute p given seed proteins
+  p_seed <- sparseRWR(w = w, seed_proteins = seed_proteins, gamma = gamma,
+                      eps = eps, tmax = tmax, norm = norm)
+  p_vec <- p_seed[[1]]
+  p_df <- tibble::as_tibble(p_vec, rownames = "gene_id")
+  colnames(p_df)[colnames(p_df) == "value"] <- "p_test"
+
+  #compute null distribution
+  null_dist <- bootstrap_null(seed_proteins = seed_proteins, g = g, n = n,
+                              gamma = gamma, eps = eps, tmax = tmax,
+                              norm = norm, set_seed = set_seed, cache = cache,
+                              seed_name = seed_name, ncores = ncores)
+  null_df <- null_dist[[1]]
+
+  df <- dplyr::left_join(null_df, p_df)
+
+  #compute the Z-score and p-value
+  df <- dplyr::mutate(df,
+                      Z = (p_test - mean_p)/ stdev_p,
+                      p_value = 2*pnorm(-abs(Z)),
+                      adj_p_value = p.adjust(p_value, method = p_adjust))
+  df <- dplyr::filter(df, adj_p_value < 1-significance_level)
+
+  return(df)
+
 
 }
+
diff --git a/R/create_null.R b/R/create_null.R
index 53a6cb2..6accf51 100644
--- a/R/create_null.R
+++ b/R/create_null.R
@@ -19,7 +19,7 @@
 #'
 #' @export
 
-bootstrap_null <- function(seed_proteins, ppi = "stringdb", n = 10000,
+bootstrap_null <- function(seed_proteins, g, n = 1000,
                            gamma=0.6, eps = 1e-10, tmax = 1000,
                            norm = TRUE, set_seed,
                            cache, seed_name = NULL,
@@ -29,15 +29,6 @@ bootstrap_null <- function(seed_proteins, ppi = "stringdb", n = 10000,
   if(file.exists(paste0(cache, "/", seed_name, "null_dist.Rda"))) {
     load(file = paste0(cache, "/", seed_name, "null_dist.Rda"))
   } else{
-
-    if(ppi == "biogrid") {
-      g <- prep_biogrid(cache = cache)
-    } else if (ppi == "stringdb") {
-      g <- prep_stringdb(cache = cache)
-    } else {
-      stop("ppi must be either 'biogrid' or 'stringdb'")
-    }
-
     w <- igraph::as_adjacency_matrix(g) #sparse adjacency matrix.
 
     #generate list of degree-similar seed protein vectors.
@@ -62,7 +53,8 @@ bootstrap_null <- function(seed_proteins, ppi = "stringdb", n = 10000,
       null_dist <- dplyr::bind_rows(null_dist)
     }
 
-    null_dist <- dist_calc(null_dist, ncores = ncores)
+    null_dist <- dist_calc(null_dist, ncores = ncores,
+                           seed_proteins = seed_proteins)
 
     out <- list(null_dist, seed_proteins)
 
@@ -129,7 +121,7 @@ match_seeds <- function(g, seed_proteins, n, set_seed = NULL) {
 #' @param df : numeric vector
 #' @return a 3-column dataframe (gene, )
 
-dist_calc <- function(df, ncores) {
+dist_calc <- function(df, ncores, seed_proteins) {
   if(ncores > 1){
     null_dist <- tidyr::pivot_longer(df, cols = -run_number, names_to = "gene_id", values_to = "p")
   } else {
@@ -139,8 +131,7 @@ dist_calc <- function(df, ncores) {
     dplyr::group_by(gene_id) %>%
     dplyr::summarise(mean_p = mean(p),
                      stdev_p = sd(p),
-                     nobs = dplyr::n())
-
+                     nobs = dplyr::n()) %>%
+    dplyr::mutate(seed = ifelse(gene_id %in% seed_proteins, "yes", "no"))
   return(null_dist)
-
 }
diff --git a/R/ppi_ingest.R b/R/ppi_ingest.R
index 1f6e71f..11a729d 100644
--- a/R/ppi_ingest.R
+++ b/R/ppi_ingest.R
@@ -94,6 +94,9 @@ prep_biogrid <- function(cache = NULL) {
 #' @inheritParams prep_stringdb
 #'
 #' @return directory on users computer containing the different adjacency matrices for future use.
+#'
+#' @export
+#'
 setup_init <- function(cache = NULL) {
 
   #Functons are written to return a tibble - this use will ensure a df is not printed
diff --git a/man/bootstrap_null.Rd b/man/bootstrap_null.Rd
index ff95631..08a4c34 100644
--- a/man/bootstrap_null.Rd
+++ b/man/bootstrap_null.Rd
@@ -5,15 +5,44 @@
 \title{Bootstrap null distribution for significance testing}
 \usage{
 bootstrap_null(
-  ppi = "biogrid",
-  n = 1000,
   seed_proteins,
-  gamma,
+  ppi = "stringdb",
+  n = 10000,
+  gamma = 0.6,
   eps = 1e-10,
   tmax = 1000,
-  norm = TRUE
+  norm = TRUE,
+  set_seed,
+  cache,
+  seed_name = NULL,
+  ncores = 1
 )
 }
+\arguments{
+\item{ppi}{specifies which protein-protein interaction network will be used - currently only "stringdb" is supported}
+
+\item{n}{number of random walks with repeats}
+
+\item{set_seed}{integer to set random number seed - for reproducibility}
+
+\item{seed_name}{Name to give the cached null distribution - must be a character string}
+
+\item{ncores}{= 1}
+
+\item{...}{
+  Arguments passed on to \code{\link[=sparseRWR]{sparseRWR}}, \code{\link[=setup_init]{setup_init}}
+  \describe{
+    \item{\code{seed_proteins}}{user defined seed proteins}
+    \item{\code{gamma}}{restart probability}
+    \item{\code{eps}}{maximum allowed difference between the computed probabilities at the steady state}
+    \item{\code{norm}}{if True, w is normalized by dividing each value by the column sum.}
+    \item{\code{tmax}}{the maximum number of iterations for the RWR}
+    \item{\code{cache}}{A filepath to a folder downloaded files should be stored, inherits from user-available functions}
+  }}
+}
 \description{
-Bootstrap null distribution for significance testing
+This function will generate a bootstrapped null distribution to identify
+signficant vertices in a PPI given a set of user-defined seed proteins.
+Bootstrapping is done by performing random walk with repeats repeatedly over "random"
+sets of seed proteins. Degree distribution of user-provided seeds is used to inform sampling.
 }
diff --git a/man/prep_stringdb.Rd b/man/prep_stringdb.Rd
index ee3d331..c9ac50e 100644
--- a/man/prep_stringdb.Rd
+++ b/man/prep_stringdb.Rd
@@ -4,12 +4,18 @@
 \alias{prep_stringdb}
 \title{Prepare Stringdb for use in analyses}
 \usage{
-prep_stringdb(cache = NULL, edb = EnsDb.Hsapiens.v79::EnsDb.Hsapiens.v79)
+prep_stringdb(
+  cache = NULL,
+  edb = EnsDb.Hsapiens.v79::EnsDb.Hsapiens.v79,
+  min_score = NULL
+)
 }
 \arguments{
 \item{cache}{A filepath to a folder downloaded files should be stored, inherits from user-available functions}
 
 \item{edb}{ensemble database object}
+
+\item{min_score}{minimum connectivity score for each edge in the network.}
 }
 \value{
 list containing Adjacency matrix from stringdb dataset and igraph object built from the adjacency matrix.