From 76a6d183bc6d0a73d50e8c6e36ffd08f0b2d6c3a Mon Sep 17 00:00:00 2001 From: Dave Weaver Date: Tue, 19 Jan 2021 14:26:13 -0500 Subject: [PATCH] first working version!!!! --- R/compute_crosstalk.R | 41 +++++++++++++++++++++++++++++++++++++++-- R/create_null.R | 21 ++++++--------------- R/ppi_ingest.R | 3 +++ man/bootstrap_null.Rd | 39 ++++++++++++++++++++++++++++++++++----- man/prep_stringdb.Rd | 8 +++++++- 5 files changed, 89 insertions(+), 23 deletions(-) diff --git a/R/compute_crosstalk.R b/R/compute_crosstalk.R index fe5445c..d9526e9 100644 --- a/R/compute_crosstalk.R +++ b/R/compute_crosstalk.R @@ -9,10 +9,47 @@ #' #' @export -compute_crosstalk <- function(seed_proteins, ppi = "stringdb", n = 10000, +compute_crosstalk <- function(seed_proteins, ppi = "stringdb", n = 1000, gamma=0.6, eps = 1e-10, tmax = 1000, norm = TRUE, set_seed, cache, seed_name = NULL, - ncores = 1) { + ncores = 1, significance_level = 0.95, + p_adjust = "bonferroni") { + if(ppi == "biogrid") { + g <- prep_biogrid(cache = cache) + } else if (ppi == "stringdb") { + g <- prep_stringdb(cache = cache) + } else { + stop("ppi must be either 'biogrid' or 'stringdb'") + } + + w <- igraph::as_adjacency_matrix(g) #sparse adjacency matrix. + + #Compute p given seed proteins + p_seed <- sparseRWR(w = w, seed_proteins = seed_proteins, gamma = gamma, + eps = eps, tmax = tmax, norm = norm) + p_vec <- p_seed[[1]] + p_df <- tibble::as_tibble(p_vec, rownames = "gene_id") + colnames(p_df)[colnames(p_df) == "value"] <- "p_test" + + #compute null distribution + null_dist <- bootstrap_null(seed_proteins = seed_proteins, g = g, n = n, + gamma = gamma, eps = eps, tmax = tmax, + norm = norm, set_seed = set_seed, cache = cache, + seed_name = seed_name, ncores = ncores) + null_df <- null_dist[[1]] + + df <- dplyr::left_join(null_df, p_df) + + #compute the Z-score and p-value + df <- dplyr::mutate(df, + Z = (p_test - mean_p)/ stdev_p, + p_value = 2*pnorm(-abs(Z)), + adj_p_value = p.adjust(p_value, method = p_adjust)) + df <- dplyr::filter(df, adj_p_value < 1-significance_level) + + return(df) + } + diff --git a/R/create_null.R b/R/create_null.R index 53a6cb2..6accf51 100644 --- a/R/create_null.R +++ b/R/create_null.R @@ -19,7 +19,7 @@ #' #' @export -bootstrap_null <- function(seed_proteins, ppi = "stringdb", n = 10000, +bootstrap_null <- function(seed_proteins, g, n = 1000, gamma=0.6, eps = 1e-10, tmax = 1000, norm = TRUE, set_seed, cache, seed_name = NULL, @@ -29,15 +29,6 @@ bootstrap_null <- function(seed_proteins, ppi = "stringdb", n = 10000, if(file.exists(paste0(cache, "/", seed_name, "null_dist.Rda"))) { load(file = paste0(cache, "/", seed_name, "null_dist.Rda")) } else{ - - if(ppi == "biogrid") { - g <- prep_biogrid(cache = cache) - } else if (ppi == "stringdb") { - g <- prep_stringdb(cache = cache) - } else { - stop("ppi must be either 'biogrid' or 'stringdb'") - } - w <- igraph::as_adjacency_matrix(g) #sparse adjacency matrix. #generate list of degree-similar seed protein vectors. @@ -62,7 +53,8 @@ bootstrap_null <- function(seed_proteins, ppi = "stringdb", n = 10000, null_dist <- dplyr::bind_rows(null_dist) } - null_dist <- dist_calc(null_dist, ncores = ncores) + null_dist <- dist_calc(null_dist, ncores = ncores, + seed_proteins = seed_proteins) out <- list(null_dist, seed_proteins) @@ -129,7 +121,7 @@ match_seeds <- function(g, seed_proteins, n, set_seed = NULL) { #' @param df : numeric vector #' @return a 3-column dataframe (gene, ) -dist_calc <- function(df, ncores) { +dist_calc <- function(df, ncores, seed_proteins) { if(ncores > 1){ null_dist <- tidyr::pivot_longer(df, cols = -run_number, names_to = "gene_id", values_to = "p") } else { @@ -139,8 +131,7 @@ dist_calc <- function(df, ncores) { dplyr::group_by(gene_id) %>% dplyr::summarise(mean_p = mean(p), stdev_p = sd(p), - nobs = dplyr::n()) - + nobs = dplyr::n()) %>% + dplyr::mutate(seed = ifelse(gene_id %in% seed_proteins, "yes", "no")) return(null_dist) - } diff --git a/R/ppi_ingest.R b/R/ppi_ingest.R index 1f6e71f..11a729d 100644 --- a/R/ppi_ingest.R +++ b/R/ppi_ingest.R @@ -94,6 +94,9 @@ prep_biogrid <- function(cache = NULL) { #' @inheritParams prep_stringdb #' #' @return directory on users computer containing the different adjacency matrices for future use. +#' +#' @export +#' setup_init <- function(cache = NULL) { #Functons are written to return a tibble - this use will ensure a df is not printed diff --git a/man/bootstrap_null.Rd b/man/bootstrap_null.Rd index ff95631..08a4c34 100644 --- a/man/bootstrap_null.Rd +++ b/man/bootstrap_null.Rd @@ -5,15 +5,44 @@ \title{Bootstrap null distribution for significance testing} \usage{ bootstrap_null( - ppi = "biogrid", - n = 1000, seed_proteins, - gamma, + ppi = "stringdb", + n = 10000, + gamma = 0.6, eps = 1e-10, tmax = 1000, - norm = TRUE + norm = TRUE, + set_seed, + cache, + seed_name = NULL, + ncores = 1 ) } +\arguments{ +\item{ppi}{specifies which protein-protein interaction network will be used - currently only "stringdb" is supported} + +\item{n}{number of random walks with repeats} + +\item{set_seed}{integer to set random number seed - for reproducibility} + +\item{seed_name}{Name to give the cached null distribution - must be a character string} + +\item{ncores}{= 1} + +\item{...}{ + Arguments passed on to \code{\link[=sparseRWR]{sparseRWR}}, \code{\link[=setup_init]{setup_init}} + \describe{ + \item{\code{seed_proteins}}{user defined seed proteins} + \item{\code{gamma}}{restart probability} + \item{\code{eps}}{maximum allowed difference between the computed probabilities at the steady state} + \item{\code{norm}}{if True, w is normalized by dividing each value by the column sum.} + \item{\code{tmax}}{the maximum number of iterations for the RWR} + \item{\code{cache}}{A filepath to a folder downloaded files should be stored, inherits from user-available functions} + }} +} \description{ -Bootstrap null distribution for significance testing +This function will generate a bootstrapped null distribution to identify +signficant vertices in a PPI given a set of user-defined seed proteins. +Bootstrapping is done by performing random walk with repeats repeatedly over "random" +sets of seed proteins. Degree distribution of user-provided seeds is used to inform sampling. } diff --git a/man/prep_stringdb.Rd b/man/prep_stringdb.Rd index ee3d331..c9ac50e 100644 --- a/man/prep_stringdb.Rd +++ b/man/prep_stringdb.Rd @@ -4,12 +4,18 @@ \alias{prep_stringdb} \title{Prepare Stringdb for use in analyses} \usage{ -prep_stringdb(cache = NULL, edb = EnsDb.Hsapiens.v79::EnsDb.Hsapiens.v79) +prep_stringdb( + cache = NULL, + edb = EnsDb.Hsapiens.v79::EnsDb.Hsapiens.v79, + min_score = NULL +) } \arguments{ \item{cache}{A filepath to a folder downloaded files should be stored, inherits from user-available functions} \item{edb}{ensemble database object} + +\item{min_score}{minimum connectivity score for each edge in the network.} } \value{ list containing Adjacency matrix from stringdb dataset and igraph object built from the adjacency matrix.