Commit
Pulled over a lot of the wrappers from scRNAseq.
LTLA committed Feb 26, 2024
1 parent 4f79115 commit 885bd49
Showing 12 changed files with 389 additions and 19 deletions.
15 changes: 15 additions & 0 deletions NAMESPACE
@@ -7,20 +7,30 @@ export(ImmGenData)
export(MonacoImmuneData)
export(MouseRNAseqData)
export(NovershternHematopoieticData)
export(defineTextQuery)
export(fetchLatestVersion)
export(fetchMetadata)
export(fetchReference)
export(listReferences)
export(listVersions)
export(saveReference)
export(searchReferences)
export(surveyReferences)
importClassesFrom(Matrix,dgCMatrix)
importClassesFrom(Matrix,lgCMatrix)
importClassesFrom(alabaster.matrix,ReloadedArray)
importFrom(AnnotationDbi,mapIds)
importFrom(AnnotationHub,AnnotationHub)
importFrom(DBI,dbConnect)
importFrom(DBI,dbDisconnect)
importFrom(DBI,dbGetQuery)
importFrom(DelayedArray,DelayedArray)
importFrom(DelayedArray,is_sparse)
importFrom(DelayedArray,type)
importFrom(DelayedMatrixStats,colAnyNAs)
importFrom(DelayedMatrixStats,rowAnyNAs)
importFrom(ExperimentHub,ExperimentHub)
importFrom(RSQLite,SQLite)
importFrom(S4Vectors,DataFrame)
importFrom(SingleCellExperiment,"reducedDim<-")
importFrom(SingleCellExperiment,reducedDim)
@@ -34,9 +44,14 @@ importFrom(alabaster.base,readObject)
importFrom(alabaster.base,readObjectFile)
importFrom(alabaster.base,saveObject)
importFrom(gypsum,cacheDirectory)
importFrom(gypsum,defineTextQuery)
importFrom(gypsum,fetchLatest)
importFrom(gypsum,fetchMetadataDatabase)
importFrom(gypsum,fetchMetadataSchema)
importFrom(gypsum,listAssets)
importFrom(gypsum,saveFile)
importFrom(gypsum,saveVersion)
importFrom(gypsum,searchMetadataTextFilter)
importFrom(gypsum,validateMetadata)
importFrom(jsonlite,fromJSON)
importFrom(jsonlite,toJSON)
14 changes: 8 additions & 6 deletions R/fetchReference.R
@@ -1,10 +1,10 @@
#' Fetch a dataset from the gypsum backend
#' Fetch a reference dataset
#'
#' Fetch a dataset (or its metadata) from the gypsum backend.
#' Fetch a reference dataset (or its metadata) from the gypsum backend.
#'
#' @param name String containing the name of the dataset.
#' @param name String containing the name of the reference dataset.
#' @param version String containing the version of the dataset.
#' @param path String containing the path to a subdataset, if \code{name} contains multiple datasets.
#' @param path String containing the path to a subdataset, if \code{name} contains multiple reference datasets.
#' Defaults to \code{NA} if no subdatasets are present.
#' @param package String containing the name of the package.
#' @param cache,overwrite Arguments to pass to \code{\link[gypsum]{saveVersion}} or \code{\link[gypsum]{saveFile}}.
@@ -13,6 +13,8 @@
#' @param ... Further arguments to pass to \code{\link{readObject}}.
#'
#' @return \code{fetchReference} returns the dataset as a \linkS4class{SummarizedExperiment}.
#' This is guaranteed to have a \code{"logcounts"} assay with log-normalized expression values,
#' along with at least one character vector of labels in the column data.
#'
#' \code{fetchMetadata} returns a named list of metadata for the specified dataset.
#'
@@ -40,14 +42,14 @@ fetchReference <- function(name, version, path=NA, package="celldex", cache=cach

old <- altReadObjectFunction(cdLoadObject)
on.exit(altReadObjectFunction(old))
altReadObject(obj_path, scRNAseq.realize.assays=realize.assays, scRNAseq.realize.reduced.dims=realize.reduced.dims, ...)
altReadObject(obj_path, celldex.realize.assays=realize.assays, ...)
}

#' @export
#' @rdname fetchReference
#' @importFrom jsonlite fromJSON
#' @importFrom gypsum cacheDirectory saveFile
fetchMetadata <- function(name, version, path=NA, package="scRNAseq", cache=cacheDirectory(), overwrite=FALSE) {
fetchMetadata <- function(name, version, path=NA, package="celldex", cache=cacheDirectory(), overwrite=FALSE) {
remote_path <- "_bioconductor.json"
if (!is.na(path)) {
remote_path <- paste0(path, "/", remote_path)
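For orientation, a hedged usage sketch of the two fetchers above. The "immgen" name comes from the package's own examples; the version string is a placeholder for illustration only (the real versions can be listed with listVersions(), added below):

library(celldex)
# "2024-02-26" is a hypothetical version string; resolve a real one
# with fetchLatestVersion("immgen") in practice.
ref <- fetchReference("immgen", "2024-02-26")
assayNames(ref)      # guaranteed to include "logcounts"
head(colData(ref))   # at least one character column of labels

# Metadata only, without materializing the dataset itself.
meta <- fetchMetadata("immgen", "2024-02-26")
meta$title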
38 changes: 38 additions & 0 deletions R/listReferences.R
@@ -0,0 +1,38 @@
#' List available references
#'
#' List the available reference datasets and the associated versions in \pkg{celldex}.
#'
#' @param name String containing the name of the reference dataset.
#'
#' @return
#' For \code{listReferences}, a character vector containing the names of the available references.
#'
#' For \code{listVersions}, a character vector containing the names of the available versions of the \code{name} reference.
#'
#' For \code{fetchLatestVersion}, a string containing the name of the latest version.
#'
#' @author Aaron Lun
#'
#' @examples
#' listReferences()
#' listVersions("immgen")
#' fetchLatestVersion("immgen")
#'
#' @export
#' @importFrom gypsum listAssets
listReferences <- function() {
listAssets("celldex")
}

#' @export
#' @rdname listReferences
listVersions <- function(name) {
gypsum::listVersions("celldex", name)
}

#' @export
#' @rdname listReferences
#' @importFrom gypsum fetchLatest
fetchLatestVersion <- function(name) {
fetchLatest("celldex", name)
}
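A hedged sketch chaining these listing helpers with fetchReference(), assuming the "immgen" asset from the examples above is available on the backend:

library(celldex)
refs <- listReferences()
stopifnot("immgen" %in% refs)
v <- fetchLatestVersion("immgen")   # newest version, no hard-coding
ref <- fetchReference("immgen", v)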
18 changes: 14 additions & 4 deletions R/saveReference.R
@@ -1,6 +1,6 @@
#' Save a dataset to disk
#' Save a reference dataset
#'
#' Save a dataset to disk, usually in preparation for upload via \code{\link{uploadDirectory}}.
#' Save a reference dataset to disk, usually in preparation for upload via \code{\link{uploadDirectory}}.
#'
#' @param x Matrix of log-normalized expression values.
#' This may be sparse or dense, but should be non-integer and have no missing values.
@@ -9,14 +9,21 @@
#' Each row of \code{labels} corresponds to a column of \code{x} and contains the label(s) for that column.
#' Each column of \code{labels} represents a different label type;
#' typically, the column name has a \code{label.} prefix to distinguish between, e.g., \code{label.fine}, \code{label.broad} and so on.
#' At least one column should be present.
#' @param path String containing the path to a new directory in which save the dataset.
#' @param path String containing the path to a directory in which to save \code{x}.
#' @param metadata Named list containing metadata for this dataset,
#' @param metadata Named list containing metadata for this reference dataset,
#' see the schema returned by \code{\link{fetchMetadataSchema}()}.
#' Note that the \code{applications.takane} property will be automatically added by this function and does not have to be supplied.
#'
#' @return \code{x} and \code{labels} are used to create a \linkS4class{SummarizedExperiment} that is saved into \code{path}.
#' \code{NULL} is invisibly returned.
#'
#' @details
#' The SummarizedExperiment saved to \code{path} is guaranteed to have the \code{"logcounts"} assay and at least one column in \code{labels}.
#' This mirrors the expectation for reference datasets obtained via \code{\link{fetchReference}}.
#'
#' @author Aaron Lun
#' @examples
#' # Mocking up some data to be saved.
@@ -73,6 +80,9 @@ saveReference <- function(x, labels, path, metadata) {
validateMetadata(metadata, schema) # First validation for user-supplied content.

# Check that all labels are categorical.
if (ncol(labels) == 0L) {
stop("'labels' should contain at least one column")
}
for (cn in colnames(labels)) {
if (!is.character(labels[[cn]])) {
stop("all columns of 'labels' should be character vectors")
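A hedged end-to-end sketch of saveReference() under the new checks. The metadata fields below are assumptions modelled on the Bioconductor schema; consult fetchMetadataSchema() for the authoritative required set:

library(celldex)
library(S4Vectors)
# Log-normalized values plus at least one character label column,
# satisfying the validation added above.
x <- matrix(rnorm(1000), ncol=10)
rownames(x) <- sprintf("GENE_%i", seq_len(nrow(x)))
labels <- DataFrame(label.fine=sample(c("B", "T", "NK"), ncol(x), replace=TRUE))

# Hypothetical metadata; real entries must validate against the schema.
meta <- list(
    title="My reference",
    description="A mock reference for illustration.",
    taxonomy_id=list("10090"),
    genome=list("GRCm38"),
    sources=list(list(provider="GEO", id="GSE00000")),
    maintainer_name="Your Name",
    maintainer_email="your.name@example.com"
)

saveReference(x, labels, file.path(tempdir(), "my_ref"), meta)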
61 changes: 61 additions & 0 deletions R/searchReferences.R
@@ -0,0 +1,61 @@
#' Search reference metadata
#'
#' Search for reference datasets of interest based on matching text in the associated metadata.
#'
#' @param query String or a \code{gypsum.search.object}, see Examples.
#' @inheritParams surveyReferences
#'
#' @return
#' A \linkS4class{DataFrame} where each row corresponds to a dataset, containing various columns of metadata.
#' Some columns may be lists to capture 1:many mappings.
#'
#' @details
#' The returned DataFrame contains the usual suspects like the title and description for each dataset,
#' the number of samples and types of labels, the organisms and genome builds involved, and so on.
#' More details can be found in the Bioconductor metadata schema at \url{https://github.com/ArtifactDB/bioconductor-metadata-index}.
#'
#' @author Aaron Lun
#'
#' @examples
#' searchReferences(defineTextQuery("immun%", partial=TRUE))[,c("name", "title")]
#' searchReferences(defineTextQuery("10090", field="taxonomy_id"))[,c("name", "title")]
#' searchReferences(
#' defineTextQuery("10090", field="taxonomy_id") &
#' defineTextQuery("immun%", partial=TRUE)
#' )[,c("name", "title")]
#'
#' @seealso
#' \code{\link{surveyReferences}}, to easily obtain a listing of all available datasets.
#' @export
#' @importFrom S4Vectors DataFrame
#' @importFrom gypsum cacheDirectory fetchMetadataDatabase searchMetadataTextFilter
#' @importFrom DBI dbConnect dbDisconnect dbGetQuery
#' @importFrom RSQLite SQLite
searchReferences <- function(query, cache=cacheDirectory(), overwrite=FALSE, latest=TRUE) {
filter <- searchMetadataTextFilter(query)

bpath <- fetchMetadataDatabase(cache=cache, overwrite=overwrite)
con <- dbConnect(SQLite(), bpath)
on.exit(dbDisconnect(con))

stmt <- "SELECT json_extract(metadata, '$') AS meta, versions.asset AS asset, versions.version AS version, path";
if (!latest) {
stmt <- paste0(stmt, ", versions.latest AS latest")
}
stmt <- paste0(stmt, " FROM paths LEFT JOIN versions ON paths.vid = versions.vid WHERE versions.project = 'scRNAseq'")
if (latest) {
stmt <- paste0(stmt, " AND versions.latest = 1")
}
if (!is.null(filter)) {
stmt <- paste0(stmt, " AND ", filter$where)
everything <- dbGetQuery(con, stmt, params=filter$parameters)
} else {
everything <- dbGetQuery(con, stmt)
}

sanitize_query_to_output(everything, latest)
}

#' @export
gypsum::defineTextQuery
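Beyond the roxygen examples, two hedged usage notes: a plain string is accepted as the query, and latest=FALSE (inherited from surveyReferences) also returns superseded versions. This assumes "immune" actually occurs in some reference's metadata text:

library(celldex)
# Free-text search across all metadata fields.
hits <- searchReferences("immune")
hits[, c("name", "version", "title")]

# Include older uploads too; the extra 'latest' column marks the
# current version of each reference.
all.hits <- searchReferences("immune", latest=FALSE)
all.hits[, c("name", "version", "latest")]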
112 changes: 112 additions & 0 deletions R/surveyReferences.R
@@ -0,0 +1,112 @@
#' Survey reference metadata
#'
#' Survey the metadata for all reference datasets available from the \pkg{celldex} package.
#'
#' @param cache,overwrite Arguments to pass to \code{\link{fetchMetadataDatabase}}.
#' @param latest Whether to only consider the latest version of each dataset.
#'
#' @return
#' A \linkS4class{DataFrame} where each row corresponds to a dataset, containing various columns of metadata.
#' Some columns may be lists to capture 1:many mappings.
#'
#' @details
#' The returned DataFrame contains the usual suspects like the title and description for each dataset,
#' the number of samples and types of labels, the organisms and genome builds involved, and so on.
#' More details can be found in the Bioconductor metadata schema at \url{https://github.com/ArtifactDB/bioconductor-metadata-index}.
#'
#' @author Aaron Lun
#'
#' @examples
#' surveyReferences()
#'
#' @seealso
#' \code{\link{searchReferences}}, to search on the metadata for specific datasets.
#'
#' @export
#' @importFrom S4Vectors DataFrame
#' @importFrom gypsum cacheDirectory fetchMetadataDatabase
#' @importFrom DBI dbConnect dbDisconnect dbGetQuery
#' @importFrom RSQLite SQLite
surveyReferences <- function(cache=cacheDirectory(), overwrite=FALSE, latest=TRUE) {
bpath <- fetchMetadataDatabase(cache=cache, overwrite=overwrite)
con <- dbConnect(SQLite(), bpath)
on.exit(dbDisconnect(con))

stmt <- "SELECT json_extract(metadata, '$') AS meta, versions.asset AS asset, versions.version AS version, path";
if (!latest) {
stmt <- paste0(stmt, ", versions.latest AS latest")
}
stmt <- paste0(stmt, " FROM paths LEFT JOIN versions ON paths.vid = versions.vid WHERE versions.project = 'celldex'")
if (latest) {
stmt <- paste0(stmt, " AND versions.latest = 1")
}
everything <- dbGetQuery(con, stmt)

sanitize_query_to_output(everything, latest)
}
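A hedged sketch of slicing the survey output; since taxonomy_id is a list column (see sanitize_query_to_output below), membership is tested element-wise:

library(celldex)
df <- surveyReferences()
# Keep only mouse references (taxonomy ID 10090).
is.mouse <- vapply(df$taxonomy_id, function(t) "10090" %in% t, logical(1))
df[is.mouse, c("name", "title", "samples")]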

#' @importFrom S4Vectors DataFrame
#' @importFrom jsonlite fromJSON
sanitize_query_to_output <- function(results, latest, meta.name="meta") {
path <- results$path
has.slash <- grepl("/", path)
path[!has.slash] <- NA_character_
path[has.slash] <- sub("/[^/]+$", "", path[has.slash])
df <- DataFrame(name = results$asset, version = results$version, path = path)
if (!latest) {
df$latest <- results$latest == 1
}

all_meta <- lapply(results[[meta.name]], fromJSON, simplifyVector=FALSE)
df$title <- extract_atomic_from_json(all_meta, function(x) x$title, "character")
df$description <- extract_atomic_from_json(all_meta, function(x) x$description, "character")
df$taxonomy_id <- extract_charlist_from_json(all_meta, function(x) x$taxonomy_id)
df$genome <- extract_charlist_from_json(all_meta, function(x) x$genome)

df$samples <- extract_atomic_from_json(all_meta, function(x) x$applications$takane$summarized_experiment$columns, "integer")
df$labels <- extract_charlist_from_json(all_meta, function(x) x$applications$takane$summarized_experiment$column_annotations)

df$bioconductor_version <- extract_atomic_from_json(all_meta, function(x) x$bioconductor_version, "character")
df$maintainer_name <- extract_atomic_from_json(all_meta, function(x) x$maintainer_name, "character")
df$maintainer_email <- extract_atomic_from_json(all_meta, function(x) x$maintainer_email, "character")

sources <- vector("list", length(all_meta))
for (i in seq_along(all_meta)) {
cursources <- all_meta[[i]]$sources
if (is.null(cursources)) {
sources[[i]] <- DataFrame(provider=character(0), id=character(0), version=character(0))
} else {
sources[[i]] <- DataFrame(
provider = extract_atomic_from_json(cursources, function(x) x$provider, "character"),
id = extract_atomic_from_json(cursources, function(x) x$id, "character"),
version = extract_atomic_from_json(cursources, function(x) x$version, "character")
)
}
}
df$sources <- as(sources, "CompressedList")

df
}

extract_atomic_from_json <- function(metadata, extract, type) {
vapply(metadata, function(y) {
x <- extract(y)
if (is.null(x)) {
as(NA, type)
} else {
x
}
}, vector(type, 1))
}

extract_charlist_from_json <- function(metadata, extract) {
output <- lapply(metadata, function(y) {
x <- extract(y)
if (is.null(x)) {
character(0)
} else {
x
}
})
as(output, "CompressedList")
}
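A tiny mock (not part of the commit) showing how the two internal helpers above handle missing fields:

# Hypothetical metadata entries, for illustration only.
mock <- list(
    list(title="A", taxonomy_id="10090"),
    list(taxonomy_id=c("9606", "10090")) # no 'title' field
)
extract_atomic_from_json(mock, function(x) x$title, "character")
# expected: c("A", NA)
extract_charlist_from_json(mock, function(x) x$taxonomy_id)
# expected: a CharacterList holding "10090" and c("9606", "10090")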
10 changes: 5 additions & 5 deletions man/fetchReference.Rd
