Commit
Pulled over a lot of the wrappers from scRNAseq.
LTLA committed Feb 26, 2024
1 parent 4f79115 commit 885bd49
Showing 12 changed files with 389 additions and 19 deletions.
15 changes: 15 additions & 0 deletions NAMESPACE
@@ -7,20 +7,30 @@ export(ImmGenData)
export(MonacoImmuneData)
export(MouseRNAseqData)
export(NovershternHematopoieticData)
export(defineTextQuery)
export(fetchLatestVersion)
export(fetchMetadata)
export(fetchReference)
export(listReferences)
export(listVersions)
export(saveReference)
export(searchReferences)
export(surveyReferences)
importClassesFrom(Matrix,dgCMatrix)
importClassesFrom(Matrix,lgCMatrix)
importClassesFrom(alabaster.matrix,ReloadedArray)
importFrom(AnnotationDbi,mapIds)
importFrom(AnnotationHub,AnnotationHub)
importFrom(DBI,dbConnect)
importFrom(DBI,dbDisconnect)
importFrom(DBI,dbGetQuery)
importFrom(DelayedArray,DelayedArray)
importFrom(DelayedArray,is_sparse)
importFrom(DelayedArray,type)
importFrom(DelayedMatrixStats,colAnyNAs)
importFrom(DelayedMatrixStats,rowAnyNAs)
importFrom(ExperimentHub,ExperimentHub)
importFrom(RSQLite,SQLite)
importFrom(S4Vectors,DataFrame)
importFrom(SingleCellExperiment,"reducedDim<-")
importFrom(SingleCellExperiment,reducedDim)
@@ -34,9 +44,14 @@ importFrom(alabaster.base,readObject)
importFrom(alabaster.base,readObjectFile)
importFrom(alabaster.base,saveObject)
importFrom(gypsum,cacheDirectory)
importFrom(gypsum,defineTextQuery)
importFrom(gypsum,fetchLatest)
importFrom(gypsum,fetchMetadataDatabase)
importFrom(gypsum,fetchMetadataSchema)
importFrom(gypsum,listAssets)
importFrom(gypsum,saveFile)
importFrom(gypsum,saveVersion)
importFrom(gypsum,searchMetadataTextFilter)
importFrom(gypsum,validateMetadata)
importFrom(jsonlite,fromJSON)
importFrom(jsonlite,toJSON)
14 changes: 8 additions & 6 deletions R/fetchReference.R
@@ -1,10 +1,10 @@
#' Fetch a dataset from the gypsum backend
#' Fetch a reference dataset
#'
#' Fetch a dataset (or its metadata) from the gypsum backend.
#' Fetch a reference dataset (or its metadata) from the gypsum backend.
#'
#' @param name String containing the name of the dataset.
#' @param name String containing the name of the reference dataset.
#' @param version String containing the version of the dataset.
#' @param path String containing the path to a subdataset, if \code{name} contains multiple datasets.
#' @param path String containing the path to a subdataset, if \code{name} contains multiple reference datasets.
#' Defaults to \code{NA} if no subdatasets are present.
#' @param package String containing the name of the package.
#' @param cache,overwrite Arguments to pass to \code{\link[gypsum]{saveVersion}} or \code{\link[gypsum]{saveFile}}.
@@ -13,6 +13,8 @@
#' @param ... Further arguments to pass to \code{\link{readObject}}.
#'
#' @return \code{fetchReference} returns the dataset as a \linkS4class{SummarizedExperiment}.
#' This is guaranteed to have a \code{"logcounts"} assay with log-normalized expression values,
#' along with at least one character vector of labels in the column data.
#'
#' \code{fetchMetadata} returns a named list of metadata for the specified dataset.
#'
@@ -40,14 +42,14 @@ fetchReference <- function(name, version, path=NA, package="celldex", cache=cach

old <- altReadObjectFunction(cdLoadObject)
on.exit(altReadObjectFunction(old))
altReadObject(obj_path, scRNAseq.realize.assays=realize.assays, scRNAseq.realize.reduced.dims=realize.reduced.dims, ...)
altReadObject(obj_path, celldex.realize.assays=realize.assays, ...)
}

#' @export
#' @rdname fetchReference
#' @importFrom jsonlite fromJSON
#' @importFrom gypsum cacheDirectory saveFile
fetchMetadata <- function(name, version, path=NA, package="scRNAseq", cache=cacheDirectory(), overwrite=FALSE) {
fetchMetadata <- function(name, version, path=NA, package="celldex", cache=cacheDirectory(), overwrite=FALSE) {
remote_path <- "_bioconductor.json"
if (!is.na(path)) {
remote_path <- paste0(path, "/", remote_path)
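For orientation, a hedged usage sketch of the two fetchers above. The "immgen" name comes from the package's own examples; the version string is a placeholder for illustration only (the real versions can be listed with listVersions(), added below):

library(celldex)
# "2024-02-26" is a hypothetical version string; resolve a real one
# with fetchLatestVersion("immgen") in practice.
ref <- fetchReference("immgen", "2024-02-26")
assayNames(ref)      # guaranteed to include "logcounts"
head(colData(ref))   # at least one character column of labels

# Metadata only, without materializing the dataset itself.
meta <- fetchMetadata("immgen", "2024-02-26")
meta$title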
38 changes: 38 additions & 0 deletions R/listReferences.R
@@ -0,0 +1,38 @@
#' List available references
#'
#' List the available reference datasets and the associated versions in \pkg{celldex}.
#'
#' @param name String containing the name of the reference dataset.
#'
#' @return
#' For \code{listReferences}, a character vector containing the names of the available references.
#'
#' For \code{listVersions}, a character vector containing the names of the available versions of the \code{name} reference.
#'
#' For \code{fetchLatestVersion}, a string containing the name of the latest version.
#'
#' @author Aaron Lun
#'
#' @examples
#' listReferences()
#' listVersions("immgen")
#' fetchLatestVersion("immgen")
#'
#' @export
#' @importFrom gypsum listAssets
listReferences <- function() {
listAssets("celldex")
}

#' @export
#' @rdname listReferences
listVersions <- function(name) {
gypsum::listVersions("celldex", name)
}

#' @export
#' @rdname listReferences
#' @importFrom gypsum fetchLatest
fetchLatestVersion <- function(name) {
fetchLatest("celldex", name)
}
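A hedged sketch chaining these listing helpers with fetchReference(), assuming the "immgen" asset from the examples above is available on the backend:

library(celldex)
refs <- listReferences()
stopifnot("immgen" %in% refs)
v <- fetchLatestVersion("immgen")   # newest version, no hard-coding
ref <- fetchReference("immgen", v)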
18 changes: 14 additions & 4 deletions R/saveReference.R
@@ -1,6 +1,6 @@
#' Save a dataset to disk
#' Save a reference dataset
#'
#' Save a dataset to disk, usually in preparation for upload via \code{\link{uploadDirectory}}.
#' Save a reference dataset to disk, usually in preparation for upload via \code{\link{uploadDirectory}}.
#'
#' @param x Matrix of log-normalized expression values.
#' This may be sparse or dense, but should be non-integer and have no missing values.
@@ -9,14 +9,21 @@
#' Each row of \code{labels} corresponds to a column of \code{x} and contains the label(s) for that column.
#' Each column of \code{labels} represents a different label type;
#' typically, the column name has a \code{label.} prefix to distinguish between, e.g., \code{label.fine}, \code{label.broad} and so on.
#' At least one column should be present.
#' @param path String containing the path to a new directory in which save the dataset.
#' @param path String containing the path to a directory in which to save \code{x}.
#' @param metadata Named list containing metadata for this dataset,
#' @param metadata Named list containing metadata for this reference dataset,
#' see the schema returned by \code{\link{fetchMetadataSchema}()}.
#' Note that the \code{applications.takane} property will be automatically added by this function and does not have to be supplied.
#'
#' @return \code{x} and \code{labels} are used to create a \linkS4class{SummarizedExperiment} that is saved into \code{path}.
#' \code{NULL} is invisibly returned.
#'
#' @details
#' The SummarizedExperiment saved to \code{path} is guaranteed to have the \code{"logcounts"} assay and at least one column in \code{labels}.
#' This mirrors the expectation for reference datasets obtained via \code{\link{fetchReference}}.
#'
#' @author Aaron Lun
#' @examples
#' # Mocking up some data to be saved.
@@ -73,6 +80,9 @@ saveReference <- function(x, labels, path, metadata) {
validateMetadata(metadata, schema) # First validation for user-supplied content.

# Check that all labels are categorical.
if (ncol(labels) == 0L) {
stop("'labels' should contain at least one column")
}
for (cn in colnames(labels)) {
if (!is.character(labels[[cn]])) {
stop("all columns of 'labels' should be character vectors")
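A hedged end-to-end sketch of saveReference() under the new checks. The metadata fields below are assumptions modelled on the Bioconductor schema; consult fetchMetadataSchema() for the authoritative required set:

library(celldex)
library(S4Vectors)
# Log-normalized values plus at least one character label column,
# satisfying the validation added above.
x <- matrix(rnorm(1000), ncol=10)
rownames(x) <- sprintf("GENE_%i", seq_len(nrow(x)))
labels <- DataFrame(label.fine=sample(c("B", "T", "NK"), ncol(x), replace=TRUE))

# Hypothetical metadata; real entries must validate against the schema.
meta <- list(
    title="My reference",
    description="A mock reference for illustration.",
    taxonomy_id=list("10090"),
    genome=list("GRCm38"),
    sources=list(list(provider="GEO", id="GSE00000")),
    maintainer_name="Your Name",
    maintainer_email="your.name@example.com"
)

saveReference(x, labels, file.path(tempdir(), "my_ref"), meta)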
61 changes: 61 additions & 0 deletions R/searchReferences.R
@@ -0,0 +1,61 @@
#' Search reference metadata
#'
#' Search for reference datasets of interest based on matching text in the associated metadata.
#'
#' @param query String or a \code{gypsum.search.object}, see Examples.
#' @inheritParams surveyReferences
#'
#' @return
#' A \linkS4class{DataFrame} where each row corresponds to a dataset, containing various columns of metadata.
#' Some columns may be lists to capture 1:many mappings.
#'
#' @details
#' The returned DataFrame contains the usual suspects like the title and description for each dataset,
#' the number of samples and types of labels, the organisms and genome builds involved, and so on.
#' More details can be found in the Bioconductor metadata schema at \url{https://github.com/ArtifactDB/bioconductor-metadata-index}.
#'
#' @author Aaron Lun
#'
#' @examples
#' searchReferences(defineTextQuery("immun%", partial=TRUE))[,c("name", "title")]
#' searchReferences(defineTextQuery("10090", field="taxonomy_id"))[,c("name", "title")]
#' searchReferences(
#' defineTextQuery("10090", field="taxonomy_id") &
#' defineTextQuery("immun%", partial=TRUE)
#' )[,c("name", "title")]
#'
#' @seealso
#' \code{\link{surveyReferences}}, to easily obtain a listing of all available datasets.
#' @export
#' @importFrom S4Vectors DataFrame
#' @importFrom gypsum cacheDirectory fetchMetadataDatabase searchMetadataTextFilter
#' @importFrom DBI dbConnect dbDisconnect dbGetQuery
#' @importFrom RSQLite SQLite
searchReferences <- function(query, cache=cacheDirectory(), overwrite=FALSE, latest=TRUE) {
filter <- searchMetadataTextFilter(query)

bpath <- fetchMetadataDatabase(cache=cache, overwrite=overwrite)
con <- dbConnect(SQLite(), bpath)
on.exit(dbDisconnect(con))

stmt <- "SELECT json_extract(metadata, '$') AS meta, versions.asset AS asset, versions.version AS version, path";
if (!latest) {
stmt <- paste0(stmt, ", versions.latest AS latest")
}
stmt <- paste0(stmt, " FROM paths LEFT JOIN versions ON paths.vid = versions.vid WHERE versions.project = 'scRNAseq'")
if (latest) {
stmt <- paste0(stmt, " AND versions.latest = 1")
}
if (!is.null(filter)) {
stmt <- paste0(stmt, " AND ", filter$where)
everything <- dbGetQuery(con, stmt, params=filter$parameters)
} else {
everything <- dbGetQuery(con, stmt)
}

sanitize_query_to_output(everything, latest)
}

#' @export
gypsum::defineTextQuery
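Beyond the roxygen examples, two hedged usage notes: a plain string is accepted as the query, and latest=FALSE (inherited from surveyReferences) also returns superseded versions. This assumes "immune" actually occurs in some reference's metadata text:

library(celldex)
# Free-text search across all metadata fields.
hits <- searchReferences("immune")
hits[, c("name", "version", "title")]

# Include older uploads too; the extra 'latest' column marks the
# current version of each reference.
all.hits <- searchReferences("immune", latest=FALSE)
all.hits[, c("name", "version", "latest")]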
112 changes: 112 additions & 0 deletions R/surveyReferences.R
@@ -0,0 +1,112 @@
#' Survey reference metadata
#'
#' Survey the metadata for all reference datasets available from the \pkg{celldex} package.
#'
#' @param cache,overwrite Arguments to pass to \code{\link{fetchMetadataDatabase}}.
#' @param latest Whether to only consider the latest version of each dataset.
#'
#' @return
#' A \linkS4class{DataFrame} where each row corresponds to a dataset, containing various columns of metadata.
#' Some columns may be lists to capture 1:many mappings.
#'
#' @details
#' The returned DataFrame contains the usual suspects like the title and description for each dataset,
#' the number of samples and types of labels, the organisms and genome builds involved, and so on.
#' More details can be found in the Bioconductor metadata schema at \url{https://github.com/ArtifactDB/bioconductor-metadata-index}.
#'
#' @author Aaron Lun
#'
#' @examples
#' surveyReferences()
#'
#' @seealso
#' \code{\link{searchReferences}}, to search on the metadata for specific datasets.
#'
#' @export
#' @importFrom S4Vectors DataFrame
#' @importFrom gypsum cacheDirectory fetchMetadataDatabase
#' @importFrom DBI dbConnect dbDisconnect dbGetQuery
#' @importFrom RSQLite SQLite
surveyReferences <- function(cache=cacheDirectory(), overwrite=FALSE, latest=TRUE) {
bpath <- fetchMetadataDatabase(cache=cache, overwrite=overwrite)
con <- dbConnect(SQLite(), bpath)
on.exit(dbDisconnect(con))

stmt <- "SELECT json_extract(metadata, '$') AS meta, versions.asset AS asset, versions.version AS version, path";
if (!latest) {
stmt <- paste0(stmt, ", versions.latest AS latest")
}
stmt <- paste0(stmt, " FROM paths LEFT JOIN versions ON paths.vid = versions.vid WHERE versions.project = 'celldex'")
if (latest) {
stmt <- paste0(stmt, " AND versions.latest = 1")
}
everything <- dbGetQuery(con, stmt)

sanitize_query_to_output(everything, latest)
}
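A hedged sketch of slicing the survey output; since taxonomy_id is a list column (see sanitize_query_to_output below), membership is tested element-wise:

library(celldex)
df <- surveyReferences()
# Keep only mouse references (taxonomy ID 10090).
is.mouse <- vapply(df$taxonomy_id, function(t) "10090" %in% t, logical(1))
df[is.mouse, c("name", "title", "samples")]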

#' @importFrom S4Vectors DataFrame
#' @importFrom jsonlite fromJSON
sanitize_query_to_output <- function(results, latest, meta.name="meta") {
path <- results$path
has.slash <- grepl("/", path)
path[!has.slash] <- NA_character_
path[has.slash] <- sub("/[^/]+$", "", path[has.slash])
df <- DataFrame(name = results$asset, version = results$version, path = path)
if (!latest) {
df$latest <- results$latest == 1
}

all_meta <- lapply(results[[meta.name]], fromJSON, simplifyVector=FALSE)
df$title <- extract_atomic_from_json(all_meta, function(x) x$title, "character")
df$description <- extract_atomic_from_json(all_meta, function(x) x$description, "character")
df$taxonomy_id <- extract_charlist_from_json(all_meta, function(x) x$taxonomy_id)
df$genome <- extract_charlist_from_json(all_meta, function(x) x$genome)

df$samples <- extract_atomic_from_json(all_meta, function(x) x$applications$takane$summarized_experiment$columns, "integer")
df$labels <- extract_charlist_from_json(all_meta, function(x) x$applications$takane$summarized_experiment$column_annotations)

df$bioconductor_version <- extract_atomic_from_json(all_meta, function(x) x$bioconductor_version, "character")
df$maintainer_name <- extract_atomic_from_json(all_meta, function(x) x$maintainer_name, "character")
df$maintainer_email <- extract_atomic_from_json(all_meta, function(x) x$maintainer_email, "character")

sources <- vector("list", length(all_meta))
for (i in seq_along(all_meta)) {
cursources <- all_meta[[i]]$sources
if (is.null(cursources)) {
sources[[i]] <- DataFrame(provider=character(0), id=character(0), version=character(0))
} else {
sources[[i]] <- DataFrame(
provider = extract_atomic_from_json(cursources, function(x) x$provider, "character"),
id = extract_atomic_from_json(cursources, function(x) x$id, "character"),
version = extract_atomic_from_json(cursources, function(x) x$version, "character")
)
}
}
df$sources <- as(sources, "CompressedList")

df
}

extract_atomic_from_json <- function(metadata, extract, type) {
vapply(metadata, function(y) {
x <- extract(y)
if (is.null(x)) {
as(NA, type)
} else {
x
}
}, vector(type, 1))
}

extract_charlist_from_json <- function(metadata, extract) {
output <- lapply(metadata, function(y) {
x <- extract(y)
if (is.null(x)) {
character(0)
} else {
x
}
})
as(output, "CompressedList")
}
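A tiny mock (not part of the commit) showing how the two internal helpers above handle missing fields:

# Hypothetical metadata entries, for illustration only.
mock <- list(
    list(title="A", taxonomy_id="10090"),
    list(taxonomy_id=c("9606", "10090")) # no 'title' field
)
extract_atomic_from_json(mock, function(x) x$title, "character")
# expected: c("A", NA)
extract_charlist_from_json(mock, function(x) x$taxonomy_id)
# expected: a CharacterList holding "10090" and c("9606", "10090")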
10 changes: 5 additions & 5 deletions man/fetchReference.Rd
