Skip to content

Commit

Permalink
Update GPT annotations, include those without genes
Browse files Browse the repository at this point in the history
  • Loading branch information
bschilder committed Mar 11, 2024
1 parent 93428f3 commit 3a3d953
Show file tree
Hide file tree
Showing 10 changed files with 130 additions and 80 deletions.
17 changes: 11 additions & 6 deletions R/add_evidence.R
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@
#' }
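#' @param evidence_score_threshold The minimum evidence score that each
#' gene-phenotype association must reach to be kept. The score column that is
#' compared against this threshold is set by
#' \code{evidence_score_threshold_metric}.
#' If \code{NULL}, no filtering is applied.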
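#' @param evidence_score_threshold_metric Name of the column in \code{phenos}
#' that is compared against \code{evidence_score_threshold} when filtering.
#' If this column is not present in \code{phenos} but an \code{evidence_score}
#' column is, \code{evidence_score} is used instead.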
#' @param default_score Default evidence score to
#' apply to gene-disease associations that are present in the HPO annotations
#' but don't have evidence scores in the GenCC annotations.
Expand All @@ -37,16 +39,17 @@
#' phenos2 <- add_evidence(phenos = phenos)
add_evidence <- function(phenos,
evidence_score_threshold = NULL,
evidence_score_threshold_metric="evidence_score_sum",
all.x = TRUE,
allow.cartesian = FALSE,
agg_by = c("disease_id",
"gene_symbol"),
default_score = 1,
...){
evidence_score <- evidence_score_mean <- NULL;
evidence_score <- NULL;

if(!all(c("evidence_score_mean") %in% names(phenos))){
messager("Annotating gene-disease associations with Evidence score")
messager("Annotating gene-disease associations with Evidence Score")
phenos <- add_disease(phenos = phenos,
all.x = all.x,
allow.cartesian = allow.cartesian)
Expand All @@ -63,11 +66,13 @@ add_evidence <- function(phenos,
}
#### Filter ####
if(!is.null(evidence_score_threshold)){
if("evidence_score_mean" %in% names(phenos)){
phenos <- phenos[evidence_score_mean>=evidence_score_threshold,]
} else if("evidence_score" %in% names(phenos)){
phenos <- phenos[evidence_score>=evidence_score_threshold,]
if(!evidence_score_threshold_metric %in% names(phenos) &&
"evidence_score" %in% names(phenos)){
messager(evidence_score_threshold_metric,"not found in phenos.",
"Using evidence_score for filtering step instead.")
evidence_score_threshold_metric <- "evidence_score"
}
phenos <- phenos[get(evidence_score_threshold_metric)>=evidence_score_threshold,]
}
#### Set default score ####
if(!is.null(default_score)){
Expand Down
6 changes: 4 additions & 2 deletions R/add_hpo_id.R
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,18 @@
#' Add HPO ID column to dataframe
#'
#' Adds the HPO term ID column "hpo_id".
#' @inheritParams KGExplorer::map_ontology_terms
#' @export
#' @examples
#' phenotype_to_genes <- load_phenotype_to_genes()
#' phenos <- unique(phenotype_to_genes[,c("hpo_id","hpo_name")])
#' phenos2 <- add_hpo_id(phenos=phenos)
add_hpo_id <- function(phenos,
hpo = get_hpo()) {
hpo = get_hpo(),
ignore_case = TRUE) {
if(!"hpo_id" %in% names(phenos)){
messager("Adding HPO IDs.")
phenos$hpo_id <- map_phenotypes(hpo = hpo,
ignore_case = ignore_case,
terms = phenos$hpo_name,
to = "id")
}
Expand Down
2 changes: 1 addition & 1 deletion R/get_hpo.R
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
#' Note that the maximum ontology level depth in the 2016 version was 14,
#' whereas in the 2023 version the maximum ontology level depth is 16
#' (due to an expansion of the HPO).
#' @inheritParams KGExplorer::add_ancestors
#' @inheritParams KGExplorer::get_ontology
#' @inheritDotParams KGExplorer::get_ontology
#' @returns \link[simona]{ontology_DAG} object.
#'
Expand Down
44 changes: 26 additions & 18 deletions R/gpt_annot_read.R
Original file line number Diff line number Diff line change
Expand Up @@ -2,38 +2,46 @@
#'
#' Read in phenotype annotations generated by GPT and
#' do some initial preprocessing (e.g. adding HPO IDs).
#' @param path Path to annotations CSV file.
#' If \code{NULL}, will pull data from GitHub Releases instead.
#' @inheritParams main
#' @param save_path Path to annotations CSV file.
#' If the file does not exist, the data will be downloaded from GitHub.
#' @param force_new If \code{TRUE}, the data will be downloaded from GitHub
#' even if it already exists locally.
#' @param verbose Print messages.
#' @source code {
#' piggyback::pb_upload(file = "~/Downloads/gpt4_hpo_annotations.csv",
#' repo = "neurogenomics/HPOExplorer",
#' overwrite = TRUE,
#' tag = "latest")
#' }
#' @param include_nogenes Include phenotypes with no associated genes.
#' @returns data.table of phenotype annotations
#'
#' @export
#' @examples
#' annot <- gpt_annot_read()
gpt_annot_read <- function(path = NULL,
gpt_annot_read <- function(save_path=file.path(
KGExplorer::cache_dir(package = "HPOExplorer"),
"gpt4_hpo_annotations.csv"
),
force_new=FALSE,
hpo=get_hpo(),
include_nogenes=TRUE,
verbose=TRUE){
pheno_count <- hpo_name <- hpo_id <- NULL;
pheno_count <- hpo_name <- hpo_id <- phenotype <- NULL;

if(is.null(path)){
# path <- paste0(
# "https://github.com/neurogenomics/RareDiseasePrioritisation/raw/master/",
# "gpt_annotations/gpt4_hpo_annotations.csv"
# )
path <- get_data("gpt4_hpo_annotations.csv")
if(!file.exists(save_path) || isTRUE(force_new)){
path <- paste0(
"https://github.com/neurogenomics/RareDiseasePrioritisation/raw/master/",
"gpt_annotations/gpt4_hpo_annotations.csv"
)
utils::download.file(path, save_path)
# path <- get_data("gpt4_hpo_annotations.csv")
}
d <- data.table::fread(path, header = TRUE)
d <- data.table::fread(save_path, header = TRUE)
d <- d[!is.na(phenotype)]
data.table::setnames(d,"phenotype","hpo_name")
d <- add_hpo_id(d, hpo = hpo)
#### Check phenotype names ####
annot <- load_phenotype_to_genes(verbose = verbose)
d <- merge(d,
unique(annot[,c("hpo_id","hpo_name")]),
by="hpo_name")
all.x = TRUE,
by=c("hpo_name","hpo_id"))
d <- data.frame(d)
d[d==""] <- NA
d <- data.table::data.table(d)
Expand Down
9 changes: 7 additions & 2 deletions R/map_phenotypes.R
Original file line number Diff line number Diff line change
@@ -1,14 +1,17 @@
#' @describeIn map_ map_
#' Harmonise phenotypes
#'
#' Harmonise a mixed vector of phenotype names (e.g. "Focal motor seizure")
#' and HPO IDs (e.g. c("HP:0000002","HP:0000003")).
#' @inheritParams map_
#' @inheritParams KGExplorer::map_ontology_terms
#' @returns Character vector
#'
#' @export
#' @import KGExplorer
#' @examples
#' terms <- c("Focal motor seizure","HP:0000002","HP:0000003")
#' terms <- c("Focal motor seizure",
#' "Focal MotoR SEIzure",
#' "HP:0000002","HP:0000003")
#' #### As phenotype names ####
#' term_names <- map_phenotypes(terms=terms)
#' #### As HPO IDs ####
Expand All @@ -17,10 +20,12 @@ map_phenotypes <- function(terms,
hpo = get_hpo(),
to=c("name","id"),
keep_order = TRUE,
ignore_case = TRUE,
invert = FALSE){
KGExplorer::map_ontology_terms(terms = terms,
ont = hpo,
to = to,
keep_order = keep_order,
ignore_case = ignore_case,
invert = invert)
}
4 changes: 2 additions & 2 deletions R/plot_evidence.R
Original file line number Diff line number Diff line change
Expand Up @@ -81,8 +81,8 @@ plot_evidence <- function(metric="evidence_score_sum",
pw <- patchwork::wrap_plots(h1,h2,h3,h4,h5, ncol = 1)
if(isTRUE(show_plot)) methods::show(pw)

KGExplorer::plot_save(plt = pw,
path = save_path,
KGExplorer::plot_save(plt=pw,
save_path=save_path,
height=height,
width=width)
return(
Expand Down
11 changes: 8 additions & 3 deletions man/add_.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

29 changes: 18 additions & 11 deletions man/gpt_annot_read.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

36 changes: 1 addition & 35 deletions man/map_.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

52 changes: 52 additions & 0 deletions man/map_phenotypes.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit 3a3d953

Please sign in to comment.