umap; pdf to svg

BIONF · Jul 9, 2024 · 22e8bdd · 22e8bdd
1 parent 6413ca5
commit 22e8bdd
Show file tree

Hide file tree

Showing 15 changed files with 838 additions and 58 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: PhyloProfile
-Version: 1.19.4
-Date: 2024-06-25
+Version: 1.19.5
+Date: 2024-07-09
 Title: PhyloProfile
 Authors@R: c(
     person("Vinh", "Tran", role = c("aut", "cre"), email = "tran@bio.uni-frankfurt.de", comment=c(ORCID="0000-0001-6772-7595")),
@@ -17,7 +17,7 @@ biocViews: Software, Visualization, DataRepresentation, MultipleComparison, Func
 Imports:
     ape, bioDist, BiocStyle, Biostrings, colourpicker, data.table, dplyr, DT,
     energy, ExperimentHub, extrafont, ggplot2, gridExtra, pbapply, RColorBrewer,
-    RCurl, shiny, shinyBS, shinycssloaders, shinyFiles, shinyjs, stringr,
+    RCurl, shiny, shinyBS, shinycssloaders, shinyFiles, shinyjs, stringr, umap,
     xml2, zoo, yaml
 RoxygenNote: 7.3.1
 Suggests: knitr, rmarkdown, testthat, OmaDB

diff --git a/NAMESPACE b/NAMESPACE
@@ -14,6 +14,7 @@ export(createGeneAgePlot)
 export(createLongMatrix)
 export(createPercentageDistributionData)
 export(createProfileFromOma)
+export(createUmapPlotData)
 export(createUnrootedTree)
 export(createVarDistPlot)
 export(createVariableDistributionData)
@@ -61,13 +62,16 @@ export(linearizeArchitecture)
 export(mainTaxonomyRank)
 export(parseDomainInput)
 export(parseInfoProfile)
+export(plotUmap)
+export(prepareUmapData)
 export(processNcbiTaxonomy)
 export(processOrthoID)
 export(reduceProfile)
 export(runPhyloProfile)
 export(sortInputTaxa)
 export(sortTaxaFromTree)
 export(taxonomyTableCreator)
+export(umapClustering)
 export(varDistTaxPlot)
 export(wideToLong)
 export(xmlParser)
@@ -76,6 +80,7 @@ import(ExperimentHub)
 import(data.table, except = c(first, last, between))
 import(dplyr)
 import(ggplot2)
+import(umap)
 importFrom(Biostrings,readAAStringSet)
 importFrom(DT,dataTableOutput)
 importFrom(DT,renderDataTable)
@@ -132,6 +137,7 @@ importFrom(stringr,str_count)
 importFrom(stringr,str_split_fixed)
 importFrom(stringr,str_wrap)
 importFrom(utils,head)
+importFrom(utils,tail)
 importFrom(xml2,read_xml)
 importFrom(xml2,xml_attr)
 importFrom(xml2,xml_find_all)

diff --git a/R/umapClustering.R b/R/umapClustering.R
@@ -0,0 +1,207 @@
+#' Prepare data for UMAP
+#' @export
+#' @usage prepareUmapData(longDf = NULL, taxonRank = NULL, type = "taxa", 
+#'     taxDB = NULL, cutoff = 0)
+#' @param longDf input phyloprofile file in long format
+#' @param taxonRank taxonomy rank for labels (e.g. "phylum")
+#' @param type type of clustering, either "taxa" (default) or "genes"
+#' @param taxDB path to taxonomy database
+#' @param cutoff cutoff to filter data values. Default: 0
+#' @return A dataframe in wide format
+#' @author Vinh Tran tran@bio.uni-frankfurt.de
+#' @examples
+#' rawInput <- system.file(
+#'    "extdata", "test.main.long", package = "PhyloProfile", mustWork = TRUE
+#' )
+#' longDf <- createLongMatrix(rawInput)
+#' prepareUmapData(longDf, "phylum")
+
+prepareUmapData <- function(
+    longDf = NULL, taxonRank = NULL, type = "taxa", taxDB = NULL, cutoff = 0
+) {
+    if (is.null(longDf)) stop("Input data cannot be NULL")
+    if (is.null(taxonRank)) stop("Taxon rank must be specified!")
+    FAS_F <- FAS_B <- geneID <- ncbiID <- abbrName <- fullName <- NULL
+    # filter and subset input df
+    longDfSub <- longDf %>% filter(FAS_F >= cutoff & FAS_B >= cutoff) %>% 
+        select(geneID, ncbiID, FAS_F)
+    # get taxon names for input taxa based on a selected supertaxon rank
+    taxMatrix <- getTaxonomyMatrix(taxDB)
+    nameList <- getNameList(taxDB)
+    superTaxonDf <- taxMatrix %>% 
+        filter(abbrName %in% longDfSub$ncbiID) %>% 
+        select(abbrName, one_of(taxonRank))
+    colnames(superTaxonDf) <- c("ncbiID", "superID")
+    superTaxonDf <- dplyr::left_join(
+        superTaxonDf, nameList, by = c("superID" = "ncbiID")
+    )
+    # transform to wide format, add taxon names and ortholog count
+    if (type == "taxa") {
+        wideDf <- data.table::dcast(
+            setDT(longDfSub), ncbiID ~ geneID, value.var = c("FAS_F"), 
+            fun.aggregate = max, fill = -1
+        )
+        wideDf <- dplyr::left_join(
+            wideDf, superTaxonDf %>% select(ncbiID, Label = fullName), 
+            by = "ncbiID"
+        )
+        # add count (how many seed genes each taxon has, excluding co-orthologs)
+        seedWithTax <- longDfSub %>% select(geneID, ncbiID)
+        seedWithTax <- seedWithTax[!duplicated(seedWithTax),]
+        countDf <- data.frame(table(seedWithTax$ncbiID))
+        wideDf <- dplyr::left_join(
+            data.frame(wideDf), countDf, by = c("ncbiID" = "Var1")
+        )
+    } else {
+        wideDf <- data.table::dcast(
+            setDT(longDfSub), geneID ~ ncbiID, value.var = c("FAS_F"), 
+            fun.aggregate = max, fill = -1
+        )
+        wideDf$Label <- wideDf$geneID
+        # add count (how many taxa has orthologs for each seed gene)
+        seedWithTax <- longDfSub %>% select(geneID, ncbiID)
+        seedWithTax <- seedWithTax[!duplicated(seedWithTax),]
+        countDf <- data.frame(table(longDfSub$geneID))
+        wideDf <- dplyr::left_join(
+            data.frame(wideDf), countDf, by = c("geneID" = "Var1")
+        )
+    }
+    return(wideDf)
+}
+
+#' Perform UMAP clustering
+#' @export
+#' @usage umapClustering(data4umap = NULL, by = "taxa", type = "binary", 
+#'     randomSeed = 123)
+#' @param data4umap data for UMAP clustering (output from prepareUmapData)
+#' @param by cluster data by "taxa" (default) or "genes"
+#' @param type type of data, either "binary" (default) or "non-binary"
+#' @param randomSeed random seed. Default: 123
+#' @import umap
+#' @return A list contain UMAP cluster objects
+#' @author Vinh Tran tran@bio.uni-frankfurt.de
+#' @seealso \code{\link{prepareUmapData}}
+#' @examples
+#' rawInput <- system.file(
+#'    "extdata", "test.main.long", package = "PhyloProfile", mustWork = TRUE
+#' )
+#' longDf <- createLongMatrix(rawInput)
+#' data4umap <- prepareUmapData(longDf, "phylum")
+#' umapClustering(data4umap)
+
+umapClustering <- function(
+        data4umap = NULL, by = "taxa", type = "binary", randomSeed = 123
+) {
+    if (is.null(data4umap)) stop("Input data cannot be NULL!")
+    ncbiID <- Label <- Freq <- geneID <- NULL
+    if (by == "taxa") {
+        subsetDt <- subset(data4umap, select = -c(ncbiID, Label, Freq))
+
+    } else {
+        subsetDt <- subset(data4umap, select = -c(geneID, Label, Freq))
+    }
+    if (type == "binary") {
+        subsetDt[subsetDt >= 0] <- 1
+        subsetDt[subsetDt < 0] <- 0
+    }
+    df.umap <- umap(subsetDt, random_state = randomSeed, preserve.seed = TRUE)
+    return(df.umap)
+}
+
+
+#' Create UMAP cluster plot
+#' @export
+#' @usage createUmapPlotData(umapData = NULL, data4umap = NULL, labelNr = 5, 
+#'     excludeTaxa = "None")
+#' @param umapData data contains UMAP cluster (output from umapClustering())
+#' @param data4umap data for UMAP clustering (output from prepareUmapData())
+#' @param labelNr maximal number of labels. Default: 5
+#' @param excludeTaxa hide taxa from plot. Default: "None"
+#' @importFrom utils tail
+#' @return A plot as ggplot object
+#' @author Vinh Tran tran@bio.uni-frankfurt.de
+#' @seealso \code{\link{prepareUmapData}}, \code{\link{umapClustering}}
+#' @examples
+#' rawInput <- system.file(
+#'    "extdata", "test.main.long", package = "PhyloProfile", mustWork = TRUE
+#' )
+#' longDf <- createLongMatrix(rawInput)
+#' data4umap <- prepareUmapData(longDf, "phylum")
+#' umapData <- umapClustering(data4umap)
+#' createUmapPlotData(umapData, data4umap)
+
+createUmapPlotData <- function(
+    umapData = NULL, data4umap = NULL, labelNr = 5, excludeTaxa = "None"
+) {
+    if (is.null(umapData) | is.null(data4umap)) 
+        stop("Input data cannot be NULL!")
+    if (length(umapData) == 0 | length(data4umap) == 0) 
+        stop("Input data cannot be EMPTY!")
+
+    Label <- Freq <- NULL
+
+    data4umap$X <- umapData$layout[,1]
+    data4umap$Y <- umapData$layout[,2]
+    # join less freq items into "other"
+    maxFreqDf <- data.frame(
+        data4umap %>% group_by(Label) %>% summarise(max = max(Freq))
+    )
+    minFreq <- tail(sort(unique(maxFreqDf$max)),labelNr)[1]
+    keepLabel <- unique(data4umap$Label[data4umap$Freq >= minFreq])
+    data4umap$Label[!(data4umap$Label %in% keepLabel)] <- "[Other]"
+    # exclude taxa
+    if (length(excludeTaxa) == 0) excludeTaxa <- c("None")
+    if (length(excludeTaxa) > 0 & excludeTaxa[1] != "None") {
+        data4umap$X[data4umap$Label %in% excludeTaxa] <- NA
+        data4umap$Y[data4umap$Label %in% excludeTaxa] <- NA
+    }
+    return(data4umap)
+}
+
+
+#' Create UMAP cluster plot
+#' @export
+#' @usage plotUmap(plotDf = NULL, legendPos = "right", colorPalette = "Set2", 
+#'     transparent = 0, textSize = 12)
+#' @param plotDf data for UMAP plot 
+#' @param legendPos position of legend. Default: "right"
+#' @param colorPalette color palette. Default: "Set2"
+#' @param transparent transparent level (from 0 to 1). Default: 0
+#' @param textSize size of axis and legend text. Default: 12
+#' @return A plot as ggplot object
+#' @author Vinh Tran tran@bio.uni-frankfurt.de
+#' @seealso \code{\link{prepareUmapData}}, \code{\link{umapClustering}},
+#' \code{\link{createUmapPlotData}}
+#' @examples
+#' rawInput <- system.file(
+#'    "extdata", "test.main.long", package = "PhyloProfile", mustWork = TRUE
+#' )
+#' longDf <- createLongMatrix(rawInput)
+#' umapData <- prepareUmapData(longDf, "phylum")
+#' data.umap <- umapClustering(umapData)
+#' plotDf <- createUmapPlotData(data.umap, umapData)
+#' plotUmap(plotDf)
+
+plotUmap <- function(
+        plotDf = NULL, legendPos = "right", colorPalette = "Set2", 
+        transparent = 0, textSize = 12
+) {
+    if (is.null(plotDf)) stop("Input data cannot be NULL!")
+    X <- Y <- Label <- Freq <- NULL
+    # generate plot
+    plot <- ggplot(plotDf, aes(x = X, y = Y, color = Label)) +
+        geom_point(aes(size = Freq), alpha = 1 - transparent) +
+        geom_rug(alpha = 1) +
+        theme_minimal() +
+        labs(x = "", y = "") +
+        scale_color_brewer(palette = colorPalette) +
+        guides(color = guide_legend(override.aes = list(alpha = 1))) +
+        theme(
+            legend.position = legendPos,
+            legend.text = element_text(size = textSize),
+            legend.title = element_text(size = textSize),
+            axis.text = element_text(size = textSize), 
+            axis.title = element_text(size = textSize),
+        )
+    return(plot)
+}
diff --git a/inst/NEWS.Rd b/inst/NEWS.Rd
@@ -4,6 +4,7 @@
 
 \section{Version 1.20.0}{
   \itemize{
+    \item added UMAP clustering
     \item improved domain plot #110
     \item option to change font
     \item option to show gene names

diff --git a/inst/PhyloProfile/R/createDetailedPlot.R b/inst/PhyloProfile/R/createDetailedPlot.R
@@ -31,12 +31,12 @@ createDetailedPlot <- function(
         plotDf <- data()
         if (
             checkBionfFormat(
-                plotDf$orthoID[1], plotDf$geneID[1], 
+                plotDf$orthoID[1], plotDf$geneID[1],
                 gsub('ncbi','',plotDf$abbrName[1])
             )
         ) {
             plotDf <- within(
-                plotDf, 
+                plotDf,
                 orthoMod <- data.frame(
                     do.call(
                         'rbind', strsplit(as.character(orthoID),'|', fixed=TRUE)
@@ -47,7 +47,7 @@ createDetailedPlot <- function(
         }
         return(plotDf)
     })
-    
+
     # render detailed plot -----------------------------------------------------
     output$detailPlot <- renderPlot({
         detailPlot(plotData(), detailedText(), var1ID(), var2ID(), font())
@@ -67,7 +67,7 @@ createDetailedPlot <- function(
 
     output$downloadDetailed <- downloadHandler(
         filename = function() {
-            c("detailedPlot.pdf")
+            c("detailedPlot.svg")
         },
         content = function(file) {
             g <- detailPlot(data(), detailedText(), var1ID(), var2ID(), font())
@@ -78,7 +78,7 @@ createDetailedPlot <- function(
                 height = detailedHeight() * 0.056458333,
                 units = "cm",
                 dpi = 300,
-                device = "pdf",
+                device = "svg",
                 limitsize = FALSE
             )
         }
@@ -88,7 +88,7 @@ createDetailedPlot <- function(
     pointInfoDetail <- reactive({
         selDf <- data()
         selDf$orthoID <- as.character(selDf$orthoID)
-        
+
         # if only one ortholog, get directly from data()
         if (nrow(selDf) == 1) {
             seedID <- as.character(selDf$geneID)
@@ -107,22 +107,22 @@ createDetailedPlot <- function(
             orthoID <- as.character(selDf$orthoID[corX])
             taxID <- as.character(selDf$abbrName[corX])
         }
-        
+
         # get var1, var2
         var1 <- as.list(selDf$var1[selDf$orthoID == orthoID])
         var1 <- as.character(var1[!is.na(var1)])
         var2 <- as.list(selDf$var2[selDf$orthoID == orthoID])
         var2 <- as.character(var2[!is.na(var2)])
         if (length(var2) == 0) var2 = "NA"
-        
+
         ncbiID <- selDf[selDf$orthoID == orthoID, ]$abbrName
         ncbiID <- as.character(ncbiID[!is.na(ncbiID)][1])
 
         # return info
-        if (is.na(orthoID)) 
+        if (is.na(orthoID))
             return(NULL)
         else {
-            if (orthoID != "NA") 
+            if (orthoID != "NA")
                 return(c(seedID, orthoID, var1, var2, ncbiID))
         }
     })

diff --git a/inst/PhyloProfile/R/createProfilePlot.R b/inst/PhyloProfile/R/createProfilePlot.R
@@ -43,7 +43,6 @@ createProfilePlot <- function(input, output, session,
         if (is.null(data())) stop("Profile data is NULL!")
         if (typeProfile() == "customizedProfile") {
             if (is.null(inTaxa()) | is.null(inSeq())) return()
-
             dataHeat <- dataCustomizedPlot(data(), inTaxa(), inSeq())
             if (applyCluster() == TRUE) {
                 dataHeat <- dataCustomizedPlot(
@@ -124,7 +123,7 @@ createProfilePlot <- function(input, output, session,
 
     output$profileDownload <- downloadHandler(
         filename = function() {
-            c("profile.pdf")
+            c("profile.svg")
         },
         content = function(file) {
             ggsave(
@@ -137,7 +136,7 @@ createProfilePlot <- function(input, output, session,
                 ),
                 width = parameters()$width * 0.056458333,
                 height = parameters()$height * 0.056458333,
-                units = "cm", dpi = 300, device = "pdf", limitsize = FALSE
+                units = "cm", dpi = 300, device = "svg", limitsize = FALSE
             )
         }
     )

diff --git a/inst/PhyloProfile/R/downloadPlotSettings.R b/inst/PhyloProfile/R/downloadPlotSettings.R
@@ -169,7 +169,7 @@ writePlottingScript <- function(settingsFile) {
         "p <- heatmapPlotting(plotDf, plotParameter)",
         "ggsave(args[3], plot = p, width = settings$width * 0.056458333,
             height = settings$height * 0.056458333,
-            units = 'cm', dpi = 300, device = 'pdf', limitsize = FALSE)",
+            units = 'cm', dpi = 300, device = 'svg', limitsize = FALSE)",
         "print('DONE! Your plot is saved in', args[3])"
     )
     return(out)