Skip to content

Commit

Permalink
umap; pdf to svg
Browse files Browse the repository at this point in the history
  • Loading branch information
trvinh committed Jul 9, 2024
1 parent 6413ca5 commit 22e8bdd
Show file tree
Hide file tree
Showing 15 changed files with 838 additions and 58 deletions.
6 changes: 3 additions & 3 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Package: PhyloProfile
Version: 1.19.4
Date: 2024-06-25
Version: 1.19.5
Date: 2024-07-09
Title: PhyloProfile
Authors@R: c(
person("Vinh", "Tran", role = c("aut", "cre"), email = "tran@bio.uni-frankfurt.de", comment=c(ORCID="0000-0001-6772-7595")),
Expand All @@ -17,7 +17,7 @@ biocViews: Software, Visualization, DataRepresentation, MultipleComparison, Func
Imports:
ape, bioDist, BiocStyle, Biostrings, colourpicker, data.table, dplyr, DT,
energy, ExperimentHub, extrafont, ggplot2, gridExtra, pbapply, RColorBrewer,
RCurl, shiny, shinyBS, shinycssloaders, shinyFiles, shinyjs, stringr,
RCurl, shiny, shinyBS, shinycssloaders, shinyFiles, shinyjs, stringr, umap,
xml2, zoo, yaml
RoxygenNote: 7.3.1
Suggests: knitr, rmarkdown, testthat, OmaDB
Expand Down
6 changes: 6 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ export(createGeneAgePlot)
export(createLongMatrix)
export(createPercentageDistributionData)
export(createProfileFromOma)
export(createUmapPlotData)
export(createUnrootedTree)
export(createVarDistPlot)
export(createVariableDistributionData)
Expand Down Expand Up @@ -61,13 +62,16 @@ export(linearizeArchitecture)
export(mainTaxonomyRank)
export(parseDomainInput)
export(parseInfoProfile)
export(plotUmap)
export(prepareUmapData)
export(processNcbiTaxonomy)
export(processOrthoID)
export(reduceProfile)
export(runPhyloProfile)
export(sortInputTaxa)
export(sortTaxaFromTree)
export(taxonomyTableCreator)
export(umapClustering)
export(varDistTaxPlot)
export(wideToLong)
export(xmlParser)
Expand All @@ -76,6 +80,7 @@ import(ExperimentHub)
import(data.table, except = c(first, last, between))
import(dplyr)
import(ggplot2)
import(umap)
importFrom(Biostrings,readAAStringSet)
importFrom(DT,dataTableOutput)
importFrom(DT,renderDataTable)
Expand Down Expand Up @@ -132,6 +137,7 @@ importFrom(stringr,str_count)
importFrom(stringr,str_split_fixed)
importFrom(stringr,str_wrap)
importFrom(utils,head)
importFrom(utils,tail)
importFrom(xml2,read_xml)
importFrom(xml2,xml_attr)
importFrom(xml2,xml_find_all)
Expand Down
207 changes: 207 additions & 0 deletions R/umapClustering.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,207 @@
#' Prepare data for UMAP
#' @export
#' @usage prepareUmapData(longDf = NULL, taxonRank = NULL, type = "taxa",
#' taxDB = NULL, cutoff = 0)
#' @param longDf input phyloprofile file in long format
#' @param taxonRank taxonomy rank for labels (e.g. "phylum")
#' @param type type of clustering, either "taxa" (default) or "genes"
#' @param taxDB path to taxonomy database
#' @param cutoff cutoff to filter data values. Default: 0
#' @return A dataframe in wide format
#' @author Vinh Tran tran@bio.uni-frankfurt.de
#' @examples
#' rawInput <- system.file(
#' "extdata", "test.main.long", package = "PhyloProfile", mustWork = TRUE
#' )
#' longDf <- createLongMatrix(rawInput)
#' prepareUmapData(longDf, "phylum")

prepareUmapData <- function(
longDf = NULL, taxonRank = NULL, type = "taxa", taxDB = NULL, cutoff = 0
) {
if (is.null(longDf)) stop("Input data cannot be NULL")
if (is.null(taxonRank)) stop("Taxon rank must be specified!")
FAS_F <- FAS_B <- geneID <- ncbiID <- abbrName <- fullName <- NULL
# filter and subset input df
longDfSub <- longDf %>% filter(FAS_F >= cutoff & FAS_B >= cutoff) %>%
select(geneID, ncbiID, FAS_F)
# get taxon names for input taxa based on a selected supertaxon rank
taxMatrix <- getTaxonomyMatrix(taxDB)
nameList <- getNameList(taxDB)
superTaxonDf <- taxMatrix %>%
filter(abbrName %in% longDfSub$ncbiID) %>%
select(abbrName, one_of(taxonRank))
colnames(superTaxonDf) <- c("ncbiID", "superID")
superTaxonDf <- dplyr::left_join(
superTaxonDf, nameList, by = c("superID" = "ncbiID")
)
# transform to wide format, add taxon names and ortholog count
if (type == "taxa") {
wideDf <- data.table::dcast(
setDT(longDfSub), ncbiID ~ geneID, value.var = c("FAS_F"),
fun.aggregate = max, fill = -1
)
wideDf <- dplyr::left_join(
wideDf, superTaxonDf %>% select(ncbiID, Label = fullName),
by = "ncbiID"
)
# add count (how many seed genes each taxon has, excluding co-orthologs)
seedWithTax <- longDfSub %>% select(geneID, ncbiID)
seedWithTax <- seedWithTax[!duplicated(seedWithTax),]
countDf <- data.frame(table(seedWithTax$ncbiID))
wideDf <- dplyr::left_join(
data.frame(wideDf), countDf, by = c("ncbiID" = "Var1")
)
} else {
wideDf <- data.table::dcast(
setDT(longDfSub), geneID ~ ncbiID, value.var = c("FAS_F"),
fun.aggregate = max, fill = -1
)
wideDf$Label <- wideDf$geneID
# add count (how many taxa has orthologs for each seed gene)
seedWithTax <- longDfSub %>% select(geneID, ncbiID)
seedWithTax <- seedWithTax[!duplicated(seedWithTax),]
countDf <- data.frame(table(longDfSub$geneID))
wideDf <- dplyr::left_join(
data.frame(wideDf), countDf, by = c("geneID" = "Var1")
)
}
return(wideDf)
}

#' Perform UMAP clustering
#' @export
#' @usage umapClustering(data4umap = NULL, by = "taxa", type = "binary",
#' randomSeed = 123)
#' @param data4umap data for UMAP clustering (output from prepareUmapData)
#' @param by cluster data by "taxa" (default) or "genes"
#' @param type type of data, either "binary" (default) or "non-binary"
#' @param randomSeed random seed. Default: 123
#' @import umap
#' @return A list contain UMAP cluster objects
#' @author Vinh Tran tran@bio.uni-frankfurt.de
#' @seealso \code{\link{prepareUmapData}}
#' @examples
#' rawInput <- system.file(
#' "extdata", "test.main.long", package = "PhyloProfile", mustWork = TRUE
#' )
#' longDf <- createLongMatrix(rawInput)
#' data4umap <- prepareUmapData(longDf, "phylum")
#' umapClustering(data4umap)

umapClustering <- function(
data4umap = NULL, by = "taxa", type = "binary", randomSeed = 123
) {
if (is.null(data4umap)) stop("Input data cannot be NULL!")
ncbiID <- Label <- Freq <- geneID <- NULL
if (by == "taxa") {
subsetDt <- subset(data4umap, select = -c(ncbiID, Label, Freq))

} else {
subsetDt <- subset(data4umap, select = -c(geneID, Label, Freq))
}
if (type == "binary") {
subsetDt[subsetDt >= 0] <- 1
subsetDt[subsetDt < 0] <- 0
}
df.umap <- umap(subsetDt, random_state = randomSeed, preserve.seed = TRUE)
return(df.umap)
}


#' Create UMAP cluster plot
#' @export
#' @usage createUmapPlotData(umapData = NULL, data4umap = NULL, labelNr = 5,
#' excludeTaxa = "None")
#' @param umapData data contains UMAP cluster (output from umapClustering())
#' @param data4umap data for UMAP clustering (output from prepareUmapData())
#' @param labelNr maximal number of labels. Default: 5
#' @param excludeTaxa hide taxa from plot. Default: "None"
#' @importFrom utils tail
#' @return A plot as ggplot object
#' @author Vinh Tran tran@bio.uni-frankfurt.de
#' @seealso \code{\link{prepareUmapData}}, \code{\link{umapClustering}}
#' @examples
#' rawInput <- system.file(
#' "extdata", "test.main.long", package = "PhyloProfile", mustWork = TRUE
#' )
#' longDf <- createLongMatrix(rawInput)
#' data4umap <- prepareUmapData(longDf, "phylum")
#' umapData <- umapClustering(data4umap)
#' createUmapPlotData(umapData, data4umap)

createUmapPlotData <- function(
umapData = NULL, data4umap = NULL, labelNr = 5, excludeTaxa = "None"
) {
if (is.null(umapData) | is.null(data4umap))
stop("Input data cannot be NULL!")
if (length(umapData) == 0 | length(data4umap) == 0)
stop("Input data cannot be EMPTY!")

Label <- Freq <- NULL

data4umap$X <- umapData$layout[,1]
data4umap$Y <- umapData$layout[,2]
# join less freq items into "other"
maxFreqDf <- data.frame(
data4umap %>% group_by(Label) %>% summarise(max = max(Freq))
)
minFreq <- tail(sort(unique(maxFreqDf$max)),labelNr)[1]
keepLabel <- unique(data4umap$Label[data4umap$Freq >= minFreq])
data4umap$Label[!(data4umap$Label %in% keepLabel)] <- "[Other]"
# exclude taxa
if (length(excludeTaxa) == 0) excludeTaxa <- c("None")
if (length(excludeTaxa) > 0 & excludeTaxa[1] != "None") {
data4umap$X[data4umap$Label %in% excludeTaxa] <- NA
data4umap$Y[data4umap$Label %in% excludeTaxa] <- NA
}
return(data4umap)
}


#' Create UMAP cluster plot
#' @export
#' @usage plotUmap(plotDf = NULL, legendPos = "right", colorPalette = "Set2",
#' transparent = 0, textSize = 12)
#' @param plotDf data for UMAP plot
#' @param legendPos position of legend. Default: "right"
#' @param colorPalette color palette. Default: "Set2"
#' @param transparent transparent level (from 0 to 1). Default: 0
#' @param textSize size of axis and legend text. Default: 12
#' @return A plot as ggplot object
#' @author Vinh Tran tran@bio.uni-frankfurt.de
#' @seealso \code{\link{prepareUmapData}}, \code{\link{umapClustering}},
#' \code{\link{createUmapPlotData}}
#' @examples
#' rawInput <- system.file(
#' "extdata", "test.main.long", package = "PhyloProfile", mustWork = TRUE
#' )
#' longDf <- createLongMatrix(rawInput)
#' umapData <- prepareUmapData(longDf, "phylum")
#' data.umap <- umapClustering(umapData)
#' plotDf <- createUmapPlotData(data.umap, umapData)
#' plotUmap(plotDf)

plotUmap <- function(
plotDf = NULL, legendPos = "right", colorPalette = "Set2",
transparent = 0, textSize = 12
) {
if (is.null(plotDf)) stop("Input data cannot be NULL!")
X <- Y <- Label <- Freq <- NULL
# generate plot
plot <- ggplot(plotDf, aes(x = X, y = Y, color = Label)) +
geom_point(aes(size = Freq), alpha = 1 - transparent) +
geom_rug(alpha = 1) +
theme_minimal() +
labs(x = "", y = "") +
scale_color_brewer(palette = colorPalette) +
guides(color = guide_legend(override.aes = list(alpha = 1))) +
theme(
legend.position = legendPos,
legend.text = element_text(size = textSize),
legend.title = element_text(size = textSize),
axis.text = element_text(size = textSize),
axis.title = element_text(size = textSize),
)
return(plot)
}
1 change: 1 addition & 0 deletions inst/NEWS.Rd
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

\section{Version 1.20.0}{
\itemize{
\item added UMAP clustering
\item improved domain plot #110
\item option to change font
\item option to show gene names
Expand Down
20 changes: 10 additions & 10 deletions inst/PhyloProfile/R/createDetailedPlot.R
Original file line number Diff line number Diff line change
Expand Up @@ -31,12 +31,12 @@ createDetailedPlot <- function(
plotDf <- data()
if (
checkBionfFormat(
plotDf$orthoID[1], plotDf$geneID[1],
plotDf$orthoID[1], plotDf$geneID[1],
gsub('ncbi','',plotDf$abbrName[1])
)
) {
plotDf <- within(
plotDf,
plotDf,
orthoMod <- data.frame(
do.call(
'rbind', strsplit(as.character(orthoID),'|', fixed=TRUE)
Expand All @@ -47,7 +47,7 @@ createDetailedPlot <- function(
}
return(plotDf)
})

# render detailed plot -----------------------------------------------------
output$detailPlot <- renderPlot({
detailPlot(plotData(), detailedText(), var1ID(), var2ID(), font())
Expand All @@ -67,7 +67,7 @@ createDetailedPlot <- function(

output$downloadDetailed <- downloadHandler(
filename = function() {
c("detailedPlot.pdf")
c("detailedPlot.svg")
},
content = function(file) {
g <- detailPlot(data(), detailedText(), var1ID(), var2ID(), font())
Expand All @@ -78,7 +78,7 @@ createDetailedPlot <- function(
height = detailedHeight() * 0.056458333,
units = "cm",
dpi = 300,
device = "pdf",
device = "svg",
limitsize = FALSE
)
}
Expand All @@ -88,7 +88,7 @@ createDetailedPlot <- function(
pointInfoDetail <- reactive({
selDf <- data()
selDf$orthoID <- as.character(selDf$orthoID)

# if only one ortholog, get directly from data()
if (nrow(selDf) == 1) {
seedID <- as.character(selDf$geneID)
Expand All @@ -107,22 +107,22 @@ createDetailedPlot <- function(
orthoID <- as.character(selDf$orthoID[corX])
taxID <- as.character(selDf$abbrName[corX])
}

# get var1, var2
var1 <- as.list(selDf$var1[selDf$orthoID == orthoID])
var1 <- as.character(var1[!is.na(var1)])
var2 <- as.list(selDf$var2[selDf$orthoID == orthoID])
var2 <- as.character(var2[!is.na(var2)])
if (length(var2) == 0) var2 = "NA"

ncbiID <- selDf[selDf$orthoID == orthoID, ]$abbrName
ncbiID <- as.character(ncbiID[!is.na(ncbiID)][1])

# return info
if (is.na(orthoID))
if (is.na(orthoID))
return(NULL)
else {
if (orthoID != "NA")
if (orthoID != "NA")
return(c(seedID, orthoID, var1, var2, ncbiID))
}
})
Expand Down
5 changes: 2 additions & 3 deletions inst/PhyloProfile/R/createProfilePlot.R
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,6 @@ createProfilePlot <- function(input, output, session,
if (is.null(data())) stop("Profile data is NULL!")
if (typeProfile() == "customizedProfile") {
if (is.null(inTaxa()) | is.null(inSeq())) return()

dataHeat <- dataCustomizedPlot(data(), inTaxa(), inSeq())
if (applyCluster() == TRUE) {
dataHeat <- dataCustomizedPlot(
Expand Down Expand Up @@ -124,7 +123,7 @@ createProfilePlot <- function(input, output, session,

output$profileDownload <- downloadHandler(
filename = function() {
c("profile.pdf")
c("profile.svg")
},
content = function(file) {
ggsave(
Expand All @@ -137,7 +136,7 @@ createProfilePlot <- function(input, output, session,
),
width = parameters()$width * 0.056458333,
height = parameters()$height * 0.056458333,
units = "cm", dpi = 300, device = "pdf", limitsize = FALSE
units = "cm", dpi = 300, device = "svg", limitsize = FALSE
)
}
)
Expand Down
2 changes: 1 addition & 1 deletion inst/PhyloProfile/R/downloadPlotSettings.R
Original file line number Diff line number Diff line change
Expand Up @@ -169,7 +169,7 @@ writePlottingScript <- function(settingsFile) {
"p <- heatmapPlotting(plotDf, plotParameter)",
"ggsave(args[3], plot = p, width = settings$width * 0.056458333,
height = settings$height * 0.056458333,
units = 'cm', dpi = 300, device = 'pdf', limitsize = FALSE)",
units = 'cm', dpi = 300, device = 'svg', limitsize = FALSE)",
"print('DONE! Your plot is saved in', args[3])"
)
return(out)
Expand Down
Loading

0 comments on commit 22e8bdd

Please sign in to comment.