Skip to content

Commit

Permalink
Robust, Universal tree balance (#160)
Browse files Browse the repository at this point in the history
Incorporating code from Rob Noble
  • Loading branch information
ms609 authored Jun 20, 2024
1 parent 34cdc05 commit a18a48b
Show file tree
Hide file tree
Showing 14 changed files with 528 additions and 6 deletions.
12 changes: 8 additions & 4 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,16 +1,19 @@
Package: TreeTools
Title: Create, Modify and Analyse Phylogenetic Trees
Version: 1.11.1.9000
Version: 1.11.1.9001
Authors@R: c(
person("Martin R.", 'Smith', role = c("aut", "cre", "cph"),
email = "martin.smith@durham.ac.uk",
comment = c(ORCID = "0000-0001-5660-1727")),
person(given = "Emmanuel", family = "Paradis", role = c("cph"),
comment = c(ORCID = "0000-0003-3092-2199"))
comment = c(ORCID = "0000-0003-3092-2199")),
person(given = "Robert", family = "Noble", role = c("cph"),
comment = c(ORCID = "0000-0002-8057-4252"))
)
License: GPL (>= 3)
Copyright: Incorporates C/C++ code from 'ape' by Emmanuel Paradis
<doi:10.1093/bioinformatics/bty633>
<doi:10.1093/bioinformatics/bty633> and 'RUtreebalance' by Robert Noble
<doi:10.5281/zenodo.5873857>.
Description: Efficient implementations of functions for the creation,
modification and analysis of phylogenetic trees.
Applications include:
Expand All @@ -23,7 +26,8 @@ Description: Efficient implementations of functions for the creation,
<doi:10.1371/journal.pone.0212942>;
calculation of ancestor-descendant relationships,
of 'stemwardness' (Asher & Smith, 2022) <doi:10.1093/sysbio/syab072>,
and of tree balance (Mir et al. 2013) <doi:10.1016/j.mbs.2012.10.005>;
and of tree balance (Mir et al. 2013, Lemant et al. 2022)
<doi:10.1016/j.mbs.2012.10.005>, <doi:10.1093/sysbio/syac027>;
artificial extinction (Asher & Smith, 2022) <doi:10.1093/sysbio/syab072>;
import and export of trees from Newick, Nexus (Maddison et al. 1997)
<doi:10.1093/sysbio/46.4.590>,
Expand Down
3 changes: 3 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -261,6 +261,8 @@ export(ForestSplits)
export(Hamming)
export(IC1Spr)
export(ImposeConstraint)
export(J1Index)
export(JQIndex)
export(KeepTip)
export(KeepTipPostorder)
export(KeepTipPreorder)
Expand Down Expand Up @@ -459,6 +461,7 @@ importFrom(methods,setClass)
importFrom(methods,setMethod)
importFrom(stats,as.dist)
importFrom(stats,median)
importFrom(stats,na.omit)
importFrom(stats,runif)
importFrom(stats,setNames)
importFrom(utils,combn)
Expand Down
7 changes: 7 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,10 @@
# TreeTools 1.11.1.9001 (2024-06-19) #

- `J1Index()` computes the robust, universal tree balance measure of
Lemant _et al_. 2022 <doi:10.1093/sysbio/syac027>, incorporating [code](
https://github.com/robjohnnoble/RUtreebalance) by Rob Noble.


# TreeTools 1.11.1.9000 (2024-06-19) #

- Support larger trees in `TotalCopheneticIndex()`, fixing
Expand Down
319 changes: 319 additions & 0 deletions R/RUtreebalance.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,319 @@
# This file is adapted from code by Rob Noble, doi:10.5281/zenodo.5873857
# https://github.com/robjohnnoble/RUtreebalance

# MIT License
#
# Copyright (c) 2021 Rob Noble
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.


# Find the parent of a node.
#
# Example:
# edges1 <- data.frame(Parent = c(1, 1, 1, 3, 3), Identity = 2:6)
# .MoveUp(edges1, 3)
.MoveUp <- function(edges, identity) {
if (!(identity %in% edges[["Identity"]]) &&
!(identity %in% edges[["Parent"]])) {
stop("Invalid identity.")
}
parent <- edges[edges[["Identity"]] == identity, "Parent"]
if (length(parent) == 0) {
# if identity is the root then don't move
identity
} else if (is.factor(parent)) {
levels(parent)[parent]
} else {
parent
}
}

# Find the root node of a tree.
#
# Example:
# edges1 <- data.frame(Parent = c(1,1,1,3,3), Identity = 2:6)
# .FindRoot(edges1)
.FindRoot <- function(edges) {
start <- edges[["Parent"]][[1]] # reasonable guess
if (is.factor(start)) {
start <- levels(start)[start]
}
repeat {
if (.MoveUp(edges, start) == start) break
start <- .MoveUp(edges, start)
}
return(start)
}

# Get a list of all subtree sizes via depth-first search.
# Optionally provide root node i and adjacency list Adj if known.
#
# Example:
# tree1 <- data.frame(Parent = c(1, 1, 1, 1, 2, 3, 4),
# Identity = 1:7,
# Population = c(1, rep(5, 6)))
# .GetSubtreeSizes(tree1)
.GetSubtreeSizes <- function(
tree,
i = which(tree[["Identity"]] == .FindRoot(tree[, 1:2])),
Adj = .GetAdjacency(tree),
Col = NULL, Cumul = NULL, is_leaf = NULL) {
n <- length(tree[["Identity"]])
has_pops <- "Population" %in% colnames(tree)

uniqueID <- unique(tree[["Identity"]])
if (is.null(Col)) {
Col <- setNames(rep("w", n), uniqueID)
}
if (is.null(Cumul)) {
Cumul <- setNames(rep(NA, n), uniqueID)
}
if (is.null(is_leaf)) {
is_leaf <- setNames(logical(n), uniqueID)
}
if (is.null(Adj[[i]])) {
is_leaf[[i]] <- TRUE
}
for (j in Adj[[i]]) {
if (Col[[j]] == "w"){
L <- .GetSubtreeSizes(tree, j, Adj, Col, Cumul, is_leaf)
Col<- L[["colour"]]
Cumul <- L[["cumulative"]]
is_leaf <- L[["is_leaf"]]
}
}
Col[[i]] <- "b"
Cumul[[i]] <- if (has_pops) {
tree[["Population"]][[i]] + sum(Cumul[Adj[[i]]])
} else {
ifelse(is_leaf[[i]], 1, 0) + sum(Cumul[Adj[[i]]])
}
# Return:
list("colour" = Col,"cumulative" = Cumul, "is_leaf" = is_leaf)
}

# Get adjacency list of a tree.
#
# Example:
# tree1 <- data.frame(Parent = c(1, 1, 1, 1, 2, 3, 4),
# Identity = 1:7,
# Population = c(1, rep(5, 6)))
# .GetAdjacency(tree1)
.GetAdjacency <- function(tree) {
n <- length(tree[["Identity"]])
Adj <- vector(mode = "list", length = n)
for (i in seq_len(n)) {
if(tree$Parent[[i]] != tree$Identity[[i]]) {
p <- match(tree$Parent[[i]], tree$Identity)
Adj[[p]] <- c(Adj[[p]], i)
}
}

# Return:
Adj
}

# Add a row to the edges list to represent the root node (if not already present)
.AddRootRow <- function(tree) {
start <- setdiff(tree[["Parent"]], tree[["Identity"]])
if (length(start) > 1) {
stop("Input dataframe is missing one or more rows")
}
if (length(start) > 0) { # add row for root node
root_row <- if ("Population" %in% colnames(tree)) {
data.frame(Parent = start, Identity = start, Population = 0)
} else {
data.frame(Parent = start, Identity = start)
}
tree <- rbind(root_row, tree)
}

# Return:
tree
}

#' Robust universal tree balance index
#'
#' Calculate tree balance index \ifelse{html}{\out{J<sup>1</sup>}}{\eqn{J^1}}
#' (when `nonRootDominance = FALSE`) or
#' \ifelse{html}{\out{J<sup>1c</sup>}}{\eqn{J^{1c}}}
#' (when `nonRootDominance = TRUE`) from \insertRef{Lemant2022}{TreeTools}.
#'
#' If population sizes are not provided, then the function assigns size 0 to
#' internal nodes, and size 1 to leaves.
#'
#' @param tree Either an object of class 'phylo', or a dataframe with column
#' names Parent, Identity and (optionally) Population.
#' The latter is similar to `tree$edge`, where tree is an object of class
#' 'phylo'; the differences are in class (data.frame versus matrix) and column
#' names.
#' The dataframe may (but isn't required to) include a row for the root node.
#' If population sizes are omitted then internal nodes will be assigned
#' population size zero and leaves will be assigned population size one.
#' @param q Numeric between zero and one specifying sensitivity to type
#' frequencies. If `q < 1`, the \ifelse{html}{\out{J<sup>q</sup>}}{\eqn{J^q}}
#' index - based on generalized entropy - will be returned; see
#' \insertCite{Lemant2022;textual}{TreeTools}, page 1223.
#' @param nonRootDominance Logical specifying whether to use non-root dominance
#' factor.
#'
#' @examples
#' # Using phylo object as input:
#' phylo_tree <- read.tree(text="((a:0.1)A:0.5,(b1:0.2,b2:0.1)B:0.2);")
#' J1Index(phylo_tree)
#' phylo_tree2 <- read.tree(text='((A, B), ((C, D), (E, F)));')
#' J1Index(phylo_tree2)
#'
#' # Using edges lists as input:
#' tree1 <- data.frame(Parent = c(1, 1, 1, 1, 2, 3, 4),
#' Identity = 1:7,
#' Population = c(1, rep(5, 6)))
#' J1Index(tree1)
#' tree2 <- data.frame(Parent = c(1, 1, 1, 1, 2, 3, 4),
#' Identity = 1:7,
#' Population = c(rep(0, 4), rep(1, 3)))
#' J1Index(tree2)
#' tree3 <- data.frame(Parent = c(1, 1, 1, 1, 2, 3, 4),
#' Identity = 1:7,
#' Population = c(0, rep(1, 3), rep(0, 3)))
#' J1Index(tree3)
#' cat_tree <- data.frame(Parent = c(1, 1:14, 1:15, 15),
#' Identity = 1:31,
#' Population = c(rep(0, 15), rep(1, 16)))
#' J1Index(cat_tree)
#'
#' # If population sizes are omitted then internal nodes are assigned population
#' # size zero and leaves are assigned population size one:
#' sym_tree1 <- data.frame(Parent = c(1, rep(1:15, each = 2)),
#' Identity = 1:31,
#' Population = c(rep(0, 15), rep(1, 16)))
#' # Equivalently:
#' sym_tree2 <- data.frame(Parent = c(1, rep(1:15, each = 2)),
#' Identity = 1:31)
#' J1Index(sym_tree1)
#' J1Index(sym_tree2)
#' @author Rob Noble, adapted by Martin R. Smith
#' @references \insertAllCited{}
#' @family tree characterization functions
#' @importFrom stats na.omit
#' @export
J1Index <- function(tree, q = 1, nonRootDominance = FALSE) {

if (!is.na(tree)[[1]]) {
if(inherits(tree, "phylo")) {
# convert from phylo object to data frame
tree <- as.data.frame(tree[["edge"]])
colnames(tree) <- c("Parent", "Identity")
}
tree <- na.omit(tree) # remove any rows containing NA
if(is.factor(tree[["Parent"]])) {
tree[["Parent"]] <- levels(tree[["Parent"]])[tree[["Parent"]]]
}
if (is.factor(tree[["Identity"]])) {
tree[["Identity"]] <- levels(tree[["Identity"]])[tree[["Identity"]]]
}
tree <- .AddRootRow(tree)
}

n <- length(tree[["Identity"]])
if (n <= 1) {
return(0)
}

# Adjacency list
Adj <- .GetAdjacency(tree)

# Get the list of all subtree sizes
subtree_sizes <- .GetSubtreeSizes(tree, Adj = Adj)

# Subtree sizes, including the root
Cumul <- subtree_sizes[["cumulative"]]

# Vector of internal nodes
eff_int_nodes <- which(!subtree_sizes[["is_leaf"]])

# Vector of leaves
leaves <- which(subtree_sizes[["is_leaf"]])

# If population sizes are missing then assign size 0 to internal nodes,
# and size 1 to leaves:
if(!("Population" %in% colnames(tree))) {
tree[["Population"]] <- double(n)
tree[["Population"]][leaves] <- 1
}
if (sum(tree$Population) <= 0) {
stop("At least one node must have Population > 0")
}
J <- 0
# Subtree sizes, excluding the root
Star <- Cumul - tree[["Population"]]
for (i in seq_len(n)){ # Loop over all nodes
if (Star[[i]] > 0){ # Node has at least one child with non-zero size
K <- 0
if(length(Adj[[i]]) > 1) { # Otherwise i has only one child and its
# balance score is 0
eff_children <- 0 # Number of children with non-zero size
for (j in Adj[[i]]) {
if (Cumul[[j]] > 0){ # Otherwise child j has a 0-sized subtree and
# does not count
eff_children <- eff_children + 1

# p is the ratio of the child subtree size including the root
# (root = the child)
# to the parent subtree size excluding the root
p <- Cumul[[j]] / Star[[i]]

# K is the sum of the node balance scores
K <- K + if (q == 1) {-p * log(p)} else {p ^ q}
}
}
# Non-root dominance factor:
if (isTRUE(nonRootDominance)) {
h_factor <- Star[[i]] / Cumul[[i]]
} else {
h_factor <- 1
}

# Normalize the sum of balance scores, adjust for non-root dominance,
# and then add the result to the index
if (eff_children > 1) { # Exclude nodes that have only one child with
# size greater than zero
J_increment <- h_factor * Star[[i]] * if (q == 1) {
K / log(eff_children)
} else {
(1 - K) * eff_children ^ (q - 1) / (eff_children ^ (q - 1) - 1)
}
J <- J + J_increment
}
}
}
}
# normalize the index by dividing by the sum of all subtree sizes:
if (length(eff_int_nodes) > 0) {
J <- J / sum(Star[eff_int_nodes])
}

# Return:
as.numeric(J)
}

#' @rdname J1Index
#' @export
JQIndex <- J1Index
Loading

0 comments on commit a18a48b

Please sign in to comment.