Merge pull request #168 from picrust/v2.4.0

V2.4.0
picrust · Mar 29, 2021 · 0d3150c · 0d3150c
2 parents 8a2e3df + 24b0318
commit 0d3150c
Show file tree

Hide file tree

Showing 44 changed files with 40,545 additions and 206 deletions.
diff --git a/README.md b/README.md
@@ -1,8 +1,6 @@
 # PICRUSt2
 
-**Please note that PICRUSt2 is currently in beta**.
-
-See the [PICRUSt2 wiki](https://github.com/picrust/picrust2/wiki) for the documentation.
+Pleae see the [PICRUSt2 wiki](https://github.com/picrust/picrust2/wiki) for the documentation and tutorials.
 
 Note that this repository contains the source code for MinPath, which is distributed under the GNU General Public License.
 
diff --git a/picrust2-env.yaml b/picrust2-env.yaml
@@ -7,19 +7,20 @@ channels:
 - defaults
 
 dependencies:
-- biom-format >=2.1.7
-- epa-ng=0.3.6
-- gappa=0.5.1
+- biom-format >=2.1.10
+- epa-ng=0.3.8
+- gappa=0.7.0
 - glpk=4.65
-- h5py >=2.9.0
+- h5py >=3.1.0
 - hmmer >=3.1b2,<=3.2.1
-- jinja2 >=2.10.1
-- joblib >=0.13.1
-- numpy >=1.16.2
-- pandas >=0.24.2,<=0.25.2
+- jinja2 >=2.11.3
+- joblib >=1.0.1
+- numpy >=1.19.5
+- pandas >=1.1.5
 - pytest >=4.4.1
 - pytest-cov >=2.6.1
 - python >=3.5,<3.7
 - r-base >=3.5.1
-- r-castor=1.5.0
+- r-castor=1.6.5
 - scipy >=1.2.1
+- sepp=4.3.10
diff --git a/picrust2/Rscripts/castor_hsp.R b/picrust2/Rscripts/castor_hsp.R
@@ -9,11 +9,12 @@ Args <- commandArgs(TRUE)
 full_tree <- read_tree(file=Args[1], check_label_uniqueness = TRUE)
 trait_values <- read.delim(Args[2], check.names=FALSE, row.names=1)
 hsp_method <- Args[3]
-calc_ci <- as.logical(Args[4])
-check_input_set <- as.logical(Args[5])
-predict_outfile <- Args[6]
-ci_outfile <- Args[7]
-seed_setting <- Args[8]
+edge_exponent_set <- as.numeric(Args[4])
+calc_ci <- as.logical(Args[5])
+check_input_set <- as.logical(Args[6])
+predict_outfile <- Args[7]
+ci_outfile <- Args[8]
+seed_setting <- Args[9]
 
 # Set random seed if integer specified.
 if(seed_setting != "None") {
@@ -67,7 +68,7 @@ mp_study_probs <- function(in_trait, in_tree ,unknown_i, check_input) {
                                   tip_states = in_trait,
                                   check_input=check_input,
                                   transition_costs = "proportional",
-                                  edge_exponent=0.5,
+                                  edge_exponent=edge_exponent_set,
                                   weight_by_scenarios = TRUE)
 
   return(get_sorted_prob(mp_hsp_out$likelihoods,
@@ -89,14 +90,43 @@ emp_prob_study_probs <- function(in_trait, in_tree, unknown_i, check_input) {
                          tree_tips=in_tree$tip.label))
 }
 
+
+# First check if any tip labels are present in the trait table but do not match
+# because they have quotes around them.
+if(length(grep("\"", full_tree$tip.label) > 0) || length(grep("\'", full_tree$tip.label) > 0)) {
+
+    tmp_unknown_tips_index <- which(! full_tree$tip.label %in% rownames(trait_values))
+    tmp_unknown_tips <- full_tree$tip.label[tmp_unknown_tips_index]
+
+    unknown_labels_no_quotes <- gsub("\'", "", tmp_unknown_tips)
+    unknown_labels_no_quotes <- gsub("\"", "", unknown_labels_no_quotes)
+
+    no_quote_matches = which(unknown_labels_no_quotes %in% rownames(trait_values))
+
+    # Remove quotes from around tips where matches only occur when quotes are removed.
+    if(length(no_quote_matches) > 0) {
+        indices_to_change <- tmp_unknown_tips_index[no_quote_matches]
+        full_tree$tip.label[indices_to_change] <- unknown_labels_no_quotes[no_quote_matches]
+    }
+
+    cat(paste("\nWhile running castor_hsp.R: Fixed ", length(no_quote_matches),
+        " tip label(s) that had quotations around them that stopped them from matching the function abundance table.",
+        sep=""))
+}
+
 # Order the trait table to match the tree tip labels. Set all tips without a value to be NA.
 unknown_tips_index <- which(! full_tree$tip.label %in% rownames(trait_values))
 unknown_tips <- full_tree$tip.label[unknown_tips_index]
-num_unknown = length(unknown_tips)
+num_unknown <- length(unknown_tips)
+num_known <- length(full_tree$tip.label) - num_unknown
 
 # Throw error if all tips are unknown.
 if(num_unknown == length(full_tree$tip.label)) {
   stop("None of the reference ids within the function abundance table are found within the input tree. This can occur when malformed or mismatched custom reference files are used.")
+
+# Check if only very few tips are "known".
+} else if(num_known / nrow(trait_values) < 0.1) {
+    stop("Fewer than 10% of reference ids in the function abundance table are in the tree. This could be because the ids are slightly different between the table and the tree.")
 }
 
 unknown_df <- as.data.frame(matrix(NA,
@@ -119,7 +149,7 @@ trait_values <- trait_values[full_tree$tip.label, , drop=FALSE]
 
 num_tip <- nrow(trait_values)
 
-if (hsp_method == "pic" | hsp_method == "scp" | hsp_method == "subtree_average") {
+if (hsp_method == "pic" || hsp_method == "scp" || hsp_method == "subtree_average") {
 
   if (hsp_method == "pic") {
     predict_out <- lapply(trait_values,
@@ -150,7 +180,7 @@ if (hsp_method == "pic" | hsp_method == "scp" | hsp_method == "subtree_average")
   remove(predict_out)
   invisible(gc(verbose = FALSE))
 
-} else if(hsp_method == "emp_prob" | hsp_method == "mp") {
+} else if(hsp_method == "emp_prob" || hsp_method == "mp") {
 
   # Add 1 to all input counts because because traits states need to start at 1.
   trait_values <- trait_values + 1
@@ -211,5 +241,14 @@ predicted_values <- data.frame(predicted_values, check.names = FALSE)
 predicted_values$sequence <- unknown_tips
 predicted_values <- predicted_values[, c("sequence", colnames(trait_values))]
 
+# Check to see if there are any columns that are totally missing.
+missing_values_by_column <- colSums(is.na(predicted_values))
+
+if(length(which(missing_values_by_column == nrow(predicted_values))) > 0) {
+    stop("\nError - at least one trait in the prediction table was entirely missing values.")
+} else if(length(which(missing_values_by_column > 0)) > 0) {
+    cat("\nWarning: there are missing values in the output prediction table.")
+}
+
 # Write out predicted values.
 write.table(predicted_values, file=predict_outfile, row.names=FALSE, quote=FALSE, sep="\t")
diff --git a/picrust2/__init__.py b/picrust2/__init__.py
@@ -1,7 +1,7 @@
 #!/usr/bin/env python
 
 __author__ = "The PICRUSt Development Team"
-__copyright__ = "Copyright 2018-2020, PICRUSt Project"
+__copyright__ = "Copyright 2018-2021, PICRUSt Project"
 __license__ = "GPL"
 __url__ = "https://github.com/picrust/picrust2"
-__version__ = "2.3.0-b"
+__version__ = "2.4.0"
diff --git a/picrust2/default.py b/picrust2/default.py
@@ -1,8 +1,8 @@
 #!/usr/bin/env python
 
-__copyright__ = "Copyright 2018-2020, The PICRUSt Project"
+__copyright__ = "Copyright 2018-2021, The PICRUSt Project"
 __license__ = "GPL"
-__version__ = "2.3.0-b"
+__version__ = "2.4.0"
 
 from os import path
 
@@ -11,14 +11,16 @@
 default_ref_dir = path.join(project_dir, "default_files", "prokaryotic",
                             "pro_ref")
 
-default_fasta = path.join(default_ref_dir, "pro_ref.fna.gz")
+default_fasta = path.join(default_ref_dir, "pro_ref.fna")
 
 default_tree = path.join(default_ref_dir, "pro_ref.tre")
 
 default_hmm = path.join(default_ref_dir, "pro_ref.hmm")
 
 default_model = path.join(default_ref_dir, "pro_ref.model")
 
+default_raxml_info = path.join(default_ref_dir, "pro_ref.raxml_info")
+
 default_regroup_map = path.join(project_dir, "default_files",
                                 "pathway_mapfiles",
                                 "ec_level4_to_metacyc_rxn.tsv")