-
Notifications
You must be signed in to change notification settings - Fork 7
/
Copy pathtree2ncbitax
executable file
·115 lines (94 loc) · 4.68 KB
/
tree2ncbitax
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
#!/usr/bin/env Rscript
### tree2ncbitax v1.0.0 (C) Max Planck Society for the Advancement of Science
###
### Code developed by Josue Barrera-Redondo <josue.barrera@tuebingen.mpg.de>
###
### This program is free software: you can redistribute it and/or modify
### it under the terms of the GNU General Public License as published by
### the Free Software Foundation, either version 3 of the License, or
### (at your option) any later version.
###
### This program is distributed in the hope that it will be useful,
### but WITHOUT ANY WARRANTY; without even the implied warranty of
### MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
### GNU General Public License for more details.
library("optparse")
# Establish input and output arguments for the script
option_list = list(
make_option(c("-i", "--input"), type="character", default=NULL,
help="Name of input NEWICK tree", metavar="file"),
make_option(c("-g", "--genome"), type="character", default=NULL,
help="Name of the target genome (the name should match the NEWICK labels)", metavar = "STRING"),
make_option(c("-t", "--taxid"), type="integer", default=NULL,
help="NCBI taxonomy ID that is shared between all the organims in the phylogeny", metavar="NUM"),
make_option(c("-o", "--out"), type="character", default="tree.csv",
help="Output file name [default= %default]", metavar="OUT")
);
opt_parser = OptionParser(option_list=option_list);
opt = parse_args(opt_parser);
# Add a couple of error messages if the user forgets to add the necessary input
if (is.null(opt$input)){
message ("")
message ("tree2ncbitax: a script to parse NEWICK trees into a table for GenEra")
message ("Used to add infraspecies nodes for detailed analysis of putative de novo genes")
message ("")
print_help(opt_parser)
stop("ERROR: Could not find the NEWICK tree", call.=FALSE)
}
if (is.null(opt$genome)){
message ("")
message ("tree2ncbitax: a script to parse NEWICK trees into a table for GenEra")
message ("Used to add infraspecies nodes for detailed analysis of putative de novo genes")
message ("")
print_help(opt_parser)
stop("ERROR: The user forgot to add the name of the target genome", call.=FALSE)
}
if (is.null(opt$taxid)){
message ("")
message ("tree2ncbitax: a script to parse NEWICK trees into a table for GenEra")
message ("Used to add infraspecies nodes for detailed analysis of putative de novo genes")
message ("")
print_help(opt_parser)
stop("ERROR: The user forgot to add the NCBI Taxonomy ID ", call.=FALSE)
}
library("phytools")
# Read NEWICK tree
INPUT_TREE<-read.tree(file = opt$input)
# Export tree as dataframe
TREE_TABLE<-as.data.frame(phytools::compute.mr(INPUT_TREE, type = "matrix"))
# Replace standard column names with node names
original_col_names <- colnames(TREE_TABLE)
new_col_names <- gsub("V", "Node_", original_col_names)
colnames(TREE_TABLE) <- new_col_names
# Replace presence values (1) with the name of the nodes
for (col_name in colnames(TREE_TABLE)) {
TREE_TABLE[[col_name]][TREE_TABLE[[col_name]] == 1] <- col_name
}
# Replace absence values (0) with NAs
TREE_TABLE[TREE_TABLE == 0] <- NA
# Eliminate the nodes that are not relevant for the target genome
target_row <- which(rownames(TREE_TABLE) == opt$genome)
columns_with_na <- which(is.na(TREE_TABLE[target_row, ]))
TARGET_TABLE <- TREE_TABLE[, -columns_with_na]
# Rearrange the nodes in the dataframe from youngest to oldest in the phylogeny
na_counts <- colSums(is.na(TARGET_TABLE))
TARGET_TABLE <- TARGET_TABLE[, order(-na_counts)]
# Rearrange the row names by phylogenetic proximity to the target genome
na_counts <- rowSums(is.na(TARGET_TABLE))
SORTED_TABLE <- TARGET_TABLE[order(na_counts), ]
# Put the target genome at the top of the dataframe
target_row_index <- which(rownames(SORTED_TABLE) == opt$genome)
SORTED_TABLE <- rbind(SORTED_TABLE[target_row_index, ], SORTED_TABLE[-target_row_index, ])
# Include the genome names as a column in the dataframe
SORTED_TABLE$Genomes <- rownames(SORTED_TABLE)
# Add a column with modified TAXIDs that will allow us to distinguish each genome in the tree from the rest of the NCBI taxonomy
num_rows <- nrow(SORTED_TABLE)
taxid_values <- paste(opt$taxid, 1:(num_rows-1), sep = "_")
taxid_values <- c(opt$taxid, taxid_values)
SORTED_TABLE$taxid_column <- taxid_values
# Move the taxids and the genome names in the 1rst and 2nd columns of the dataframe
FINAL_TABLE <- SORTED_TABLE[, c("taxid_column", "Genomes", names(SORTED_TABLE)[-c(ncol(SORTED_TABLE))])]
FINAL_TABLE <- FINAL_TABLE[, -ncol(FINAL_TABLE)]
# Save the dataframe as a CSV table
FINAL_TABLE[is.na(FINAL_TABLE)] <- ""
write.table(FINAL_TABLE, opt$out, sep = ",", col.names = FALSE, row.names = FALSE, quote = FALSE)