diff --git a/.gitignore b/.gitignore index d9d07e6..b473205 100755 --- a/.gitignore +++ b/.gitignore @@ -31,3 +31,4 @@ benchmark/.Rhistory taxonkit/binaries* *names.dmp *nodes.dmp +*.json diff --git a/README.md b/README.md index d3e8f34..620dfe1 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,8 @@ -# TaxonKit - Crossplatform and Efficient NCBI Taxonomy Toolkit +# TaxonKit - Cross-platform and Efficient NCBI Taxonomy Toolkit **Documents:** [http://bioinf.shenwei.me/taxonkit](http://bioinf.shenwei.me/taxonkit) -([**Usage**](http://bioinf.shenwei.me/taxonkit/usage/)) +([**Usage**](http://bioinf.shenwei.me/taxonkit/usage/), +[**Tutorial**](http://bioinf.shenwei.me/taxonkit/tutorial/)) **Source code:** [https://github.com/shenwei356/taxonkit](https://github.com/shenwei356/taxonkit) [![GitHub stars](https://img.shields.io/github/stars/shenwei356/taxonkit.svg?style=social&label=Star&?maxAge=2592000)](https://github.com/shenwei356/taxonkit) @@ -20,7 +21,7 @@ Go to [Download Page](http://bioinf.shenwei.me/taxonkit/download) for more download options and changelogs. -`taxonkit` is implemented in [Golang](https://golang.org/) programming language, +`TaxonKit` is implemented in [Go](https://golang.org/) programming language, executable binary files **for most popular operating systems** are freely available in [release](https://github.com/shenwei356/taxonkit/releases) page. diff --git a/doc/docs/download.md b/doc/docs/download.md index b867b6e..166ae89 100644 --- a/doc/docs/download.md +++ b/doc/docs/download.md @@ -1,15 +1,17 @@ # Download -`taxonkit` is implemented in [Golang](https://golang.org/) programming language, +`TaxonKit` is implemented in [Go](https://golang.org/) programming language, executable binary files **for most popular operating system** are freely available in [release](https://github.com/shenwei356/taxonkit/releases) page. 
## Current Version -[taxonkit v0.1](https://github.com/shenwei356/taxonkit/releases/tag/v0.1) -[![Github Releases (by Release)](https://img.shields.io/github/downloads/shenwei356/taxonkit/v0.1/total.svg)](https://github.com/shenwei356/taxonkit/releases/tag/v0.1) +[TaxonKit v0.1.1](https://github.com/shenwei356/taxonkit/releases/tag/v0.1.1) +[![Github Releases (by Release)](https://img.shields.io/github/downloads/shenwei356/taxonkit/v0.1.1/total.svg)](https://github.com/shenwei356/taxonkit/releases/tag/v0.1.1) -- first release +- add feature of `taxonkit list`: users can choose output in readable JSON + format by flag `--json` so the taxonomy tree can be collapsed and + expanded in a modern text editor. Links: @@ -33,7 +35,7 @@ Links: [Download Page](https://github.com/shenwei356/taxonkit/releases) -`taxonkit` is implemented in [Golang](https://golang.org/) programming language, +`TaxonKit` is implemented in [Go](https://golang.org/) programming language, executable binary files **for most popular operating systems** are freely available in [release](https://github.com/shenwei356/taxonkit/releases) page. @@ -61,7 +63,9 @@ For Go developer, just one command: ## Previous Versions - +- [TaxonKit v0.1](https://github.com/shenwei356/taxonkit/releases/tag/v0.1) +[![Github Releases (by Release)](https://img.shields.io/github/downloads/shenwei356/taxonkit/v0.1/total.svg)](https://github.com/shenwei356/taxonkit/releases/tag/v0.1) + - first release
diff --git a/doc/docs/files/grep_result.png b/doc/docs/files/grep_result.png deleted file mode 100644 index da2acad..0000000 Binary files a/doc/docs/files/grep_result.png and /dev/null differ diff --git a/doc/docs/files/otu_table.csv b/doc/docs/files/otu_table.csv deleted file mode 100755 index 981cd84..0000000 --- a/doc/docs/files/otu_table.csv +++ /dev/null @@ -1,6 +0,0 @@ -Taxonomy,A.1,A.2,A.3,B.1,B.2,B.3,C.1,C.2,C.3 -Proteobacteria,.13,.29,.13,.16,.13,.22,.30,.23,.21 -Firmicutes,.42,.06,.49,.41,.55,.41,.32,.38,.66 -Bacteroidetes,.19,.62,.12,.33,.16,.29,.34,.35,.09 -Deferribacteres,.17,.00,.24,.01,.01,.01,.01,.01,.02 -Tenericutes,.00,.00,.00,.01,.03,.02,.00,.00,.00 diff --git a/doc/docs/files/otu_table.gAB.csv b/doc/docs/files/otu_table.gAB.csv deleted file mode 100644 index 608df9a..0000000 --- a/doc/docs/files/otu_table.gAB.csv +++ /dev/null @@ -1,6 +0,0 @@ -Taxonomy,A.1,A.2,A.3,B.1,B.2,B.3 -Proteobacteria,.13,.29,.13,.16,.13,.22 -Firmicutes,.42,.06,.49,.41,.55,.41 -Bacteroidetes,.19,.62,.12,.33,.16,.29 -Deferribacteres,.17,.00,.24,.01,.01,.01 -Tenericutes,.00,.00,.00,.01,.03,.02 diff --git a/doc/docs/files/otu_table.gAB.t.csv b/doc/docs/files/otu_table.gAB.t.csv deleted file mode 100644 index bf8fe0d..0000000 --- a/doc/docs/files/otu_table.gAB.t.csv +++ /dev/null @@ -1,7 +0,0 @@ -Taxonomy,Proteobacteria,Firmicutes,Bacteroidetes,Deferribacteres,Tenericutes -A.1,.13,.42,.19,.17,.00 -A.2,.29,.06,.62,.00,.00 -A.3,.13,.49,.12,.24,.00 -B.1,.16,.41,.33,.01,.01 -B.2,.13,.55,.16,.01,.03 -B.3,.22,.41,.29,.01,.02 diff --git a/doc/docs/files/otu_table.gAB.t.r.csv b/doc/docs/files/otu_table.gAB.t.r.csv deleted file mode 100644 index a9106d7..0000000 --- a/doc/docs/files/otu_table.gAB.t.r.csv +++ /dev/null @@ -1,7 +0,0 @@ -sample,Proteobacteria,Firmicutes,Bacteroidetes,Deferribacteres,Tenericutes -A.1,.13,.42,.19,.17,.00 -A.2,.29,.06,.62,.00,.00 -A.3,.13,.49,.12,.24,.00 -B.1,.16,.41,.33,.01,.01 -B.2,.13,.55,.16,.01,.03 -B.3,.22,.41,.29,.01,.02 diff --git 
a/doc/docs/files/otu_table2.csv b/doc/docs/files/otu_table2.csv deleted file mode 100644 index 4daaefe..0000000 --- a/doc/docs/files/otu_table2.csv +++ /dev/null @@ -1,7 +0,0 @@ -sample,Proteobacteria,Firmicutes,Bacteroidetes,Deferribacteres,Tenericutes,group -A.1,.13,.42,.19,.17,.00,A -A.2,.29,.06,.62,.00,.00,A -A.3,.13,.49,.12,.24,.00,A -B.1,.16,.41,.33,.01,.01,B -B.2,.13,.55,.16,.01,.03,B -B.3,.22,.41,.29,.01,.02,B diff --git a/doc/docs/files/otu_table3.csv b/doc/docs/files/otu_table3.csv deleted file mode 100644 index 50d1aad..0000000 --- a/doc/docs/files/otu_table3.csv +++ /dev/null @@ -1,7 +0,0 @@ -sample,Proteobacteria,Firmicutes,Bacteroidetes,Deferribacteres,Tenericutes,group -A.1,.13,.42,.19,.17,.00,Ctrl -A.2,.29,.06,.62,.00,.00,Ctrl -A.3,.13,.49,.12,.24,.00,Ctrl -B.1,.16,.41,.33,.01,.01,Treatment -B.2,.13,.55,.16,.01,.03,Treatment -B.3,.22,.41,.29,.01,.02,Treatment diff --git a/doc/docs/files/taxon.json.png b/doc/docs/files/taxon.json.png new file mode 100644 index 0000000..b21be72 Binary files /dev/null and b/doc/docs/files/taxon.json.png differ diff --git a/doc/docs/tutorial.md b/doc/docs/tutorial.md new file mode 100644 index 0000000..940abc1 --- /dev/null +++ b/doc/docs/tutorial.md @@ -0,0 +1,52 @@ +# Tutorial + +## Extract all sequences of certain taxa from the nr database + +### Dataset + +- [prot.accession2taxid.gz](ftp://ftp.ncbi.nih.gov/pub/taxonomy/accession2taxid/prot.accession2taxid.gz) + +### Steps + +Taking bacteria for example. + +1. Getting all taxids of bacteria (taxid 2): + + $ taxonkit list --nodes nodes.dmp --ids 2 --indent "" > bacteria.taxid.txt + + It takes only 2.5s! Number of taxids: + + $ wc -l bacteria.taxid.txt + 454591 bacteria.taxid.txt + +2. Extracting accessions with [csvtk](http://bioinf.shenwei.me/csvtk/download/): + + $ csvtk -t grep -f taxid -P bacteria.taxid.txt prot.accession2taxid.gz | csvtk -t cut -f accession.version > bacteria.taxid.acc.txt + +3. 
Extracting nr sequences: + + $ blastdbcmd -db nr -entry all -outfmt "%a\t%T" | \ + csvtk -t grep -f 1 -P bacteria.taxid.acc.txt | \ + csvtk -t cut -f 1 | \ + blastdbcmd -db nr -entry_batch - -out bacteria.fa + +
+ + diff --git a/doc/docs/usage.md b/doc/docs/usage.md index 5e52635..9ebe448 100644 --- a/doc/docs/usage.md +++ b/doc/docs/usage.md @@ -14,7 +14,7 @@ Usage ``` TaxonKit - NCBI Taxonomy Toolkit -Version: 0.1 +Version: 0.1.1 Author: Wei Shen @@ -45,7 +45,7 @@ Use "TaxonKit [command] --help" for more information about a command. Usage ``` -list taxon tree of given taxon IDs. +list taxon tree of given taxon IDs Usage: taxonkit list [flags] @@ -53,9 +53,9 @@ Usage: Flags: --ids string taxon ID(s), multiple IDs should be seperated by comma (default "1") --indent string indent (default " ") + --json output in JSON format. you can save the result in file with suffix ".json" and open with modern text editor --names string names.dmp file, when it given taxid will be followed by its scientific name --nodes string nodes.dmp file (default "nodes.dmp") - --show-rank show rank of the node ``` @@ -63,22 +63,55 @@ Examples 1. Default usage - $ taxonkit list --nodes nodes.dmp --ids 9605 + $ taxonkit list --nodes nodes.dmp --ids 9605,239934 9605 9606 63221 741158 1425170 -1. Removing indent. The list could be used to extract sequences from BLAST database with `blastdbcmd` - - $ taxonkit list --nodes nodes.dmp --ids 9605 --indent "" + 239934 + 239935 + 349741 + 512293 + 512294 + 1131822 + 1262691 + 1263034 + 1131336 + 1574264 + 1574265 + 1638783 + 1679444 + 1755639 + 1896967 + +1. Removing indent. The list could be used to extract sequences from BLAST database with `blastdbcmd` (see [tutorial](http://bioinf.shenwei.me/taxonkit/tutorial/)) + + $ taxonkit list --nodes nodes.dmp --ids 9605,239934 --indent "" 9605 9606 63221 741158 1425170 + 239934 + 239935 + 349741 + 512293 + 512294 + 1131822 + 1262691 + 1263034 + 1131336 + 1574264 + 1574265 + 1638783 + 1679444 + 1755639 + 1896967 + + **Performance:** Time and memory usage for whole taxon tree: $ # emptying the buffers cache @@ -90,13 +123,29 @@ Examples 1. 
Adding names - $ taxonkit list --nodes nodes.dmp --names names.dmp --ids 9605 + $ taxonkit list --nodes nodes.dmp --names names.dmp --ids 9605,239934 9605 [genus] Homo 9606 [species] Homo sapiens 63221 [subspecies] Homo sapiens neanderthalensis 741158 [subspecies] Homo sapiens ssp. Denisova 1425170 [species] Homo heidelbergensis + 239934 [genus] Akkermansia + 239935 [species] Akkermansia muciniphila + 349741 [no rank] Akkermansia muciniphila ATCC BAA-835 + 512293 [no rank] environmental samples + 512294 [species] uncultured Akkermansia sp. + 1131822 [species] uncultured Akkermansia sp. SMG25 + 1262691 [species] Akkermansia sp. CAG:344 + 1263034 [species] Akkermansia muciniphila CAG:154 + 1131336 [species] Akkermansia sp. KLE1605 + 1574264 [species] Akkermansia sp. KLE1797 + 1574265 [species] Akkermansia sp. KLE1798 + 1638783 [species] Akkermansia sp. UNK.MGS-1 + 1679444 [species] Akkermansia glycaniphila + 1755639 [species] Akkermansia sp. MC_55 + 1896967 [species] Akkermansia sp. 54_46 + **Performance:** Time and memory usage for whole taxon tree: $ # emptying the buffers cache @@ -106,8 +155,39 @@ Examples elapsed time: 9.825s peak rss: 648.65 MB - - +1. Output in JSON format, so you can easily collapse and expand the taxonomy tree in a modern text editor. + + $ taxonkit list --nodes nodes.dmp --names names.dmp --ids 9605,239934 --json + { + "9605 [genus] Homo": { + "9606 [species] Homo sapiens": { + "63221 [subspecies] Homo sapiens neanderthalensis": {}, + "741158 [subspecies] Homo sapiens ssp. Denisova": {} + } + "1425170 [species] Homo heidelbergensis": {} + }, + "239934 [genus] Akkermansia": { + "239935 [species] Akkermansia muciniphila": { + "349741 [no rank] Akkermansia muciniphila ATCC BAA-835": {} + } + "512293 [no rank] environmental samples": { + "512294 [species] uncultured Akkermansia sp.": {}, + "1131822 [species] uncultured Akkermansia sp. SMG25": {}, + "1262691 [species] Akkermansia sp. 
CAG:344": {}, + "1263034 [species] Akkermansia muciniphila CAG:154": {} + } + "1131336 [species] Akkermansia sp. KLE1605": {}, + "1574264 [species] Akkermansia sp. KLE1797": {}, + "1574265 [species] Akkermansia sp. KLE1798": {}, + "1638783 [species] Akkermansia sp. UNK.MGS-1": {}, + "1679444 [species] Akkermansia glycaniphila": {}, + "1755639 [species] Akkermansia sp. MC_55": {}, + "1896967 [species] Akkermansia sp. 54_46": {} + } + } + + Snapshot of taxonomy (taxid 1) in kate: + ![taxon.json.png](files/taxon.json.png)