From fd3d3c45a2982d750899ae169359caefb7e2530d Mon Sep 17 00:00:00 2001 From: Wei Shen Date: Fri, 2 Apr 2021 15:31:38 +0800 Subject: [PATCH] mark --- taxonkit/cmd/reformat.go | 199 ++++++++++++++++------- taxonkit/cmd/util-complex-data.go | 72 +++++++- taxonkit/cmd/{util-string.go => util.go} | 6 + 3 files changed, 217 insertions(+), 60 deletions(-) rename taxonkit/cmd/{util-string.go => util.go} (92%) diff --git a/taxonkit/cmd/reformat.go b/taxonkit/cmd/reformat.go index 5906342..c70efaa 100644 --- a/taxonkit/cmd/reformat.go +++ b/taxonkit/cmd/reformat.go @@ -74,7 +74,20 @@ column by flag "-t/--show-lineage-taxids". iblank := getFlagString(cmd, "miss-taxid-repl") fill := getFlagBool(cmd, "fill-miss-rank") pseudoStrain := getFlagBool(cmd, "pseudo-strain") - field := getFlagPositiveInt(cmd, "lineage-field") - 1 + + taxIdField := getFlagNonNegativeInt(cmd, "taxid-field") + field := getFlagPositiveInt(cmd, "lineage-field") + + var parsingTaxId bool + if taxIdField > 0 { + log.Infof("parsing TaxIds from field %d", taxIdField) + parsingTaxId = true + taxIdField-- + } else if field > 0 { + log.Infof("parsing complete lineages from field %d", field) + field-- + } + printLineageInTaxid := getFlagBool(cmd, "show-lineage-taxids") addPrefix := getFlagBool(cmd, "add-prefix") @@ -138,7 +151,30 @@ column by flag "-t/--show-lineage-taxids". 
checkError(err) defer outfh.Close() - taxid2taxon, name2parent2taxid, name2taxid, ambigous := getName2Parent2Taxid(config) + // -------------------------------------------------------- + // load data + + // for parsing lineage + var taxid2taxon map[uint32]*Taxon + var name2parent2taxid map[string]map[string]uint32 + var name2taxid map[string]uint32 + var ambigous map[string][]uint32 + + // for parsing taxid + var tree0 map[uint32]uint32 + var ranks0 map[uint32]string + var names0 map[uint32]string + var delnodes0 map[uint32]struct{} + var merged0 map[uint32]uint32 + + if parsingTaxId { + tree0, ranks0, names0, delnodes0, merged0 = loadData(config, true, true) + } else { + taxid2taxon, name2parent2taxid, name2taxid, ambigous = getName2Parent2Taxid(config) + } + + // -------------------------------------------------------- + type line2flineage struct { line string flineage string @@ -162,27 +198,21 @@ column by flag "-t/--show-lineage-taxids". return nil, false, nil } data := strings.Split(line, "\t") - if len(data) < field+1 { + + if parsingTaxId { + if len(data) < taxIdField+1 { + return nil, false, fmt.Errorf("taxid-field (%d) out of range (%d):%s", taxIdField+1, len(data), line) + } + } else if len(data) < field+1 { return nil, false, fmt.Errorf("lineage-field (%d) out of range (%d):%s", field+1, len(data), line) } - // names - names := strings.Split(data[field], delimiter) // all names of full lineage - - // ranks := make([]string, len(names)) - ranks := poolStrings.Get().([]string) - - // sranks := make([]string, len(names)) - sranks := poolStrings.Get().([]string) + // ----------------------------------------------- var rank, srank string // lower case of name : name var lname, plname string // lower case of name, rank and it's one-letter symbol var ok bool - name2Name := make(map[string]string, len(names)) // lower case of name of parent - - srank2idx := make(map[string]int) // srank: index - // preprare replacements. 
// find the orphan names and missing ranks replacements := make(map[string]string, len(matches)) @@ -199,7 +229,40 @@ column by flag "-t/--show-lineage-taxids". } } + // ----------------------------------------------- + var taxid uint32 + var taxidInt int + + var names []string + var ranks []string + var taxids []uint32 + + if parsingTaxId { + taxidInt, err = strconv.Atoi(data[taxIdField]) + if err != nil || taxidInt < 0 { + checkError(fmt.Errorf("invalid TaxId: %s", data[taxIdField])) + } + taxid = uint32(taxidInt) + names, ranks, taxids, ok = namesRanksTaxids(tree0, ranks0, names0, delnodes0, merged0, taxid) + if !ok { + return line2flineage{line, "", ""}, false, nil + } + } else { + // names + names = strings.Split(data[field], delimiter) // all names of full lineage + + // ranks := make([]string, len(names)) + ranks = poolStrings.Get().([]string) + } + + // sranks := make([]string, len(names)) + sranks := poolStrings.Get().([]string) + + name2Name := make(map[string]string, len(names)) // lower case of name of parent + + srank2idx := make(map[string]int) // srank: index + var maxRankWeight float32 var pair string @@ -213,68 +276,78 @@ column by flag "-t/--show-lineage-taxids". name2Name[lname] = name name = lname - if _, ok = name2taxid[name]; !ok { // unofficial name - log.Warningf(`unofficial taxon name detected: %s. 
Possible reasons: 1) lineages were produced with different taxonomy data files, please re-run taxonkit lineage; 2) some taxon names contain semicolon (";"), please re-run taxonkit lineage and taxonkit reformat with different flag value of -d, e.g., -d /`, name) - return line2flineage{line, "", ""}, true, nil - } + // ----------------------------------------------- - if i == 0 { // root node - taxid = name2taxid[name] + if parsingTaxId { + rank = ranks[i] + taxid = taxids[i] } else { - plname = strings.ToLower(names[i-1]) - if _, ok = name2parent2taxid[name]; !ok { + if _, ok = name2taxid[name]; !ok { // unofficial name log.Warningf(`unofficial taxon name detected: %s. Possible reasons: 1) lineages were produced with different taxonomy data files, please re-run taxonkit lineage; 2) some taxon names contain semicolon (";"), please re-run taxonkit lineage and taxonkit reformat with different flag value of -d, e.g., -d /`, name) return line2flineage{line, "", ""}, true, nil } - if taxid, ok = name2parent2taxid[name][plname]; !ok { - log.Warningf(`unofficial taxon name detected: %s. Possible reasons: 1) lineages were produced with different taxonomy data files, please re-run taxonkit lineage; 2) some taxon names contain semicolon (";"), please re-run taxonkit lineage and taxonkit reformat with different flag value of -d, e.g., -d /`, plname) - return line2flineage{line, "", ""}, true, nil - } - // for cases where child-parent pairs are shared by multiple taxids. - pair = name + plname + if i == 0 { // root node + taxid = name2taxid[name] + } else { + plname = strings.ToLower(names[i-1]) + if _, ok = name2parent2taxid[name]; !ok { + log.Warningf(`unofficial taxon name detected: %s. 
Possible reasons: 1) lineages were produced with different taxonomy data files, please re-run taxonkit lineage; 2) some taxon names contain semicolon (";"), please re-run taxonkit lineage and taxonkit reformat with different flag value of -d, e.g., -d /`, name) + return line2flineage{line, "", ""}, true, nil + } + if taxid, ok = name2parent2taxid[name][plname]; !ok { + log.Warningf(`unofficial taxon name detected: %s. Possible reasons: 1) lineages were produced with different taxonomy data files, please re-run taxonkit lineage; 2) some taxon names contain semicolon (";"), please re-run taxonkit lineage and taxonkit reformat with different flag value of -d, e.g., -d /`, plname) + return line2flineage{line, "", ""}, true, nil + } + + // for cases where child-parent pairs are shared by multiple taxids. + pair = name + "__" + plname - if _ambids, ok = ambigous[pair]; ok { + if _ambids, ok = ambigous[pair]; ok { - var _lineage string - lineage0 := data[field] - var _taxids2 []uint32 - var _taxid uint32 + var _lineage string + lineage0 := strings.Join(names[:i], delimiter) + var _taxids2 []uint32 + var _taxid uint32 - _taxids2 = make([]uint32, 0, 2) // possible taxids + _taxids2 = make([]uint32, 0, 2) // possible taxids - for _, _taxid = range _ambids { - _lineage = lineageFromTaxid2Taxon(taxid2taxon, _taxid, delimiter) - if _lineage == lineage0 { - _taxids2 = append(_taxids2, _taxid) + for _, _taxid = range _ambids { + _lineage = lineageFromTaxid2Taxon(taxid2taxon, _taxid, delimiter) + if _lineage == lineage0 { + _taxids2 = append(_taxids2, _taxid) + } } - } - switch len(_taxids2) { // cool - case 0: - log.Warningf("it's a bug, please report: %s", pair) - case 1: // we correct it - taxid = _taxid - default: - tmp := make([]string, len(_taxids2)) - for _i, _taxid := range _taxids2 { - tmp[_i] = strconv.Itoa(int(_taxid)) + switch len(_taxids2) { // cool + case 0: + log.Warningf("it's a bug, please report: '%s'. 
%s", pair, line) + case 1: // we correct it + taxid = _taxid + default: + tmp := make([]string, len(_taxids2)) + for _i, _taxid := range _taxids2 { + tmp[_i] = strconv.Itoa(int(_taxid)) + } + log.Warningf("we can't distinguish the TaxId (%s) for lineage: %s", + strings.Join(tmp, ", "), lineage0) } - log.Warningf("we can't distinguish the TaxId (%s) for lineage: %s", - strings.Join(tmp, ", "), lineage0) } } - } - // note that code below is computing rank of current name, not its parent. - rank = taxid2taxon[taxid].Rank + // note that code below is computing rank of current name, not its parent. + rank = taxid2taxon[taxid].Rank + + if rank == norank { + ranks = append(ranks, rank) + sranks = append(sranks, "") + continue + } - if rank == norank { + // ranks[i] = rank ranks = append(ranks, rank) - sranks = append(sranks, "") - continue } - // ranks[i] = rank - ranks = append(ranks, rank) + // ----------------------------------------------- + if srank, ok = rank2symbol[rank]; ok { // special symbol "{t}" switch rank { @@ -379,6 +452,13 @@ column by flag "-t/--show-lineage-taxids". sranks = sranks[:0] poolStrings.Put(sranks) + if parsingTaxId { + names = names[:0] + poolStrings.Put(names) + taxids = taxids[:0] + poolUint32.Put(taxids) + } + return line2flineage{line, unescape(flineage), unescape(iflineage)}, true, nil } @@ -421,6 +501,7 @@ func init() { flineageCmd.Flags().BoolP("pseudo-strain", "S", false, `use the node with lowest rank as strain name, only if which rank is lower than "species" and not "subpecies" nor "strain". It affects {t}, {S}, {T}. This flag needs flag -F`) flineageCmd.Flags().IntP("lineage-field", "i", 2, "field index of lineage. data should be tab-separated") + flineageCmd.Flags().IntP("taxid-field", "I", 0, "field index of taxid. 
input data should be tab-separated") flineageCmd.Flags().BoolP("show-lineage-taxids", "t", false, `show corresponding taxids of reformated lineage`) flineageCmd.Flags().BoolP("add-prefix", "P", false, `add prefixes for all ranks, single prefix for a rank is defined by flag --prefix-X`) diff --git a/taxonkit/cmd/util-complex-data.go b/taxonkit/cmd/util-complex-data.go index 198023a..36d4ea6 100644 --- a/taxonkit/cmd/util-complex-data.go +++ b/taxonkit/cmd/util-complex-data.go @@ -255,7 +255,7 @@ func getName2Parent2Taxid(config Config) ( } else { if _, ok = _n2i[pname]; ok { // log.Warningf("ambigous name pair: (%s, %s). TaxIds: %d, %d", _name, taxid2name[taxid2taxon[taxid].Parent], _n2i[pname], taxid) - pair = name + pname + pair = name + "__" + pname if _, ok = ambigous[pair]; !ok { ambigous[pair] = []uint32{_n2i[pname], taxid} } else { @@ -298,3 +298,73 @@ func lineageFromTaxid2Taxon(taxid2taxon map[uint32]*Taxon, id uint32, delimiter return strings.Join(stringutil.ReverseStringSlice(lineage), delimiter) } + +var poolStrings = &sync.Pool{New: func() interface{} { + return make([]string, 0, 16) +}} + +var poolUint32 = &sync.Pool{New: func() interface{} { + return make([]uint32, 0, 16) +}} + +// only for reformat. 
+// remember to recycle return values +func namesRanksTaxids( + tree map[uint32]uint32, + ranks map[uint32]string, + names map[uint32]string, + delnodes map[uint32]struct{}, + merged map[uint32]uint32, + id uint32, +) ([]string, []string, []uint32, bool) { + + lineage := poolStrings.Get().([]string) + lineageInRank := poolStrings.Get().([]string) + lineageInTaxid := poolUint32.Get().([]uint32) + + var child, parent, newtaxid uint32 + var ok bool + child = id + var notFound bool + for { + parent, ok = tree[child] + if !ok { // taxid not found + // check if it was deleted + if _, ok = delnodes[child]; ok { + // log + log.Warningf("taxid %d was deleted", child) + id = 0 + break + } + // check if it was merged + if newtaxid, ok = merged[child]; ok { + // log + log.Warningf("taxid %d was merged into %d", child, newtaxid) + child = newtaxid + parent = tree[child] + id = child + } else { + id = 0 + log.Warningf("taxid %d not found", child) + notFound = true + break + } + } + + lineage = append(lineage, names[child]) + lineageInRank = append(lineageInRank, ranks[child]) + lineageInTaxid = append(lineageInTaxid, child) + + if parent == 1 { + break + } + + child = parent + } + + stringutil.ReverseStringSliceInplace(lineage) + stringutil.ReverseStringSliceInplace(lineageInRank) + reverseUint32s(lineageInTaxid) + + return lineage, lineageInRank, lineageInTaxid, !notFound +} diff --git a/taxonkit/cmd/util-string.go b/taxonkit/cmd/util.go similarity index 92% rename from taxonkit/cmd/util-string.go rename to taxonkit/cmd/util.go index f988e17..b524a1d 100644 --- a/taxonkit/cmd/util-string.go +++ b/taxonkit/cmd/util.go @@ -45,3 +45,9 @@ func stringSplitN(s string, sep string, n int, a *[]string) { (*a) = (*a)[:i+1] } + +func reverseUint32s(s []uint32) { + for i, j := 0, len(s)-1; i < j; i, j = i+1, j-1 { + s[i], s[j] = s[j], s[i] + } +}