diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs new file mode 100644 index 0000000000..789d44ae6f --- /dev/null +++ b/.git-blame-ignore-revs @@ -0,0 +1,2 @@ +# Pre-commit updates (#2427) +fee62922d8857ce93f1d4e90fd7240629d606997 diff --git a/.github/workflows/dev_envs.yml b/.github/workflows/dev_envs.yml index b6fa73ec7f..c3771289ac 100644 --- a/.github/workflows/dev_envs.yml +++ b/.github/workflows/dev_envs.yml @@ -5,25 +5,23 @@ on: branches: [latest] jobs: nix: - runs-on: ubuntu-latest + runs-on: ${{ matrix.os }} + strategy: + matrix: + os: [ubuntu-latest, macos-14] steps: - uses: actions/checkout@v4 with: fetch-depth: 0 - - uses: cachix/install-nix-action@v25 - with: - extra_nix_config: | - access-tokens = github.com=${{ secrets.GITHUB_TOKEN }} - - - uses: cachix/cachix-action@v14 - with: - name: sourmash-bio - authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}' + - name: Install Nix + uses: DeterminateSystems/nix-installer-action@v4 + - name: Run the Magic Nix Cache + uses: DeterminateSystems/magic-nix-cache-action@v1 - run: nix run .# -- --version - - run: nix-shell --command "tox -e py310" + - run: nix develop --command bash -c "tox -e py310" mamba: runs-on: ubuntu-latest diff --git a/Cargo.lock b/Cargo.lock index 6913ea4907..b5af6d3875 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1496,7 +1496,7 @@ checksum = "9f1341053f34bb13b5e9590afb7d94b48b48d4b87467ec28e3c238693bb553de" [[package]] name = "sourmash" -version = "0.12.0" +version = "0.12.1" dependencies = [ "az", "byteorder", diff --git a/doc/command-line.md b/doc/command-line.md index 14e875a9d6..7b9cbe2615 100644 --- a/doc/command-line.md +++ b/doc/command-line.md @@ -551,8 +551,8 @@ The sourmash `tax` or `taxonomy` commands integrate taxonomic `gather` command (we cannot combine separate `gather` runs for the same query). For supported databases (e.g. GTDB, NCBI), we provide taxonomy csv files, but they can also be generated for user-generated - databases. As of v4.8, some sourmash taxonomy commands can also use `LIN` - lineage information. For more information, see [databases](databases.md). + databases. As of v4.8 and 4.8.6, respectively, some sourmash taxonomy + commands can also use `LIN` or `ICTV` lineage information. `tax` commands rely upon the fact that `gather` provides both the total fraction of the query matched to each database matched, as well as a diff --git a/flake.lock b/flake.lock index 3f5dc6569a..ad45360618 100644 --- a/flake.lock +++ b/flake.lock @@ -2,11 +2,11 @@ "nodes": { "nixpkgs": { "locked": { - "lastModified": 1701589523, - "narHash": "sha256-7LK019+Y9khM18WjIt4ISK2yd1P5z+CXJq0ts+E13UA=", + "lastModified": 1706925685, + "narHash": "sha256-hVInjWMmgH4yZgA4ZtbgJM1qEAel72SYhP5nOWX4UIM=", "owner": "NixOS", "repo": "nixpkgs", - "rev": "ec04772e7516b6d58d98b491e68b329b7558b14d", + "rev": "79a13f1437e149dc7be2d1290c74d378dad60814", "type": "github" }, "original": { @@ -33,11 +33,11 @@ ] }, "locked": { - "lastModified": 1701569797, - "narHash": "sha256-ObvQFAPpC5IVbI2GHedSTQVzYxht2qhBgHHQnh3mYTs=", + "lastModified": 1707099356, + "narHash": "sha256-ph483MDKLi9I/gndYOieVP41es633DOOmPjEI50x5KU=", "owner": "oxalica", "repo": "rust-overlay", - "rev": "516c9477757b628b157780d96d84e8c82b46dc99", + "rev": "61dfa5a8129f7edbe9150253c68f673f87b16fb1", "type": "github" }, "original": { @@ -66,11 +66,11 @@ "systems": "systems" }, "locked": { - "lastModified": 1694529238, - "narHash": "sha256-zsNZZGTGnMOf9YpHKJqMSsa0dXbfmxeoJ7xHlrt+xmY=", + "lastModified": 1705309234, + "narHash": "sha256-uNRRNRKmJyCRC/8y1RqBkqWBLM034y4qN7EprSdmgyA=", "owner": "numtide", "repo": "flake-utils", - "rev": "ff7b65b44d01cf9ba6a71320833626af21126384", + "rev": "1ef2e671c3b0c19053962c07dbda38332dcebf26", "type": "github" }, "original": { diff --git a/flake.nix b/flake.nix index 921d29def7..6739271d50 100644 --- a/flake.nix +++ b/flake.nix @@ -30,40 +30,55 @@ rustc = rustVersion; }; + inherit (pkgs) lib; + python = pkgs.python311Packages; + stdenv = if pkgs.stdenv.isDarwin then pkgs.overrideSDK pkgs.stdenv "11.0" else pkgs.stdenv; + + commonArgs = { + src = ./.; + stdenv = stdenv; + preConfigure = lib.optionalString stdenv.isDarwin '' + export MACOSX_DEPLOYMENT_TARGET=10.14 + ''; + + # Work around https://github.com/NixOS/nixpkgs/issues/166205. + env = lib.optionalAttrs stdenv.cc.isClang { + NIX_LDFLAGS = "-l${stdenv.cc.libcxx.cxxabi.libName}"; + }; + + buildInputs = lib.optionals stdenv.isDarwin [ pkgs.libiconv pkgs.darwin.apple_sdk.frameworks.Security ]; + + nativeBuildInputs = with rustPlatform; [ cargoSetupHook maturinBuildHook bindgenHook ]; + }; + in with pkgs; { packages = { - lib = rustPlatform.buildRustPackage { + lib = rustPlatform.buildRustPackage ( commonArgs // { name = "libsourmash"; - src = lib.cleanSource ./.; copyLibs = true; cargoLock.lockFile = ./Cargo.lock; nativeBuildInputs = with rustPlatform; [ bindgenHook ]; - }; + }); - sourmash = python.buildPythonPackage rec { + sourmash = python.buildPythonPackage ( commonArgs // rec { pname = "sourmash"; version = "4.8.6"; format = "pyproject"; - src = ./.; - cargoDeps = rustPlatform.importCargoLock { lockFile = ./Cargo.lock; }; - nativeBuildInputs = with rustPlatform; [ cargoSetupHook maturinBuildHook bindgenHook ]; - - buildInputs = lib.optionals stdenv.isDarwin [ libiconv ]; propagatedBuildInputs = with python; [ cffi deprecation cachetools bitstring numpy scipy matplotlib screed ]; DYLD_LIBRARY_PATH = "${self.packages.${system}.lib}/lib"; - }; + }); docker = let @@ -83,7 +98,7 @@ defaultPackage = self.packages.${system}.sourmash; - devShell = mkShell { + devShells.default = pkgs.mkShell.override { stdenv = stdenv; } (commonArgs // { nativeBuildInputs = with rustPlatform; [ bindgenHook ]; buildInputs = [ @@ -113,15 +128,19 @@ cargo-outdated cargo-udeps cargo-deny - cargo-semver-checks + #cargo-semver-checks nixpkgs-fmt ]; + shellHook = '' + export MACOSX_DEPLOYMENT_TARGET=10.14 + ''; + # Needed for matplotlib LD_LIBRARY_PATH = lib.makeLibraryPath [ pkgs.stdenv.cc.cc.lib ]; # workaround for https://github.com/NixOS/nixpkgs/blob/48dfc9fa97d762bce28cc8372a2dd3805d14c633/doc/languages-frameworks/python.section.md#python-setuppy-bdist_wheel-cannot-create-whl SOURCE_DATE_EPOCH = 315532800; # 1980 - }; + }); }); } diff --git a/src/core/Cargo.toml b/src/core/Cargo.toml index 6a65d1b859..9700d2b67a 100644 --- a/src/core/Cargo.toml +++ b/src/core/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "sourmash" -version = "0.12.0" +version = "0.12.1" authors = ["Luiz Irber "] description = "MinHash sketches for genomic data" repository = "https://github.com/sourmash-bio/sourmash" diff --git a/src/sourmash/cli/tax/annotate.py b/src/sourmash/cli/tax/annotate.py index 7541440fc2..f30318ad0a 100644 --- a/src/sourmash/cli/tax/annotate.py +++ b/src/sourmash/cli/tax/annotate.py @@ -78,6 +78,13 @@ def subparser(subparsers): default=False, help="use LIN taxonomy in place of standard taxonomic ranks. Note that the taxonomy CSV must contain LIN lineage information.", ) + subparser.add_argument( + "--ictv", + "--ictv-taxonomy", + action="store_true", + default=False, + help="use ICTV taxonomy in place of standard taxonomic ranks. Note that the taxonomy CSV must contain ICTV ranks.", + ) def main(args): diff --git a/src/sourmash/cli/tax/genome.py b/src/sourmash/cli/tax/genome.py index b9712658a4..cd72499c31 100644 --- a/src/sourmash/cli/tax/genome.py +++ b/src/sourmash/cli/tax/genome.py @@ -124,6 +124,13 @@ def subparser(subparsers): default=None, help="CSV containing 'name', 'lin' columns, where 'lin' is the lingroup prefix. Will restrict classification to these groups.", ) + subparser.add_argument( + "--ictv", + "--ictv-taxonomy", + action="store_true", + default=False, + help="use ICTV taxonomy in place of standard taxonomic ranks. Note that the taxonomy CSV must contain ICTV ranks.", + ) add_tax_threshold_arg(subparser, 0.1) add_rank_arg(subparser) diff --git a/src/sourmash/cli/tax/metagenome.py b/src/sourmash/cli/tax/metagenome.py index 563c6c3d81..f8e31902e8 100644 --- a/src/sourmash/cli/tax/metagenome.py +++ b/src/sourmash/cli/tax/metagenome.py @@ -116,6 +116,13 @@ def subparser(subparsers): default=None, help="CSV containing 'name', 'lin' columns, where 'lin' is the lingroup prefix. Will produce a 'lingroup' report containing taxonomic summarization for each group.", ) + subparser.add_argument( + "--ictv", + "--ictv-taxonomy", + action="store_true", + default=False, + help="use ICTV taxonomy in place of standard taxonomic ranks. Note that the taxonomy CSV must contain ICTV ranks.", + ) add_rank_arg(subparser) diff --git a/src/sourmash/cli/tax/summarize.py b/src/sourmash/cli/tax/summarize.py index d430677b8f..76236f19ae 100644 --- a/src/sourmash/cli/tax/summarize.py +++ b/src/sourmash/cli/tax/summarize.py @@ -57,6 +57,13 @@ def subparser(subparsers): default=False, help="use LIN taxonomy in place of standard taxonomic ranks.", ) + subparser.add_argument( + "--ictv", + "--ictv-taxonomy", + action="store_true", + default=False, + help="use ICTV taxonomy in place of standard taxonomic ranks. Note that the taxonomy CSV must contain ICTV ranks.", + ) def main(args): diff --git a/src/sourmash/tax/__main__.py b/src/sourmash/tax/__main__.py index 8e490ae545..073977cb79 100644 --- a/src/sourmash/tax/__main__.py +++ b/src/sourmash/tax/__main__.py @@ -18,6 +18,7 @@ RankLineageInfo, LINLineageInfo, AnnotateTaxResult, + ICTVRankLineageInfo, ) usage = """ @@ -82,6 +83,7 @@ def metagenome(args): keep_identifier_versions=args.keep_identifier_versions, force=args.force, lins=args.lins, + ictv=args.ictv, ) available_ranks = tax_assign.available_ranks except ValueError as exc: @@ -113,6 +115,7 @@ def metagenome(args): keep_full_identifiers=args.keep_full_identifiers, keep_identifier_versions=args.keep_identifier_versions, lins=args.lins, + ictv=args.ictv, ) except ValueError as exc: error(f"ERROR: {str(exc)}") @@ -258,6 +261,7 @@ def genome(args): keep_identifier_versions=args.keep_identifier_versions, force=args.force, lins=args.lins, + ictv=args.ictv, ) available_ranks = tax_assign.available_ranks @@ -297,6 +301,7 @@ def genome(args): keep_full_identifiers=args.keep_full_identifiers, keep_identifier_versions=args.keep_identifier_versions, lins=args.lins, + ictv=args.ictv, ) except ValueError as exc: @@ -402,6 +407,7 @@ def annotate(args): keep_identifier_versions=args.keep_identifier_versions, force=args.force, lins=args.lins, + ictv=args.ictv, ) except ValueError as exc: @@ -466,6 +472,7 @@ def annotate(args): raw=row, id_col=id_col, lins=args.lins, + ictv=args.ictv, keep_full_identifiers=args.keep_full_identifiers, keep_identifier_versions=args.keep_identifier_versions, ) @@ -591,6 +598,7 @@ def summarize(args): keep_full_identifiers=args.keep_full_identifiers, keep_identifier_versions=args.keep_identifier_versions, lins=args.lins, + ictv=args.ictv, ) except ValueError as exc: error("ERROR while loading taxonomies!") @@ -637,6 +645,8 @@ def summarize(args): rank = lineage[-1].rank if args.lins: inf = LINLineageInfo(lineage=lineage) + elif args.ictv: + inf = ICTVRankLineageInfo(lineage=lineage) else: inf = RankLineageInfo(lineage=lineage) lin = inf.display_lineage() diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py index d1827c2aad..55feed66d2 100644 --- a/src/sourmash/tax/tax_utils.py +++ b/src/sourmash/tax/tax_utils.py @@ -48,6 +48,36 @@ "unclassified": "U", } +ICTV_RANKS = ( + "realm", + "subrealm", + "kingdom", + "subkingdom", + "phylum", + "subphylum", + "class", + "subclass", + "order", + "suborder", + "family", + "subfamily", + "genus", + "subgenus", + "species", + "name", +) + +NCBI_RANKS = ( + "superkingdom", + "phylum", + "class", + "order", + "family", + "genus", + "species", + "strain", +) + class LineagePair(NamedTuple): rank: str @@ -314,7 +344,7 @@ def find_lca(self, other): @dataclass(frozen=True, order=True) class RankLineageInfo(BaseLineageInfo): """ - This RankLineageInfo class usees the BaseLineageInfo methods for a standard set + This RankLineageInfo class uses the BaseLineageInfo methods for a standard set of taxonomic ranks. Inputs: @@ -332,16 +362,7 @@ class RankLineageInfo(BaseLineageInfo): and will not be used or compared in any other class methods. """ - ranks: tuple = ( - "superkingdom", - "phylum", - "class", - "order", - "family", - "genus", - "species", - "strain", - ) + ranks: tuple = NCBI_RANKS lineage_dict: dict = field(default=None, compare=False) # dict of rank: name def __post_init__(self): @@ -408,6 +429,79 @@ def _init_from_lineage_dict(self): object.__setattr__(self, "filled_ranks", tuple(filled_ranks)) +@dataclass(frozen=True, order=True) +class ICTVRankLineageInfo(RankLineageInfo): + """ + This ICTV RankLineageInfo class uses the RankLineageInfo methods but uses + the 15-rank ICTV taxonomy. It also allows for a 'name' column in the taxonomy, + which reflects that virus name is sometimes used as a sub-species rank. + + Inputs: + optional: + ranks: tuple or list of hierarchical ranks + default: ('realm','subrealm','kingdom','subkingdom','phylum','subphylum', + 'class','subclass','order','suborder','family','subfamily', + 'genus','subgenus','species','name') + lineage: tuple or list of LineagePair + lineage_str: `;`- or `,`-separated string of names + lineage_dict: dictionary of {rank: name} + + If no inputs are provided, result will be ICTVRankLineageInfo with + default ranks and no lineage names. + + Input lineage information is only used for initialization of the final `lineage` + and will not be used or compared in any other class methods. + """ + + ranks: tuple = ICTV_RANKS + lineage_dict: dict = field(default=None, compare=False) # dict of rank: name + + def __post_init__(self): + "Initialize according to passed values" + object.__setattr__(self, "ranks", ICTV_RANKS) + if self.lineage is not None: + self._init_from_lineage_tuples() + elif self.lineage_str is not None: + self._init_from_lineage_str() + elif self.lineage_dict is not None: + self._init_from_lineage_dict() + elif self.ranks: + self._init_empty() + + def _init_from_lineage_dict(self): + """ + Initialize from lineage dict, e.g. from lineages csv. + Allows empty ranks/extra columns and reordering if necessary + """ + null_names = set(["[Blank]", "na", "null", "NA", ""]) + if not isinstance(self.lineage_dict, (dict)): + raise ValueError(f"{self.lineage_dict} is not dictionary") + new_lineage = [] + # build empty lineage and taxpath + for rank in self.ranks: + new_lineage.append(LineagePair(rank=rank)) + + # now add rank information in correct spots. This corrects for order and allows empty ranks and extra dict keys + for key, val in self.lineage_dict.items(): + name = None + try: + rank, name = key, val + rank_idx = self.rank_index(rank) + except ValueError: + continue # ignore dictionary entries (columns) that don't match a rank + + # filter null + if name is not None and name.strip() in null_names: + name = None + new_lineage[rank_idx] = LineagePair(rank=rank, name=name) + + # build list of filled ranks + filled_ranks = [a.rank for a in new_lineage if a.name] + # set lineage and filled_ranks + object.__setattr__(self, "lineage", tuple(new_lineage)) + object.__setattr__(self, "filled_ranks", tuple(filled_ranks)) + + @dataclass(frozen=True, order=True) class LINLineageInfo(BaseLineageInfo): """ @@ -557,7 +651,10 @@ def __post_init__(self): self.add_lineages(self.assignments) def add_lineage(self, lineage): - if isinstance(lineage, BaseLineageInfo | RankLineageInfo | LINLineageInfo): + if isinstance( + lineage, + BaseLineageInfo | RankLineageInfo | LINLineageInfo | ICTVRankLineageInfo, + ): lineage = lineage.filled_lineage node = self.tree for lineage_tup in lineage: @@ -724,6 +821,7 @@ def load_gather_results( keep_full_identifiers=False, keep_identifier_versions=False, lins=False, + ictv=False, ): "Load a single gather csv" if not seen_queries: @@ -761,6 +859,7 @@ def load_gather_results( keep_full_identifiers=keep_full_identifiers, keep_identifier_versions=keep_identifier_versions, lins=lins, + ictv=ictv, ) taxres.get_match_lineage( tax_assignments=tax_assignments, @@ -771,7 +870,8 @@ def load_gather_results( if not this_querytaxres or not this_querytaxres.is_compatible(taxres): # get existing or initialize new this_querytaxres = gather_results.get( - gatherRow.query_name, QueryTaxResult(taxres.query_info, lins=lins) + gatherRow.query_name, + QueryTaxResult(taxres.query_info, lins=lins, ictv=ictv), ) this_querytaxres.add_taxresult(taxres) gather_results[gatherRow.query_name] = this_querytaxres @@ -795,6 +895,7 @@ def check_and_load_gather_csvs( keep_full_identifiers=False, keep_identifier_versions=False, lins=False, + ictv=False, ): """ Load gather csvs, checking for empties and ids missing from taxonomic assignments. @@ -816,6 +917,7 @@ def check_and_load_gather_csvs( keep_identifier_versions=keep_identifier_versions, fail_on_missing_taxonomy=fail_on_missing_taxonomy, lins=lins, + ictv=ictv, ) except ValueError as exc: if force: @@ -1134,6 +1236,7 @@ def load( keep_full_identifiers=False, keep_identifier_versions=True, lins=False, + ictv=False, ): """ Load a taxonomy assignment CSV file into a LineageDB. @@ -1156,7 +1259,7 @@ def load( if os.path.isdir(filename): raise ValueError(f"'{filename}' is a directory") - with sourmash_args.FileInputCSV(filename) as r: + with sourmash_args.FileInputCSV(filename, delimiter=",") as r: header = r.fieldnames if not header: raise ValueError(f"cannot read taxonomy assignments from {filename}") @@ -1173,7 +1276,7 @@ def load( header = ["ident" if "accession" == x else x for x in header] elif "name" in header and "lineage" in header: return cls.load_from_gather_with_lineages( - filename, force=force, lins=lins + filename, force=force, lins=lins, ictv=ictv ) else: header_str = ",".join([repr(x) for x in header]) @@ -1181,12 +1284,21 @@ def load( f"No taxonomic identifiers found; headers are {header_str}" ) - if lins and "lin" not in header: - raise ValueError( - f"'lin' column not found: cannot read LIN taxonomy assignments from {filename}." - ) + if lins: + notify("Trying to read LIN taxonomy assignments.") + if "lin" not in header: + raise ValueError( + f"'lin' column not found: cannot read LIN taxonomy assignments from {filename}." + ) + + if ictv: + notify("Trying to read ICTV taxonomy assignments.") + # check that all ranks are in header + ranks = list(ICTVRankLineageInfo().taxlist) + if not set(ranks).issubset(header): + raise ValueError("Not all taxonomy ranks present") - if not lins: + if not lins and not ictv: # is "strain" an available rank? if "strain" in header: include_strain = True @@ -1217,10 +1329,12 @@ def load( "For taxonomic summarization, all LIN assignments must use the same number of LIN positions." ) else: - n_pos = ( - lineageInfo.n_lin_positions - ) # set n_pos with first entry + # set n_pos with first entry + n_pos = lineageInfo.n_lin_positions ranks = lineageInfo.ranks + elif ictv: + # read lineage from row dictionary + lineageInfo = ICTVRankLineageInfo(lineage_dict=row) else: # read lineage from row dictionary lineageInfo = RankLineageInfo(lineage_dict=row) @@ -1247,7 +1361,7 @@ def load( else: assignments[ident] = lineage - if not lins: + if not lins and not ictv: if lineage[-1].rank == "species": n_species += 1 elif lineage[-1].rank == "strain": @@ -1257,7 +1371,9 @@ def load( return LineageDB(assignments, ranks) @classmethod - def load_from_gather_with_lineages(cls, filename, *, force=False, lins=False): + def load_from_gather_with_lineages( + cls, filename, *, force=False, lins=False, ictv=False + ): """ Load an annotated gather-with-lineages CSV file produced by 'tax annotate' into a LineageDB. @@ -1294,6 +1410,8 @@ def load_from_gather_with_lineages(cls, filename, *, force=False, lins=False): if lins: lineageInfo = LINLineageInfo(lineage_str=row["lineage"]) + elif ictv: + lineageInfo = ICTVRankLineageInfo(lineage_str=row["lineage"]) else: lineageInfo = RankLineageInfo(lineage_str=row["lineage"]) @@ -1772,6 +1890,7 @@ class BaseTaxResult: missed_ident: bool = False match_lineage_attempted: bool = False lins: bool = False + ictv: bool = False def get_ident(self, id_col=None): # split identifiers = split on whitespace @@ -1799,6 +1918,8 @@ def get_match_lineage( if lin: if self.lins: self.lineageInfo = LINLineageInfo(lineage=lin) + elif self.ictv: + self.lineageInfo = ICTVRankLineageInfo(lineage=lin) else: self.lineageInfo = RankLineageInfo(lineage=lin) else: @@ -1858,7 +1979,7 @@ class TaxResult(BaseTaxResult): # get match lineage tax_res.get_match_lineage(taxD=taxonomic_assignments) - Use RankLineageInfo or LINLineageInfo to store lineage information. + Use RankLineageInfo, ICTVLineageInfo, or LINLineageInfo to store lineage information. """ raw: GatherRow @@ -1884,6 +2005,8 @@ def __post_init__(self): self.unique_intersect_bp = int(self.raw.unique_intersect_bp) if self.lins: self.lineageInfo = LINLineageInfo() + elif self.ictv: + self.lineageInfo = ICTVRankLineageInfo() else: self.lineageInfo = RankLineageInfo() @@ -2109,6 +2232,7 @@ class QueryTaxResult: query_info: QueryInfo # initialize with QueryInfo dataclass lins: bool = False + ictv: bool = False def __post_init__(self): self.query_name = self.query_info.query_name # for convenience @@ -2147,7 +2271,11 @@ def _init_classification_results(self): self.krona_header = [] def is_compatible(self, taxresult): - return taxresult.query_info == self.query_info and taxresult.lins == self.lins + return ( + taxresult.query_info == self.query_info + and taxresult.lins == self.lins + and taxresult.ictv == self.ictv + ) @property def ascending_ranks(self): @@ -2280,6 +2408,8 @@ def build_summarized_result(self, single_rank=None, force_resummarize=False): # record unclassified if self.lins: lineage = LINLineageInfo() + elif self.ictv: + lineage = ICTVRankLineageInfo() else: lineage = RankLineageInfo() query_ani = None diff --git a/tests/test-data/tax/test.ictv-taxonomy.csv b/tests/test-data/tax/test.ictv-taxonomy.csv new file mode 100644 index 0000000000..d99a76601c --- /dev/null +++ b/tests/test-data/tax/test.ictv-taxonomy.csv @@ -0,0 +1,8 @@ +ident,superkingdom,realm,subrealm,kingdom,subkingdom,phylum,subphylum,class,subclass,order,suborder,family,subfamily,genus,subgenus,species,name +GCF_000017325,Viruses,Riboviria,,Orthornavirae,,Negarnaviricota,Haploviricotina,Monjiviricetes,,Mononegavirales,,Filoviridae,,Orthoebolavirus,,Orthoebolavirus zairense,Ebola virus +GCF_000021665,Viruses,Riboviria,,Orthornavirae,,Negarnaviricota,Haploviricotina,Monjiviricetes,,Mononegavirales,,Filoviridae,,Orthoebolavirus,,Orthoebolavirus sudanense,Sudan virus +GCF_009494285,Viruses,Riboviria,,Orthornavirae,,Negarnaviricota,Haploviricotina,Monjiviricetes,,Mononegavirales,,Filoviridae,,Orthoebolavirus,,Orthoebolavirus taiense,Taï Forest virus +GCF_003471795,Viruses,Riboviria,,Orthornavirae,,Negarnaviricota,Haploviricotina,Monjiviricetes,,Mononegavirales,,Filoviridae,,Orthoebolavirus,,Orthoebolavirus restonense,Reston virus +GCF_001881345,Viruses,Riboviria,,Orthornavirae,,Negarnaviricota,Haploviricotina,Monjiviricetes,,Mononegavirales,,Filoviridae,,Orthoebolavirus,,Orthoebolavirus bundibugyoense,Bundibugyo virus +GCF_013368705,Viruses,Riboviria,,Orthornavirae,,Negarnaviricota,Haploviricotina,Monjiviricetes,,Mononegavirales,,Filoviridae,,Orthoebolavirus,,Orthoebolavirus bombaliense,Bombali virus +GCF_013311112,Viruses,Riboviria,,Orthornavirae,,Negarnaviricota,Haploviricotina,Monjiviricetes,,Mononegavirales,,Filoviridae,,other-genus,,other-sp,other test diff --git a/tests/test_tax.py b/tests/test_tax.py index 3f766f5e37..70b4f14fc0 100644 --- a/tests/test_tax.py +++ b/tests/test_tax.py @@ -1126,6 +1126,131 @@ def test_metagenome_bad_rank_krona(runtmp): ) +def test_metagenome_ictv(runtmp): + # test basic metagenome + c = runtmp + + g_csv = utils.get_test_data("tax/test1.gather.csv") + tax = utils.get_test_data("tax/test.ictv-taxonomy.csv") + + c.run_sourmash("tax", "metagenome", "-g", g_csv, "--taxonomy-csv", tax, "--ictv") + + print(c.last_result.status) + print(c.last_result.out) + print(c.last_result.err) + + assert c.last_result.status == 0 + print(c.last_result.out) + assert ( + "query_name,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank" + in c.last_result.out + ) + assert ( + "test1,realm,0.204,Riboviria,md5,test1.sig,0.131,1024000,0.950,0" + in c.last_result.out + ) + assert ( + "test1,realm,0.796,unclassified,md5,test1.sig,0.869,3990000,,0" + in c.last_result.out + ) + assert ( + "test1,kingdom,0.204,Riboviria;;Orthornavirae,md5,test1.sig,0.131,1024000,0.950,0" + in c.last_result.out + ) + assert ( + "test1,kingdom,0.796,unclassified,md5,test1.sig,0.869,3990000,,0" + in c.last_result.out + ) + assert ( + "test1,phylum,0.204,Riboviria;;Orthornavirae;;Negarnaviricota,md5,test1.sig,0.131,1024000,0.950,0" + in c.last_result.out + ) + assert ( + "test1,phylum,0.796,unclassified,md5,test1.sig,0.869,3990000,,0" + in c.last_result.out + ) + assert ( + "test1,subphylum,0.204,Riboviria;;Orthornavirae;;Negarnaviricota;Haploviricotina,md5,test1.sig,0.131,1024000,0.950,0" + in c.last_result.out + ) + assert ( + "test1,subphylum,0.796,unclassified,md5,test1.sig,0.869,3990000,,0" + in c.last_result.out + ) + assert ( + "test1,class,0.204,Riboviria;;Orthornavirae;;Negarnaviricota;Haploviricotina;Monjiviricetes,md5,test1.sig,0.131,1024000,0.950,0" + in c.last_result.out + ) + assert ( + "test1,class,0.796,unclassified,md5,test1.sig,0.869,3990000,,0" + in c.last_result.out + ) + assert ( + "test1,order,0.204,Riboviria;;Orthornavirae;;Negarnaviricota;Haploviricotina;Monjiviricetes;;Mononegavirales,md5,test1.sig,0.131,1024000,0.950,0" + in c.last_result.out + ) + assert ( + "test1,order,0.796,unclassified,md5,test1.sig,0.869,3990000,,0" + in c.last_result.out + ) + assert ( + "test1,family,0.204,Riboviria;;Orthornavirae;;Negarnaviricota;Haploviricotina;Monjiviricetes;;Mononegavirales;;Filoviridae,md5,test1.sig,0.131,1024000,0.950,0" + in c.last_result.out + ) + assert ( + "test1,family,0.796,unclassified,md5,test1.sig,0.869,3990000,,0" + in c.last_result.out + ) + assert ( + "test1,genus,0.204,Riboviria;;Orthornavirae;;Negarnaviricota;Haploviricotina;Monjiviricetes;;Mononegavirales;;Filoviridae;;Orthoebolavirus,md5,test1.sig,0.131,1024000,0.950,0" + in c.last_result.out + ) + assert ( + "test1,genus,0.796,unclassified,md5,test1.sig,0.869,3990000,,0" + in c.last_result.out + ) + assert ( + "test1,species,0.088,Riboviria;;Orthornavirae;;Negarnaviricota;Haploviricotina;Monjiviricetes;;Mononegavirales;;Filoviridae;;Orthoebolavirus;;Orthoebolavirus bundibugyoense,md5,test1.sig,0.058,442000,0.925,0" + in c.last_result.out + ) + assert ( + "test1,species,0.078,Riboviria;;Orthornavirae;;Negarnaviricota;Haploviricotina;Monjiviricetes;;Mononegavirales;;Filoviridae;;Orthoebolavirus;;Orthoebolavirus taiense,md5,test1.sig,0.050,390000,0.921,0" + in c.last_result.out + ) + assert ( + "test1,species,0.028,Riboviria;;Orthornavirae;;Negarnaviricota;Haploviricotina;Monjiviricetes;;Mononegavirales;;Filoviridae;;Orthoebolavirus;;Orthoebolavirus bombaliense,md5,test1.sig,0.016,138000,0.891,0" + in c.last_result.out + ) + assert ( + "test1,species,0.011,Riboviria;;Orthornavirae;;Negarnaviricota;Haploviricotina;Monjiviricetes;;Mononegavirales;;Filoviridae;;Orthoebolavirus;;Orthoebolavirus restonense,md5,test1.sig,0.007,54000,0.864,0" + in c.last_result.out + ) + assert ( + "test1,species,0.796,unclassified,md5,test1.sig,0.869,3990000,,0" + in c.last_result.out + ) + assert ( + "test1,name,0.088,Riboviria;;Orthornavirae;;Negarnaviricota;Haploviricotina;Monjiviricetes;;Mononegavirales;;Filoviridae;;Orthoebolavirus;;Orthoebolavirus bundibugyoense;Bundibugyo virus,md5,test1.sig,0.058,442000,0.925,0" + in c.last_result.out + ) + assert ( + "test1,name,0.078,Riboviria;;Orthornavirae;;Negarnaviricota;Haploviricotina;Monjiviricetes;;Mononegavirales;;Filoviridae;;Orthoebolavirus;;Orthoebolavirus taiense;Taï Forest virus,md5,test1.sig,0.050,390000,0.921,0" + in c.last_result.out + ) + assert ( + "test1,name,0.028,Riboviria;;Orthornavirae;;Negarnaviricota;Haploviricotina;Monjiviricetes;;Mononegavirales;;Filoviridae;;Orthoebolavirus;;Orthoebolavirus bombaliense;Bombali virus,md5,test1.sig,0.016,138000,0.891,0" + in c.last_result.out + ) + assert ( + "test1,name,0.011,Riboviria;;Orthornavirae;;Negarnaviricota;Haploviricotina;Monjiviricetes;;Mononegavirales;;Filoviridae;;Orthoebolavirus;;Orthoebolavirus restonense;Reston virus,md5,test1.sig,0.007,54000,0.864,0" + in c.last_result.out + ) + assert ( + "test1,name,0.796,unclassified,md5,test1.sig,0.869,3990000,,0" + in c.last_result.out + ) + + def test_genome_no_rank_krona(runtmp): g_csv = utils.get_test_data("tax/test1.gather.csv") tax = utils.get_test_data("tax/test.taxonomy.csv") @@ -2897,6 +3022,121 @@ def test_genome_gather_two_queries(runtmp): ) +def test_genome_gather_ictv(runtmp): + """ + test genome classification with ictv taxonomy + """ + c = runtmp + taxonomy_csv = utils.get_test_data("tax/test.ictv-taxonomy.csv") + g_res = utils.get_test_data("tax/47+63_x_gtdb-rs202.gather.csv") + + c.run_sourmash( + "tax", + "genome", + "-g", + g_res, + "--taxonomy-csv", + taxonomy_csv, + "--containment-threshold", + "0", + "--ictv", + ) + print(c.last_result.status) + print(c.last_result.out) + print(c.last_result.err) + + assert c.last_result.status == 0 + assert "query_name,status,rank,fraction,lineage" in c.last_result.out + assert ( + "47+63,match,name,0.664,Riboviria;;Orthornavirae;;Negarnaviricota;Haploviricotina;Monjiviricetes;;Mononegavirales;;Filoviridae;;Orthoebolavirus;;Orthoebolavirus sudanense;Sudan virus,491c0a81,,0.664,5238000,0.987" + in c.last_result.out + ) + + +def test_genome_gather_ictv_twoqueries(runtmp): + """ + test genome classification with ictv taxonomy + """ + c = runtmp + taxonomy_csv = utils.get_test_data("tax/test.ictv-taxonomy.csv") + g_res = utils.get_test_data("tax/47+63_x_gtdb-rs202.gather.csv") + + # split 47+63 into two fake queries: q47, q63 + g_res2 = runtmp.output("two-queries.gather.csv") + q2_results = [x + "\n" for x in Path(g_res).read_text().splitlines()] + # rename queries + q2_results[1] = q2_results[1].replace("47+63", "q47") + q2_results[2] = q2_results[2].replace("47+63", "q63") + with open(g_res2, "w") as fp: + for line in q2_results: + print(line) + fp.write(line) + + c.run_sourmash( + "tax", + "genome", + "-g", + g_res2, + "--taxonomy-csv", + taxonomy_csv, + "--containment-threshold", + "0", + "--ictv", + ) + print(c.last_result.status) + print(c.last_result.out) + print(c.last_result.err) + + assert c.last_result.status == 0 + print(c.last_result.out) + assert "query_name,status,rank,fraction,lineage" in c.last_result.out + assert ( + "q47,match,name,0.664,Riboviria;;Orthornavirae;;Negarnaviricota;Haploviricotina;Monjiviricetes;;Mononegavirales;;Filoviridae;;Orthoebolavirus;;Orthoebolavirus sudanense;Sudan virus,491c0a81,,0.664,5238000,0.987" + in c.last_result.out + ) + assert ( + "q63,match,name,0.336,Riboviria;;Orthornavirae;;Negarnaviricota;Haploviricotina;Monjiviricetes;;Mononegavirales;;Filoviridae;;Orthoebolavirus;;Orthoebolavirus zairense;Ebola virus,491c0a81,,0.336,2648000,0.965" + in c.last_result.out + ) + + +def test_genome_gather_ictv_fail(runtmp): + """ + test genome classification with ictv taxonomy + """ + c = runtmp + taxonomy_csv = utils.get_test_data("tax/test.ictv-taxonomy.csv") + tax2_csv = runtmp.output("ictv-taxfail") + # copy taxonomy csv to new file, but remove one of the columns + with open(taxonomy_csv) as inF: + with open(tax2_csv, "w") as outF: + for line in inF.readlines(): + line = line.rsplit(",", 1)[0] + outF.write(f"{line}\n") + + g_res = utils.get_test_data("tax/47+63_x_gtdb-rs202.gather.csv") + + with pytest.raises(SourmashCommandFailed) as exc: + c.run_sourmash( + "tax", + "genome", + "-g", + g_res, + "--taxonomy-csv", + tax2_csv, + "--containment-threshold", + "0", + "--ictv", + ) + print(c.last_result.status) + print(c.last_result.out) + print(c.last_result.err) + + assert c.last_result.status != 0 + print(c.last_result.out) + assert "Not all taxonomy ranks present" in str(exc.value) + + def test_genome_rank_duplicated_taxonomy_fail(runtmp): c = runtmp # write temp taxonomy with duplicates @@ -4159,6 +4399,57 @@ def test_annotate_gzipped_gather(runtmp): ) +def test_annotate_0_ictv(runtmp): + # test annotate basics + c = runtmp + + g_csv = utils.get_test_data("tax/test1.gather.csv") + tax = utils.get_test_data("tax/test.ictv-taxonomy.csv") + csvout = runtmp.output("test1.gather.with-lineages.csv") + out_dir = os.path.dirname(csvout) + + c.run_sourmash( + "tax", + "annotate", + "--gather-csv", + g_csv, + "--taxonomy-csv", + tax, + "-o", + out_dir, + "--ictv", + ) + + print(c.last_result.status) + print(c.last_result.out) + print(c.last_result.err) + + assert c.last_result.status == 0 + assert os.path.exists(csvout) + + gather_results = [x.rstrip() for x in Path(csvout).read_text().splitlines()] + print("\n".join(gather_results)) + assert f"saving 'annotate' output to '{csvout}'" in runtmp.last_result.err + + assert "lineage" in gather_results[0] + assert ( + "Riboviria;;Orthornavirae;;Negarnaviricota;Haploviricotina;Monjiviricetes;;Mononegavirales;;Filoviridae;;Orthoebolavirus;;Orthoebolavirus bundibugyoense;Bundibugyo virus" + in gather_results[1] + ) + assert ( + "Riboviria;;Orthornavirae;;Negarnaviricota;Haploviricotina;Monjiviricetes;;Mononegavirales;;Filoviridae;;Orthoebolavirus;;Orthoebolavirus taiense;Taï Forest virus" + in gather_results[2] + ) + assert ( + "Riboviria;;Orthornavirae;;Negarnaviricota;Haploviricotina;Monjiviricetes;;Mononegavirales;;Filoviridae;;Orthoebolavirus;;Orthoebolavirus bombaliense;Bombali virus" + in gather_results[3] + ) + assert ( + "Riboviria;;Orthornavirae;;Negarnaviricota;Haploviricotina;Monjiviricetes;;Mononegavirales;;Filoviridae;;Orthoebolavirus;;Orthoebolavirus restonense;Reston virus" + in gather_results[4] + ) + + def test_annotate_0_LIN(runtmp): # test annotate basics c = runtmp @@ -5466,6 +5757,44 @@ def test_tax_summarize_strain_csv_with_lineages(runtmp): assert c["1"] == 11 +def test_tax_summarize_ictv(runtmp): + # test basic operation w/csv output on lineages-style file w/strain csv + taxfile = utils.get_test_data("tax/test.ictv-taxonomy.csv") + lineage_csv = runtmp.output("ictv-lins.csv") + + taxdb = tax_utils.LineageDB.load(taxfile) + with open(lineage_csv, "w", newline="") as fp: + w = csv.writer(fp) + w.writerow(["name", "lineage"]) + for k, v in taxdb.items(): + linstr = lca_utils.display_lineage(v) + w.writerow([k, linstr]) + + runtmp.sourmash("tax", "summarize", lineage_csv, "-o", "ranks.csv", "--ictv") + + out = runtmp.last_result.out + err = runtmp.last_result.err + + assert "number of distinct taxonomic lineages: 7" in out + assert "saved 14 lineage counts to" in err + + csv_out = runtmp.output("ranks.csv") + + with sourmash_args.FileInputCSV(csv_out) as r: + # count number across ranks as a cheap consistency check + c = Counter() + for row in r: + print(row) + val = row["lineage_count"] + c[val] += 1 + + print(list(c.most_common())) + print(c) + assert c["1"] == 8 + assert c["7"] == 5 + assert c["6"] == 1 + + def test_tax_summarize_LINS(runtmp): # test basic operation w/LINs taxfile = utils.get_test_data("tax/test.LIN-taxonomy.csv") diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py index bd0060b65a..192406e251 100644 --- a/tests/test_tax_utils.py +++ b/tests/test_tax_utils.py @@ -28,6 +28,7 @@ BaseLineageInfo, RankLineageInfo, LINLineageInfo, + ICTVRankLineageInfo, aggregate_by_lineage_at_rank, format_for_krona, write_krona, @@ -38,6 +39,8 @@ LineageDB_Sqlite, MultiLineageDB, filter_row, + NCBI_RANKS, + ICTV_RANKS, ) @@ -2018,6 +2021,22 @@ def test_lca_RankLineageInfo_no_lca(): assert lca_from_lin1 == lca_from_lin2 is None +def test_ICTVLineageInfo_ranks_input_ignored(): + # check that ranks cannot be changed + taxinfo = ICTVRankLineageInfo(ranks=["one", "two"]) + assert taxinfo.taxlist == ICTV_RANKS + + +def test_ICTVLineageInfo_lineagedict_input(): + # check that ranks cannot be changed + dummy_names = [f"name{i}" for i in range(1, len(ICTV_RANKS) + 1)] + input_lindict = dict(zip(ICTV_RANKS, dummy_names)) + taxinfo = ICTVRankLineageInfo(lineage_dict=input_lindict) + print(taxinfo.display_lineage()) + assert taxinfo.display_lineage() == ";".join(dummy_names) + assert taxinfo.taxlist == ICTV_RANKS + + def test_incompatibility_LINLineageInfo_RankLineageInfo(): x = "a;b;c" lin1 = RankLineageInfo(lineage_str=x) diff --git a/tox.ini b/tox.ini index 1806e48778..4057100554 100644 --- a/tox.ini +++ b/tox.ini @@ -56,6 +56,7 @@ wheel_build_env = .pkg pass_env = LIBCLANG_PATH BINDGEN_EXTRA_CLANG_ARGS + NIX_* [testenv:pypy3] deps =