Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update filter & lineage/refilter funcs to support updates in TaxonKit 0.7.2 #8

Merged
merged 5 commits into from
Jan 27, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion .github/workflows/cibuild.yml
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
name: CI Build

on: [push]
on:
push:
schedule:
- cron: '0 4 1 * *'

jobs:
build:
Expand Down
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,10 @@ This project adheres to [Semantic Versioning](http://semver.org/).
## [Unreleased]

### Added
- New `filter()` function supporting new `taxonkit filter` command (#4)
- New `filter()` function supporting new `taxonkit filter` command (#4, #8)
- New `lca()` function supporting new `taxonkit lca` command (#5)
- Support for new `prefix` related flags (#6)
- Use new `--show-lineage-ranks` to create new `FullLineageRanks` column in `lineage()` output (#8)


## [0.6.1] 2020-10-30
Expand Down
79 changes: 67 additions & 12 deletions pytaxonkit.py
Original file line number Diff line number Diff line change
Expand Up @@ -268,7 +268,8 @@ def test_list_str():
# -------------------------------------------------------------------------------------------------


def lineage(ids, formatstr=None, threads=None, data_dir=None, prefix=False, debug=False, **kwargs):
def lineage(ids, formatstr=None, threads=None, data_dir=None, prefix=False, pseudo_strain=False,
fill_missing=False, debug=False, **kwargs):
'''query lineage of given taxids

Executes `taxonkit lineage` and `taxonkit reformat` to provide both a full and a "standard"
Expand All @@ -289,9 +290,14 @@ def lineage(ids, formatstr=None, threads=None, data_dir=None, prefix=False, debu
prefix : bool, default False
add prefixes to each name indicating rank
prefix_* : str, default None
when `prefix=True`, override default prefix for the specified rank; replace `*` with one of
When `prefix=True`, override default prefix for the specified rank; replace `*` with one of
k, p, c, o, f, g, s, S corresponding to (super)kingdom, phylum, class, order, family,
genus, species, subspecies; for example, `prefix_g='GENUS:', prefix_s='SPECIES:'`
pseudo_strain : bool, default False
Use the node with lowest rank as the strain name, only if that rank is lower than "species"
and not "subspecies" nor "strain"
fill_missing : bool, default False
Fill missing rank with lineage information of the next higher rank
debug : bool, default False
Print debugging output, e.g., system calls to `taxonkit`

Expand All @@ -310,7 +316,7 @@ def lineage(ids, formatstr=None, threads=None, data_dir=None, prefix=False, debu
>>> result = pytaxonkit.lineage([1325911, 1649473, 1401311])
>>> result.columns
Index(['TaxID', 'Code', 'Name', 'Lineage', 'LineageTaxIDs', 'Rank',
'FullLineage', 'FullLineageTaxIDs'],
'FullLineage', 'FullLineageTaxIDs', 'FullLineageRanks'],
dtype='object')
>>> result[['TaxID', 'Lineage', 'LineageTaxIDs']]
TaxID Lineage LineageTaxIDs
Expand All @@ -327,7 +333,7 @@ def lineage(ids, formatstr=None, threads=None, data_dir=None, prefix=False, debu
idlist = '\n'.join(map(str, ids))
arglist = [
'taxonkit', 'lineage', '--show-lineage-taxids', '--show-rank', '--show-status-code',
'--show-name'
'--show-name', '--show-lineage-ranks'
]
if threads:
arglist.extend(('--threads', validate_threads(threads)))
Expand All @@ -351,12 +357,16 @@ def lineage(ids, formatstr=None, threads=None, data_dir=None, prefix=False, debu
if key.count('_') != 1:
raise TypeError(f'unexpected keyword argument "{key}"')
prefix, subkey = key.split('_')
if prefix != 'prefix' or subkey not in 'kpcofgsS':
if prefix != 'prefix' or subkey not in 'kpcofgsStT':
raise TypeError(f'unexpected keyword argument "{key}"')
flag = f'--prefix-{subkey}'
extraargs.extend((flag, value))
if threads:
extraargs.extend(('--threads', validate_threads(threads)))
if pseudo_strain:
extraargs.append('--pseudo-strain')
if fill_missing:
extraargs.append('--fill-miss-rank')
if data_dir:
extraargs.extend(('--data-dir', validate_data_dir(data_dir))) # pragma: no cover
arglist = [
Expand All @@ -370,12 +380,12 @@ def lineage(ids, formatstr=None, threads=None, data_dir=None, prefix=False, debu
if proc.returncode != 0:
raise TaxonKitCLIError(err) # pragma: no cover
columnorderin = [
'TaxID', 'Code', 'FullLineage', 'FullLineageTaxIDs', 'Name', 'Rank', 'Lineage',
'LineageTaxIDs'
'TaxID', 'Code', 'FullLineage', 'FullLineageTaxIDs', 'Name', 'Rank',
'FullLineageRanks', 'Lineage', 'LineageTaxIDs',
]
columnorderout = [
'TaxID', 'Code', 'Name', 'Lineage', 'LineageTaxIDs', 'Rank', 'FullLineage',
'FullLineageTaxIDs'
'FullLineageTaxIDs', 'FullLineageRanks'
]
data = pandas.read_csv(
StringIO(out), sep='\t', header=None, names=columnorderin, index_col=False
Expand Down Expand Up @@ -460,6 +470,7 @@ def test_lineage_threads():
'cellular organisms;Bacteria;FCB group;Bacteroidetes/Chlorobi group;Bacteroidetes;'
'Bacteroidia'
)
assert result.FullLineageRanks.iloc[0] == 'no rank;superkingdom;clade;clade;phylum;class'


def test_lineage_name():
Expand Down Expand Up @@ -503,6 +514,19 @@ def test_lineage_bad_prefix():
lineage([325064], breakfast_sausage='YUM')


def test_lineage_pseudo_strain():
result = lineage(
[36827], formatstr='{k};{p};{c};{o};{f};{g};{s};{t}', fill_missing=True,
pseudo_strain=True, prefix=True,
)
obs_out = result.Lineage.iloc[0]
exp_out = (
'k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Clostridiaceae;'
'g__Clostridium;s__Clostridium botulinum;t__Clostridium botulinum B'
)
assert exp_out == obs_out


def test_name_debug(capsys):
result = name([207661, 1353792, 1597281], debug=True)
assert result.Name.equals(pandas.Series([
Expand Down Expand Up @@ -602,8 +626,8 @@ def test_name2taxid_threads():
# -------------------------------------------------------------------------------------------------

def filter(ids, threads=None, equal_to=None, higher_than=None, lower_than=None,
discard_norank=False, discard_root=False, root_taxid=None, blacklist=None,
rank_file=None, debug=False):
discard_norank=False, save_predictable=False, discard_root=False, root_taxid=None,
blacklist=None, rank_file=None, debug=False):
'''filter taxids by taxonomic rank (or a range of ranks)

Executes the `taxonkit filter` command to include or exclude taxa at the specified ranks.
Expand All @@ -614,14 +638,17 @@ def filter(ids, threads=None, equal_to=None, higher_than=None, lower_than=None,
A list of taxids (ints or strings are ok)
threads : int
Override the default taxonkit threads setting
equal_to : str, default None
Keep only taxa at the specified rank
equal_to : str or list, default None
Keep only taxa at the specified rank(s); can be a string or a list of strings
higher_than : str, default None
Keep only taxa ranked higher than the specified rank
lower_than : str, default None
Keep only taxa ranked lower than the specified rank
discard_norank : bool, default False
Discard generic ranks without an explicit ranking order ("no rank" and "clade")
save_predictable : bool, default False
When `discard_norank=True`, do not discard some special ranks without order where the rank
of the closest higher node is still lower than rank cutoff
discard_root : bool, default False
Discard root taxon
root_taxid : int or str
Expand Down Expand Up @@ -653,13 +680,17 @@ def filter(ids, threads=None, equal_to=None, higher_than=None, lower_than=None,
if threads:
arglist.extend(('--threads', validate_threads(threads)))
if equal_to:
if isinstance(equal_to, (pylist, tuple)):
equal_to = ','.join(equal_to)
arglist.extend(['--equal-to', equal_to])
if higher_than:
arglist.extend(['--higher-than', higher_than])
if lower_than:
arglist.extend(['--lower-than', lower_than])
if discard_norank:
arglist.append('--discard-noranks')
if save_predictable:
arglist.append('--save-predictable-norank')
if discard_root: # pragma: no cover
arglist.append('--discard-root')
if blacklist:
Expand Down Expand Up @@ -763,12 +794,36 @@ def test_filter_lower_than(capsys):
assert 'taxonkit filter --lower-than family' in terminal.err


def test_filter_equal_to_multi(capsys):
taxids = [
131567, 2759, 33154, 33208, 6072, 33213, 33317, 1206794, 88770, 6656, 197563, 197562, 6960,
50557, 85512, 7496, 33340, 33392, 7041, 41071, 535382, 41073, 706613, 586004, 87479, 412111
]
obs_result = filter(taxids, threads=1, equal_to=['phylum', 'genus'], debug=True)
exp_result = [6656, 87479]
assert obs_result == exp_result
terminal = capsys.readouterr()
assert '--equal-to phylum,genus' in terminal.err


def test_filter_higher_lower_conflict():
message = r'cannot specify "higher_than" and "lower_than" simultaneously'
with pytest.raises(ValueError, match=message):
filter([42], discard_norank=True, higher_than='genus', lower_than='genus')


def test_filter_save_predictable():
taxids = [
131567, 2, 1224, 1236, 91347, 543, 561, 562, 2605619, 10239, 2731341, 2731360, 2731618,
2731619, 28883, 10699, 196894, 1327037
]
obs_result = filter(
taxids, threads=1, lower_than='species', equal_to='species', save_predictable=True
)
exp_result = [562, 2605619, 1327037]
assert obs_result == exp_result


def test_list_ranks(capsys):
ranks = list_ranks(debug=True)
assert len(ranks) == 68
Expand Down