filter: Read a subset of metadata columns
Before these changes, all metadata columns were read into memory even if they
were not used for actual filtering. The only reason was that metadata-based
outputs were created from the in-memory representation of the metadata. An
earlier commit switched to creating those outputs by doing another pass over
the original metadata file, so there is no longer any need to keep all columns
in memory.
victorlin committed Jan 22, 2024
1 parent 0a671bb commit d8b5765
Showing 7 changed files with 136 additions and 22 deletions.
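In the diffs below, the mechanics of the change are: scan the metadata header with Metadata() to learn the delimiter, ID column, and available column names; compute the columns that filtering actually needs with get_useful_metadata_columns(); and pass that set to read_metadata() through its columns argument so the chunked reader only ever loads those columns. As a rough sketch of the underlying idea (not the augur implementation — the file name and column names here are invented), pandas can do the same thing by combining usecols with chunked reading:

    import pandas as pd

    # Hypothetical column subset: the ID column plus columns used by the active filters.
    useful_columns = ["strain", "date", "region"]

    # Read the metadata in chunks, loading only the useful columns into memory;
    # all other columns are skipped at parse time rather than dropped afterwards.
    reader = pd.read_csv(
        "metadata.tsv",              # invented path for illustration
        sep="\t",
        usecols=useful_columns,
        dtype="string",
        chunksize=100_000,
    )

    for chunk in reader:
        print(chunk.columns.tolist())  # only the requested columns are present
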
2 changes: 1 addition & 1 deletion augur/filter/__init__.py
@@ -31,7 +31,7 @@ def register_arguments(parser):
)
metadata_filter_group.add_argument('--query-columns', type=column_type_pair, nargs="+",
help=f"""Use alongside --query to specify columns and data types in the format 'column:type', where type is one of ({','.join(ACCEPTED_TYPES)}).
Example: region:str coverage:float.
All columns used in the query must be specified. Example: region:str coverage:float.
If unspecified, automatic type inference will be attempted on all columns used in the query.""")
metadata_filter_group.add_argument('--min-date', type=numeric_date_type, help=f"minimal cutoff for date, the cutoff date is inclusive; may be specified as: {SUPPORTED_DATE_HELP_TEXT}")
metadata_filter_group.add_argument('--max-date', type=numeric_date_type, help=f"maximal cutoff for date, the cutoff date is inclusive; may be specified as: {SUPPORTED_DATE_HELP_TEXT}")
23 changes: 14 additions & 9 deletions augur/filter/_run.py
@@ -15,13 +15,13 @@
DELIMITER as SEQUENCE_INDEX_DELIMITER,
)
from augur.io.file import open_file
from augur.io.metadata import InvalidDelimiter, read_metadata
from augur.io.metadata import InvalidDelimiter, Metadata, read_metadata
from augur.io.sequences import read_sequences, write_sequences
from augur.io.print import print_err
from augur.io.vcf import is_vcf as filename_is_vcf, write_vcf
from augur.types import EmptyOutputReportingMethod
from . import include_exclude_rules
from .io import cleanup_outputs, read_priority_scores, write_metadata_based_outputs
from .io import cleanup_outputs, get_useful_metadata_columns, read_priority_scores, write_metadata_based_outputs
from .include_exclude_rules import apply_filters, construct_filters
from .subsample import PriorityQueue, TooManyGroupsError, calculate_sequences_per_group, create_queues_by_group, get_groups_for_subsampling

@@ -154,19 +154,23 @@ def run(args):
filter_counts = defaultdict(int)

try:
metadata_reader = read_metadata(
args.metadata,
delimiters=args.metadata_delimiters,
id_columns=args.metadata_id_columns,
chunk_size=args.metadata_chunk_size,
dtype="string",
)
metadata_object = Metadata(args.metadata, args.metadata_delimiters, args.metadata_id_columns)
except InvalidDelimiter:
raise AugurError(
f"Could not determine the delimiter of {args.metadata!r}. "
f"Valid delimiters are: {args.metadata_delimiters!r}. "
"This can be changed with --metadata-delimiters."
)
useful_metadata_columns = get_useful_metadata_columns(args, metadata_object.id_column, metadata_object.columns)

metadata_reader = read_metadata(
args.metadata,
delimiters=[metadata_object.dialect.delimiter],
columns=useful_metadata_columns,
id_columns=[metadata_object.id_column],
chunk_size=args.metadata_chunk_size,
dtype="string",
)
for metadata in metadata_reader:
duplicate_strains = (
set(metadata.index[metadata.index.duplicated()]) |
@@ -285,6 +289,7 @@ def run(args):
metadata_reader = read_metadata(
args.metadata,
delimiters=args.metadata_delimiters,
columns=useful_metadata_columns,
id_columns=args.metadata_id_columns,
chunk_size=args.metadata_chunk_size,
dtype="string",
14 changes: 8 additions & 6 deletions augur/filter/include_exclude_rules.py
@@ -79,7 +79,7 @@ def filter_by_exclude(metadata, exclude_file) -> FilterFunctionReturn:
return set(metadata.index.values) - excluded_strains


def _parse_filter_query(query):
def parse_filter_query(query):
"""Parse an augur filter-style query and return the corresponding column,
operator, and value for the query.
@@ -99,9 +99,9 @@ def _parse_filter_query(query):
Examples
--------
>>> _parse_filter_query("property=value")
>>> parse_filter_query("property=value")
('property', <built-in function eq>, 'value')
>>> _parse_filter_query("property!=value")
>>> parse_filter_query("property!=value")
('property', <built-in function ne>, 'value')
"""
@@ -144,7 +144,7 @@ def filter_by_exclude_where(metadata, exclude_where) -> FilterFunctionReturn:
['strain1', 'strain2']
"""
column, op, value = _parse_filter_query(exclude_where)
column, op, value = parse_filter_query(exclude_where)
if column in metadata.columns:
# Apply a test operator (equality or inequality) to values from the
# column in the given query. This produces an array of boolean values we
@@ -500,7 +500,7 @@ def force_include_where(metadata, include_where) -> FilterFunctionReturn:
set()
"""
column, op, value = _parse_filter_query(include_where)
column, op, value = parse_filter_query(include_where)

if column in metadata.columns:
# Apply a test operator (equality or inequality) to values from the
@@ -758,7 +758,9 @@ def apply_filters(metadata, exclude_by: List[FilterOption], include_by: List[Fil
except Exception as e:
if filter_function is filter_by_query:
if isinstance(e, PandasUndefinedVariableError):
raise AugurError(f"Query contains a column that does not exist in metadata.") from e
# This happens under two conditions which are indistinguishable,
# so give both possible reasons in the error message.
raise AugurError(f"Query contains an invalid variable. Either (1) a queried column is missing from the metadata, or (2) you are using --query-columns but did not specify all columns.") from e
raise AugurError(f"Internal Pandas error when applying query:\n\t{e}\nEnsure the syntax is valid per <https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#indexing-query>.") from e
else:
raise
64 changes: 62 additions & 2 deletions augur/filter/io.py
@@ -1,14 +1,74 @@
import argparse
import csv
from argparse import Namespace
import os
import re
from typing import Sequence, Set
from typing import Iterable, Sequence, Set
import numpy as np
from collections import defaultdict
from xopen import xopen

from augur.errors import AugurError
from augur.io.metadata import Metadata
from augur.io.metadata import Metadata, METADATA_DATE_COLUMN
from augur.filter.constants import GROUP_BY_GENERATED_COLUMNS
from augur.filter.include_exclude_rules import extract_variables, parse_filter_query
from augur.io.print import print_err


def get_useful_metadata_columns(args: Namespace, id_column: str, all_columns: Iterable[str]):
"""Return a list of column names that are used in augur filter.
This allows reading only the necessary columns.
"""

# Start with just the ID column.
columns = {id_column}

# Add the date column if it is used.
if (args.exclude_ambiguous_dates_by
or args.min_date
or args.max_date
or (args.group_by and GROUP_BY_GENERATED_COLUMNS.intersection(args.group_by))):
columns.add(METADATA_DATE_COLUMN)

if args.group_by:
group_by_set = set(args.group_by)

# Add columns used for grouping.
columns.update(group_by_set - GROUP_BY_GENERATED_COLUMNS)

# Warn if any existing metadata columns will be ignored.
generated_columns_requested = GROUP_BY_GENERATED_COLUMNS & group_by_set
for col in sorted(generated_columns_requested):
if col in all_columns:
print_err(f"WARNING: `--group-by {col}` uses a generated {col} value from the {METADATA_DATE_COLUMN!r} column. The custom '{col}' column in the metadata is ignored for grouping purposes.")

# Add columns used in exclude queries.
if args.exclude_where:
for query in args.exclude_where:
column, op, value = parse_filter_query(query)
columns.add(column)

# Add columns used in include queries.
if args.include_where:
for query in args.include_where:
column, op, value = parse_filter_query(query)
columns.add(column)

# Add columns used in Pandas queries.
if args.query:
if args.query_columns:
# Use column names explicitly specified by the user.
for column, dtype in args.query_columns:
columns.add(column)
else:
# Attempt to automatically extract columns from the query.
variables = extract_variables(args.query)
if variables is None:
raise AugurError("Could not infer columns from the pandas query. If the query is valid, please specify columns using --query-columns.")
else:
columns.update(variables)

return list(columns)


def read_priority_scores(fname):
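get_useful_metadata_columns() leans on extract_variables() (imported above but not shown in this diff) to pull column names out of a --query string, and raises an error when that extraction fails. Purely as a hypothetical illustration of how such extraction can work (extract_query_names below is an invented helper, not augur's extract_variables), parsing the query as a Python expression yields its bare names, while unparseable input yields None — which mirrors how the "Could not infer columns" error can arise for the "some bad syntax" query in the tests below:

    import ast

    def extract_query_names(query: str):
        """Hypothetical helper: return the bare names referenced in a pandas-style
        query string, or None if it cannot be parsed as a Python expression."""
        try:
            tree = ast.parse(query, mode="eval")
        except SyntaxError:
            return None
        return {node.id for node in ast.walk(tree) if isinstance(node, ast.Name)}

    print(extract_query_names("coverage >= 0.95 & category == 'B'"))  # {'category', 'coverage'} (set order may vary)
    print(extract_query_names("some bad syntax"))                     # None
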
7 changes: 3 additions & 4 deletions tests/functional/filter/cram/filter-query-errors.t
@@ -8,7 +8,8 @@ Using a pandas query with a nonexistent column results in a specific error.
> --metadata "$TESTDIR/../data/metadata.tsv" \
> --query "invalid == 'value'" \
> --output-strains filtered_strains.txt > /dev/null
ERROR: Query contains a column that does not exist in metadata.
WARNING: Column 'invalid' does not exist in the metadata file. Ignoring it.
ERROR: Query contains an invalid variable. Either (1) a queried column is missing from the metadata, or (2) you are using --query-columns but did not specify all columns.
[2]


@@ -40,7 +41,5 @@ However, other Pandas errors are not so helpful, so a link is provided for users.
> --metadata "$TESTDIR/../data/metadata.tsv" \
> --query "some bad syntax" \
> --output-strains filtered_strains.txt > /dev/null
ERROR: Internal Pandas error when applying query:
invalid syntax (<unknown>, line 1)
Ensure the syntax is valid per <https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#indexing-query>.
ERROR: Could not infer columns from the pandas query. If the query is valid, please specify columns using --query-columns.
[2]
46 changes: 46 additions & 0 deletions tests/functional/filter/cram/filter-query-incomplete-columns.t
@@ -0,0 +1,46 @@
Setup

$ source "$TESTDIR"/_setup.sh

Create metadata file for testing.

$ cat >metadata.tsv <<~~
> strain coverage category
> SEQ_1 0.94 A
> SEQ_2 0.95 B
> SEQ_3 0.96 C
> SEQ_4
> ~~

--query-columns must be all-or-nothing.

Automatic inference works.

$ ${AUGUR} filter \
> --metadata metadata.tsv \
> --query "coverage >= 0.95 & category == 'B'" \
> --output-strains filtered_strains.txt
3 strains were dropped during filtering
3 of these were filtered out by the query: "coverage >= 0.95 & category == 'B'"
1 strains passed all filters

An incomplete --query-columns does not work.

$ ${AUGUR} filter \
> --metadata metadata.tsv \
> --query "coverage >= 0.95 & category == 'B'" \
> --query-columns coverage:float \
> --output-strains filtered_strains.txt
ERROR: Query contains an invalid variable. Either (1) a queried column is missing from the metadata, or (2) you are using --query-columns but did not specify all columns.
[2]

It works again when all columns are specified.

$ ${AUGUR} filter \
> --metadata metadata.tsv \
> --query "coverage >= 0.95 & category == 'B'" \
> --query-columns coverage:float category:str \
> --output-strains filtered_strains.txt
3 strains were dropped during filtering
3 of these were filtered out by the query: "coverage >= 0.95 & category == 'B'"
1 strains passed all filters
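The failure above follows directly from the column subsetting: when --query-columns is given, the query columns come from that list rather than from inference, so a query that references an unlisted (and therefore unread) column hits an undefined variable inside pandas, which augur filter then re-raises as the error shown. A minimal sketch of that pandas behaviour (assuming a pandas version that exposes UndefinedVariableError under pandas.errors; the DataFrame is invented):

    import pandas as pd

    # Simulate metadata loaded with an incomplete column subset:
    # 'category' is used in the query but was never read into the DataFrame.
    metadata = pd.DataFrame({"strain": ["SEQ_1", "SEQ_2"], "coverage": [0.94, 0.95]})

    try:
        metadata.query("coverage >= 0.95 and category == 'B'")
    except pd.errors.UndefinedVariableError as err:
        # augur filter catches this and re-raises it as the AugurError shown above.
        print(f"undefined variable in query: {err}")
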
2 changes: 2 additions & 0 deletions
@@ -15,6 +15,7 @@ Error on missing group-by columns.
> --group-by year \
> --sequences-per-group 1 \
> --output-metadata metadata-filtered.tsv > /dev/null
WARNING: Column 'date' does not exist in the metadata file. Ignoring it.
ERROR: The specified group-by categories (['year']) were not found. Note that using any of ['month', 'week', 'year'] requires a column called 'date'.
[2]
$ cat metadata-filtered.tsv
@@ -26,6 +27,7 @@ Error on missing group-by columns.
> --group-by invalid \
> --sequences-per-group 1 \
> --output-metadata metadata-filtered.tsv > /dev/null
WARNING: Column 'invalid' does not exist in the metadata file. Ignoring it.
ERROR: The specified group-by categories (['invalid']) were not found.
[2]
$ cat metadata-filtered.tsv