diff --git a/augur/filter/__init__.py b/augur/filter/__init__.py index e1ae4ec2e..b391674ae 100644 --- a/augur/filter/__init__.py +++ b/augur/filter/__init__.py @@ -31,7 +31,7 @@ def register_arguments(parser): ) metadata_filter_group.add_argument('--query-columns', type=column_type_pair, nargs="+", help=f"""Use alongside --query to specify columns and data types in the format 'column:type', where type is one of ({','.join(ACCEPTED_TYPES)}). - Example: region:str coverage:float. + All columns used in the query must be specified. Example: region:str coverage:float. If unspecified, automatic type inference will be attempted on all columns used in the query.""") metadata_filter_group.add_argument('--min-date', type=numeric_date_type, help=f"minimal cutoff for date, the cutoff date is inclusive; may be specified as: {SUPPORTED_DATE_HELP_TEXT}") metadata_filter_group.add_argument('--max-date', type=numeric_date_type, help=f"maximal cutoff for date, the cutoff date is inclusive; may be specified as: {SUPPORTED_DATE_HELP_TEXT}") diff --git a/augur/filter/_run.py b/augur/filter/_run.py index 91dc68b40..d06c5b32f 100644 --- a/augur/filter/_run.py +++ b/augur/filter/_run.py @@ -15,13 +15,13 @@ DELIMITER as SEQUENCE_INDEX_DELIMITER, ) from augur.io.file import open_file -from augur.io.metadata import InvalidDelimiter, read_metadata +from augur.io.metadata import InvalidDelimiter, Metadata, read_metadata from augur.io.sequences import read_sequences, write_sequences from augur.io.print import print_err from augur.io.vcf import is_vcf as filename_is_vcf, write_vcf from augur.types import EmptyOutputReportingMethod from . 
import include_exclude_rules -from .io import cleanup_outputs, read_priority_scores, write_metadata_based_outputs +from .io import cleanup_outputs, get_useful_metadata_columns, read_priority_scores, write_metadata_based_outputs from .include_exclude_rules import apply_filters, construct_filters from .subsample import PriorityQueue, TooManyGroupsError, calculate_sequences_per_group, create_queues_by_group, get_groups_for_subsampling @@ -154,19 +154,23 @@ def run(args): filter_counts = defaultdict(int) try: - metadata_reader = read_metadata( - args.metadata, - delimiters=args.metadata_delimiters, - id_columns=args.metadata_id_columns, - chunk_size=args.metadata_chunk_size, - dtype="string", - ) + metadata_object = Metadata(args.metadata, args.metadata_delimiters, args.metadata_id_columns) except InvalidDelimiter: raise AugurError( f"Could not determine the delimiter of {args.metadata!r}. " f"Valid delimiters are: {args.metadata_delimiters!r}. " "This can be changed with --metadata-delimiters." 
) + useful_metadata_columns = get_useful_metadata_columns(args, metadata_object.id_column, metadata_object.columns) + + metadata_reader = read_metadata( + args.metadata, + delimiters=[metadata_object.dialect.delimiter], + columns=useful_metadata_columns, + id_columns=[metadata_object.id_column], + chunk_size=args.metadata_chunk_size, + dtype="string", + ) for metadata in metadata_reader: duplicate_strains = ( set(metadata.index[metadata.index.duplicated()]) | @@ -285,6 +289,7 @@ def run(args): metadata_reader = read_metadata( args.metadata, delimiters=args.metadata_delimiters, + columns=useful_metadata_columns, id_columns=args.metadata_id_columns, chunk_size=args.metadata_chunk_size, dtype="string", diff --git a/augur/filter/include_exclude_rules.py b/augur/filter/include_exclude_rules.py index 3e6e7fda9..128171ceb 100644 --- a/augur/filter/include_exclude_rules.py +++ b/augur/filter/include_exclude_rules.py @@ -79,7 +79,7 @@ def filter_by_exclude(metadata, exclude_file) -> FilterFunctionReturn: return set(metadata.index.values) - excluded_strains -def _parse_filter_query(query): +def parse_filter_query(query): """Parse an augur filter-style query and return the corresponding column, operator, and value for the query. @@ -99,9 +99,9 @@ def _parse_filter_query(query): Examples -------- - >>> _parse_filter_query("property=value") + >>> parse_filter_query("property=value") ('property', <built-in function eq>, 'value') - >>> _parse_filter_query("property!=value") + >>> parse_filter_query("property!=value") ('property', <built-in function ne>, 'value') """ @@ -144,7 +144,7 @@ def filter_by_exclude_where(metadata, exclude_where) -> FilterFunctionReturn: ['strain1', 'strain2'] """ - column, op, value = _parse_filter_query(exclude_where) + column, op, value = parse_filter_query(exclude_where) if column in metadata.columns: # Apply a test operator (equality or inequality) to values from the # column in the given query. 
This produces an array of boolean values we @@ -500,7 +500,7 @@ def force_include_where(metadata, include_where) -> FilterFunctionReturn: set() """ - column, op, value = _parse_filter_query(include_where) + column, op, value = parse_filter_query(include_where) if column in metadata.columns: # Apply a test operator (equality or inequality) to values from the @@ -758,7 +758,9 @@ def apply_filters(metadata, exclude_by: List[FilterOption], include_by: List[Fil except Exception as e: if filter_function is filter_by_query: if isinstance(e, PandasUndefinedVariableError): - raise AugurError(f"Query contains a column that does not exist in metadata.") from e + # This happens under two conditions which are indistinguishable, + # so give both possible reasons in the error message. + raise AugurError(f"Query contains an invalid variable. Either (1) a queried column is missing from the metadata, or (2) you are using --query-columns but did not specify all columns.") from e raise AugurError(f"Internal Pandas error when applying query:\n\t{e}\nEnsure the syntax is valid per <https://pandas.pydata.org/docs/user_guide/indexing.html#indexing-query>.") from e else: raise diff --git a/augur/filter/io.py b/augur/filter/io.py index ec26069c9..2fd2ec453 100644 --- a/augur/filter/io.py +++ b/augur/filter/io.py @@ -1,14 +1,74 @@ import argparse import csv +from argparse import Namespace import os import re -from typing import Sequence, Set +from typing import Iterable, Sequence, Set import numpy as np from collections import defaultdict from xopen import xopen from augur.errors import AugurError -from augur.io.metadata import Metadata +from augur.io.metadata import Metadata, METADATA_DATE_COLUMN +from augur.filter.constants import GROUP_BY_GENERATED_COLUMNS +from augur.filter.include_exclude_rules import extract_variables, parse_filter_query +from augur.io.print import print_err + + +def get_useful_metadata_columns(args: Namespace, id_column: str, all_columns: Iterable[str]): + """Return a list of column names that are used in augur filter. 
+ This allows reading only the necessary columns. + """ + + # Start with just the ID column. + columns = {id_column} + + # Add the date column if it is used. + if (args.exclude_ambiguous_dates_by + or args.min_date + or args.max_date + or (args.group_by and GROUP_BY_GENERATED_COLUMNS.intersection(args.group_by))): + columns.add(METADATA_DATE_COLUMN) + + if args.group_by: + group_by_set = set(args.group_by) + + # Add columns used for grouping. + columns.update(group_by_set - GROUP_BY_GENERATED_COLUMNS) + + # Warn if any existing metadata columns will be ignored. + generated_columns_requested = GROUP_BY_GENERATED_COLUMNS & group_by_set + for col in sorted(generated_columns_requested): + if col in all_columns: + print_err(f"WARNING: `--group-by {col}` uses a generated {col} value from the {METADATA_DATE_COLUMN!r} column. The custom '{col}' column in the metadata is ignored for grouping purposes.") + + # Add columns used in exclude queries. + if args.exclude_where: + for query in args.exclude_where: + column, op, value = parse_filter_query(query) + columns.add(column) + + # Add columns used in include queries. + if args.include_where: + for query in args.include_where: + column, op, value = parse_filter_query(query) + columns.add(column) + + # Add columns used in Pandas queries. + if args.query: + if args.query_columns: + # Use column names explicitly specified by the user. + for column, dtype in args.query_columns: + columns.add(column) + else: + # Attempt to automatically extract columns from the query. + variables = extract_variables(args.query) + if variables is None: + raise AugurError("Could not infer columns from the pandas query. 
If the query is valid, please specify columns using --query-columns.") + else: + columns.update(variables) + + return list(columns) def read_priority_scores(fname): diff --git a/tests/functional/filter/cram/filter-query-errors.t b/tests/functional/filter/cram/filter-query-errors.t index 5ccd1dfad..4f131b5f7 100644 --- a/tests/functional/filter/cram/filter-query-errors.t +++ b/tests/functional/filter/cram/filter-query-errors.t @@ -8,7 +8,8 @@ Using a pandas query with a nonexistent column results in a specific error. > --metadata "$TESTDIR/../data/metadata.tsv" \ > --query "invalid == 'value'" \ > --output-strains filtered_strains.txt > /dev/null - ERROR: Query contains a column that does not exist in metadata. + WARNING: Column 'invalid' does not exist in the metadata file. Ignoring it. + ERROR: Query contains an invalid variable. Either (1) a queried column is missing from the metadata, or (2) you are using --query-columns but did not specify all columns. [2] @@ -40,7 +41,5 @@ However, other Pandas errors are not so helpful, so a link is provided for users > --metadata "$TESTDIR/../data/metadata.tsv" \ > --query "some bad syntax" \ > --output-strains filtered_strains.txt > /dev/null - ERROR: Internal Pandas error when applying query: - invalid syntax (<unknown>, line 1) - Ensure the syntax is valid per <https://pandas.pydata.org/docs/user_guide/indexing.html#indexing-query>. + ERROR: Could not infer columns from the pandas query. If the query is valid, please specify columns using --query-columns. [2] diff --git a/tests/functional/filter/cram/filter-query-incomplete-columns.t b/tests/functional/filter/cram/filter-query-incomplete-columns.t new file mode 100644 index 000000000..f1ed81fae --- /dev/null +++ b/tests/functional/filter/cram/filter-query-incomplete-columns.t @@ -0,0 +1,46 @@ +Setup + + $ source "$TESTDIR"/_setup.sh + +Create metadata file for testing. + + $ cat >metadata.tsv <<~~ + > strain coverage category + > SEQ_1 0.94 A + > SEQ_2 0.95 B + > SEQ_3 0.96 C + > SEQ_4 + > ~~ + +--query-columns must be all-or-nothing. 
+ +Automatic inference works. + + $ ${AUGUR} filter \ + > --metadata metadata.tsv \ + > --query "coverage >= 0.95 & category == 'B'" \ + > --output-strains filtered_strains.txt + 3 strains were dropped during filtering + 3 of these were filtered out by the query: "coverage >= 0.95 & category == 'B'" + 1 strains passed all filters + +An incomplete --query-columns does not work. + + $ ${AUGUR} filter \ + > --metadata metadata.tsv \ + > --query "coverage >= 0.95 & category == 'B'" \ + > --query-columns coverage:float \ + > --output-strains filtered_strains.txt + ERROR: Query contains an invalid variable. Either (1) a queried column is missing from the metadata, or (2) you are using --query-columns but did not specify all columns. + [2] + +It works again when all columns are specified. + + $ ${AUGUR} filter \ + > --metadata metadata.tsv \ + > --query "coverage >= 0.95 & category == 'B'" \ + > --query-columns coverage:float category:str \ + > --output-strains filtered_strains.txt + 3 strains were dropped during filtering + 3 of these were filtered out by the query: "coverage >= 0.95 & category == 'B'" + 1 strains passed all filters diff --git a/tests/functional/filter/cram/subsample-group-by-missing-error.t b/tests/functional/filter/cram/subsample-group-by-missing-error.t index 54ef691cc..9884c2b5f 100644 --- a/tests/functional/filter/cram/subsample-group-by-missing-error.t +++ b/tests/functional/filter/cram/subsample-group-by-missing-error.t @@ -15,6 +15,7 @@ Error on missing group-by columns. > --group-by year \ > --sequences-per-group 1 \ > --output-metadata metadata-filtered.tsv > /dev/null + WARNING: Column 'date' does not exist in the metadata file. Ignoring it. ERROR: The specified group-by categories (['year']) were not found. Note that using any of ['month', 'week', 'year'] requires a column called 'date'. [2] $ cat metadata-filtered.tsv @@ -26,6 +27,7 @@ Error on missing group-by columns. 
> --group-by invalid \ > --sequences-per-group 1 \ > --output-metadata metadata-filtered.tsv > /dev/null + WARNING: Column 'invalid' does not exist in the metadata file. Ignoring it. ERROR: The specified group-by categories (['invalid']) were not found. [2] $ cat metadata-filtered.tsv