Skip to content
This repository has been archived by the owner on Sep 18, 2024. It is now read-only.

Update search query builder to support int and date range queries #195

Merged
merged 7 commits into from
Sep 9, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
57 changes: 57 additions & 0 deletions api/src/adapters/search/opensearch_query_builder.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import datetime
import typing

from src.pagination.pagination_models import SortDirection
Expand All @@ -17,6 +18,7 @@ class SearchQueryBuilder:
* Sorted by relevancy score descending
* Scored on titles containing "king"
* Where the author is one of Brandon Sanderson or J R.R. Tolkien
* With a page count between 300 and 1000
* Returning aggregate counts of books by those authors in the full results

This query could either be built manually and look like:
Expand Down Expand Up @@ -52,6 +54,12 @@ class SearchQueryBuilder:
"Brandon Sanderson",
"J R.R. Tolkien"
]
}
},
{
"range": {
"page_count": {
"gte": 300,
"lte": 1000
}
}
}
]
Expand All @@ -75,6 +83,7 @@ class SearchQueryBuilder:
.sort_by([("relevancy", SortDirection.DESCENDING)])
.simple_query("king", fields=["title.keyword"])
.filter_terms("author.keyword", terms=["Brandon Sanderson", "J R.R. Tolkien"])
.filter_int_range("page_count", 300, 1000)
.aggregation_terms(aggregation_name="author", field_name="author.keyword", minimum_count=0)
.build()
"""
Expand Down Expand Up @@ -150,6 +159,54 @@ def filter_terms(self, field: str, terms: list) -> typing.Self:
self.filters.append({"terms": {field: terms}})
return self

def filter_int_range(
self, field: str, min_value: int | None, max_value: int | None
) -> typing.Self:
"""
For a given field, filter results to a range of integer values.

If min or max is not provided, the range is unbounded and only
affects the minimum or maximum possible value. At least one min or max value must be specified.

These filters do not affect the relevancy score, they are purely
a binary filter on the overall results.
"""
if min_value is None and max_value is None:
raise ValueError("Cannot use int range filter if both min and max are None")

range_filter = {}
if min_value is not None:
range_filter["gte"] = min_value
if max_value is not None:
range_filter["lte"] = max_value

self.filters.append({"range": {field: range_filter}})
return self

def filter_date_range(
self, field: str, start_date: datetime.date | None, end_date: datetime.date | None
) -> typing.Self:
"""
For a given field, filter results to a range of dates.

If start or end is not provided, the range is unbounded and only
affects the start or end date. At least one start or end date must be specified.

These filters do not affect the relevancy score, they are purely
a binary filter on the overall results.
"""
if start_date is None and end_date is None:
raise ValueError("Cannot use date range filter if both start and end are None")

range_filter = {}
if start_date is not None:
range_filter["gte"] = start_date.isoformat()
if end_date is not None:
range_filter["lte"] = end_date.isoformat()

self.filters.append({"range": {field: range_filter}})
return self

def aggregation_terms(
self, aggregation_name: str, field_name: str, size: int = 25, minimum_count: int = 1
) -> typing.Self:
Expand Down
142 changes: 141 additions & 1 deletion api/tests/src/adapters/search/test_opensearch_query_builder.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import uuid
from datetime import date

import pytest

Expand All @@ -12,83 +13,95 @@
"author": "Brandon Sanderson",
"in_stock": True,
"page_count": 1007,
"publication_date": "2010-08-31",
}
# Book fixture records used by the query builder tests below.
# Each record carries the same keys: id, title, author, in_stock,
# page_count and publication_date (an ISO "YYYY-MM-DD" string).
# The range-filter tests select on page_count and publication_date,
# so changing those values changes which records each test case expects.
WORDS_OF_RADIANCE = {
"id": 2,
"title": "Words of Radiance",
"author": "Brandon Sanderson",
"in_stock": False,
"page_count": 1087,
"publication_date": "2014-03-04",
}
OATHBRINGER = {
"id": 3,
"title": "Oathbringer",
"author": "Brandon Sanderson",
"in_stock": True,
"page_count": 1248,
"publication_date": "2017-11-14",
}
RHYTHM_OF_WAR = {
"id": 4,
"title": "Rhythm of War",
"author": "Brandon Sanderson",
"in_stock": False,
"page_count": 1232,
"publication_date": "2020-11-17",
}
GAME_OF_THRONES = {
"id": 5,
"title": "A Game of Thrones",
"author": "George R.R. Martin",
"in_stock": True,
"page_count": 694,
"publication_date": "1996-08-01",
}
CLASH_OF_KINGS = {
"id": 6,
"title": "A Clash of Kings",
"author": "George R.R. Martin",
"in_stock": True,
"page_count": 768,
"publication_date": "1998-11-16",
}
STORM_OF_SWORDS = {
"id": 7,
"title": "A Storm of Swords",
"author": "George R.R. Martin",
"in_stock": True,
"page_count": 973,
"publication_date": "2000-08-08",
}
FEAST_FOR_CROWS = {
"id": 8,
"title": "A Feast for Crows",
"author": "George R.R. Martin",
"in_stock": True,
"page_count": 753,
"publication_date": "2005-10-17",
}
DANCE_WITH_DRAGONS = {
"id": 9,
"title": "A Dance with Dragons",
"author": "George R.R. Martin",
"in_stock": False,
"page_count": 1056,
"publication_date": "2011-07-12",
}
FELLOWSHIP_OF_THE_RING = {
"id": 10,
"title": "The Fellowship of the Ring",
"author": "J R.R. Tolkien",
"in_stock": True,
"page_count": 423,
"publication_date": "1954-07-29",
}
TWO_TOWERS = {
"id": 11,
"title": "The Two Towers",
"author": "J R.R. Tolkien",
"in_stock": True,
"page_count": 352,
"publication_date": "1954-11-11",
}
RETURN_OF_THE_KING = {
"id": 12,
"title": "The Return of the King",
"author": "J R.R. Tolkien",
"in_stock": True,
"page_count": 416,
"publication_date": "1955-10-20",
}

FULL_DATA = [
Expand Down Expand Up @@ -120,7 +133,9 @@ def validate_valid_request(
f"Request generated was invalid and caused an error in search client: {json_value}"
)

assert resp.records == expected_results
assert (
resp.records == expected_results
), f"{[record['title'] for record in resp.records]} != {[expected['title'] for expected in expected_results]}"

if expected_aggregations is not None:
assert resp.aggregations == expected_aggregations
Expand Down Expand Up @@ -364,6 +379,131 @@ def test_query_builder_filter_terms(

validate_valid_request(search_client, search_index, builder, expected_results)

@pytest.mark.parametrize(
    "start_date,end_date,expected_results",
    [
        # Both bounds set, wide enough to match everything
        (date(1900, 1, 1), date(2050, 1, 1), FULL_DATA),
        # Only a start date, early enough to match everything
        (date(1950, 1, 1), None, FULL_DATA),
        # Only an end date, late enough to match everything
        (None, date(2025, 1, 1), FULL_DATA),
        # Bounded range covering only the oldest books
        (
            date(1950, 1, 1),
            date(1960, 1, 1),
            [FELLOWSHIP_OF_THE_RING, TWO_TOWERS, RETURN_OF_THE_KING],
        ),
        # End date only, selecting the oldest few
        (None, date(1990, 1, 1), [FELLOWSHIP_OF_THE_RING, TWO_TOWERS, RETURN_OF_THE_KING]),
        # Start date only, selecting the newest few
        (date(2011, 8, 1), None, [WORDS_OF_RADIANCE, OATHBRINGER, RHYTHM_OF_WAR]),
        # A handful from the middle of the timeline
        (
            date(2005, 1, 1),
            date(2014, 1, 1),
            [WAY_OF_KINGS, FEAST_FOR_CROWS, DANCE_WITH_DRAGONS],
        ),
        # Start and end on the same day
        (date(1954, 7, 29), date(1954, 7, 29), [FELLOWSHIP_OF_THE_RING]),
        # Range in which nothing was published
        (date(1981, 1, 1), date(1989, 1, 1), []),
    ],
)
def test_query_builder_filter_date_range(
    self, search_client, search_index, start_date, end_date, expected_results
):
    """
    A date range filter on publication_date should produce the expected
    request body and, when run through validate_valid_request, the
    expected records.
    """
    query_builder = SearchQueryBuilder().sort_by([])
    query_builder = query_builder.filter_date_range("publication_date", start_date, end_date)

    # Only the bounds that were actually supplied should appear in the request.
    date_bounds = {
        operator: bound.isoformat()
        for operator, bound in (("gte", start_date), ("lte", end_date))
        if bound is not None
    }
    range_clause = {"range": {"publication_date": date_bounds}}

    assert query_builder.build() == {
        "size": 25,
        "from": 0,
        "track_scores": True,
        "query": {"bool": {"filter": [range_clause]}},
    }

    validate_valid_request(search_client, search_index, query_builder, expected_results)

@pytest.mark.parametrize(
    "min_value,max_value,expected_results",
    [
        # Everything matches
        (1, 2000, FULL_DATA),
        # Nothing matches
        (2000, 3000, []),
        # "Short" books
        (300, 700, [GAME_OF_THRONES, FELLOWSHIP_OF_THE_RING, TWO_TOWERS, RETURN_OF_THE_KING]),
        # Maximum only
        (None, 416, [TWO_TOWERS, RETURN_OF_THE_KING]),
        # Minimum only
        (1050, None, [WORDS_OF_RADIANCE, OATHBRINGER, RHYTHM_OF_WAR, DANCE_WITH_DRAGONS]),
        # Mid-length books
        (
            500,
            1010,
            [WAY_OF_KINGS, GAME_OF_THRONES, CLASH_OF_KINGS, STORM_OF_SWORDS, FEAST_FOR_CROWS],
        ),
    ],
)
def test_query_builder_filter_int_range(
    self, search_client, search_index, min_value, max_value, expected_results
):
    """
    An int range filter on page_count should produce the expected request
    body and, when run through validate_valid_request, the expected records.
    """
    query_builder = SearchQueryBuilder().sort_by([])
    query_builder = query_builder.filter_int_range("page_count", min_value, max_value)

    # Only the bounds that were actually supplied should appear in the request.
    int_bounds = {
        operator: bound
        for operator, bound in (("gte", min_value), ("lte", max_value))
        if bound is not None
    }
    range_clause = {"range": {"page_count": int_bounds}}

    assert query_builder.build() == {
        "size": 25,
        "from": 0,
        "track_scores": True,
        "query": {"bool": {"filter": [range_clause]}},
    }

    validate_valid_request(search_client, search_index, query_builder, expected_results)

def test_multiple_ranges(self, search_client, search_index):
    """
    Sanity check that several range filters (here one int range and one
    date range) can be combined in a single query.
    """
    query_builder = SearchQueryBuilder().sort_by([])
    query_builder = query_builder.filter_int_range("page_count", 600, 1100)
    query_builder = query_builder.filter_date_range(
        "publication_date", date(2000, 1, 1), date(2013, 1, 1)
    )

    # Books that satisfy both the page-count and the publication-date bounds.
    matching_books = [WAY_OF_KINGS, STORM_OF_SWORDS, FEAST_FOR_CROWS, DANCE_WITH_DRAGONS]
    validate_valid_request(
        search_client, search_index, query_builder, expected_results=matching_books
    )

def test_filter_int_range_both_none(self):
    """An int range filter with neither bound set must raise a ValueError."""
    with pytest.raises(ValueError, match="Cannot use int range filter"):
        SearchQueryBuilder().filter_int_range("test_field", min_value=None, max_value=None)

def test_filter_date_range_both_none(self):
    """A date range filter with neither bound set must raise a ValueError."""
    with pytest.raises(ValueError, match="Cannot use date range filter"):
        SearchQueryBuilder().filter_date_range("test_field", start_date=None, end_date=None)

@pytest.mark.parametrize(
"query,fields,expected_results,expected_aggregations",
[
Expand Down
Loading