Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

chore: more weather pathfinder improvements #785

Merged
merged 5 commits into from
Feb 11, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
100 changes: 51 additions & 49 deletions merino/providers/suggest/weather/backends/accuweather/pathfinder.py
Original file line number Diff line number Diff line change
@@ -1,76 +1,53 @@
"""Pathfinder - a utility to reconcile geolocation distinctions between MaxmindDB and AccuWeather."""

import typing
import unicodedata
import re

from typing import Any, Awaitable, Callable, Generator, Optional

from merino.middleware.geolocation import Location
from merino.providers.suggest.weather.backends.protocol import WeatherContext

MaybeStr = Optional[str]
Pair = tuple[MaybeStr, MaybeStr]

LOCALITY_SUFFIX_PATTERN: re.Pattern = re.compile(r"\s+(city|municipality)$", re.IGNORECASE)
SUCCESSFUL_REGIONS_MAPPING: dict[tuple[str, str], str | None] = {("GB", "London"): "LND"}
REGION_MAPPING_EXCLUSIONS: frozenset = frozenset(
["AU", "CA", "CN", "DE", "ES", "FR", "GB", "GR", "IT", "PL", "PT", "RU", "US"]
)
CITY_NAME_CORRECTION_MAPPING: dict[str, str] = {
"Baie Ste. Anne": "Baie-Sainte-Anne",
"Banhā": "Banha",
"Bogota D.C.": "Bogota",
"Centro Municipality": "Centro",
"Chihuahua City": "Chihuahua",
"Chūōku": "Chuoku",
"Collingwood": "Collingwood Corner",
"Dawson City": "Dawson",
"Délı̨ne": "Deline",
"Đồng Nại": "Dong Nai",
"Ejido Culiacán (Culiacancito)": "Ejido Culiacán",
"Gharroli": "Gharoli",
"Grand Bay–Westfield": "Grand Bay Westfield",
"Guanajuato City": "Guanajuato",
"Gustavo Adolfo Madero": "Gustavo A. Madero",
"Gāndhīdhām": "Gandhidham",
"Hājīpur": "Hajipur",
"Haʻikū": "Haiku",
"Hoʻolehua": "Hoolehua",
"Ixtapa-Zihuatanejo": "Zihuatanejo",
"Jingūmae": "Jingumae",
"Kleinburg Station": "Kleinburg",
"Köseköy": "Kosekoy",
"Kīhei": "Kihei",
"Lake Shasta": "Shasta Lake",
"La'ie": "Laie",
"Llambí Campbell": "Llambi Campbell",
"Luis Eduardo Magalhães": "Luis Eduardo Magalhaes",
"Magnesia on the Maeander": "Manisa",
"Mendocino City": "Mendocino",
"México": "Mexico",
"Middlebury (village)": "Middlebury",
"Minamirokugō": "Minamirokugo",
"Mitchell/Ontario": "Mitchell",
"Montreal East": "Montreal",
"Montreal West": "Montreal",
"Napier City": "Napier",
"Naucalpan": "Naucalpan de Juárez",
"Nishiōizumi": "Nishioizumi",
"Ōkubo-naka": "Okubo naka",
"Ōmorikita": "Omorikita",
"Orléans": "Orleans",
"Panjim": "Panaji",
"Pilāni": "Pilani",
"Pohénégamook": "Pohenegamook",
"Puebla City": "Puebla",
"Querétaro City": "Querétaro",
"Sainte-Clotilde-de-Châteauguay": "Sainte-Clotilde-de-Chateauguay",
"Sainte-Geneviève": "Sainte-Genevieve",
"Saint-Barnabe": "Saint-Barnabé",
"Saint-Raymond-de-Portneuf": "Saint-Raymond",
"San Luis Potosí City": "San Luis Potosí",
"Santiago de Cali": "Cali",
"Sōsa": "Sosa-shi",
"Tracadie–Sheila": "Tracadie Sheila",
"Yunderup": "South Yunderup",
"Zacoalco": "Zacoalco de Torres",
"‘Aiea": "Aiea",
}

SKIP_CITIES_MAPPING: dict[tuple[str, str | None, str], int] = {
Expand All @@ -88,7 +65,24 @@
}


def compass(location: Location) -> Generator[Pair, None, None]:
def normalize_string(input_str: str) -> str:
"""Normalize string with special chcarcters"""
return unicodedata.normalize("NFKD", input_str).encode("ascii", "ignore").decode("ascii")


def remove_locality_suffix(city: str) -> str:
"""Remove either city or municipality suffix from a city name"""
return LOCALITY_SUFFIX_PATTERN.sub("", city)


CITY_NAME_NORMALIZERS: list[Callable[[str], str]] = [
lambda a: a,
normalize_string,
remove_locality_suffix,
]


def compass(location: Location) -> Generator[MaybeStr, None, None]:
"""Generate all the regions based on a `Location`.

It will generate ones that are more likely to produce a valid result based on heuristics.
Expand All @@ -98,8 +92,6 @@ def compass(location: Location) -> Generator[Pair, None, None]:
Returns:
- region string that could be None.
"""
# TODO(nanj): add more heuristics to here.

country = location.country
regions = location.regions
city = location.city
Expand All @@ -111,18 +103,18 @@ def compass(location: Location) -> Generator[Pair, None, None]:
country,
city,
) in SUCCESSFUL_REGIONS_MAPPING: # dynamic rules we've learned
yield SUCCESSFUL_REGIONS_MAPPING[(country, city)], city
yield SUCCESSFUL_REGIONS_MAPPING[(country, city)]
case ("AU" | "CA" | "CN" | "DE" | "FR" | "GB" | "PL" | "PT" | "RU" | "US", _):
yield regions[0], city
yield regions[0]
# use the most specific region
case ("IT" | "ES" | "GR", _):
yield regions[-1], city # use the least specific region
yield regions[-1] # use the least specific region
case _: # Fall back to try all regions
regions_to_try = [*regions, None]
for region in regions_to_try:
yield region, city
yield region
else:
yield None, city
yield None


async def explore(
Expand All @@ -149,19 +141,29 @@ async def explore(
is_skipped = False
geolocation = weather_context.geolocation
country = geolocation.country
for region, city in compass(weather_context.geolocation):
if country and city and (country, region, city) in SKIP_CITIES_MAPPING:
# increment since we tried to look up this combo again.
increment_skip_cities_mapping(country, region, city)
return None, True

weather_context.selected_region = region
geolocation.city = city

res = await probe(weather_context)

if res is not None:
return res, is_skipped
city: str = typing.cast(str, geolocation.city) # tell the type checker it's truly not None
# map is lazy, so items of `cities` would only be evaluated one by one if needed
cities = map(lambda fn: fn(city), CITY_NAME_NORMALIZERS)
explored_cities: list[str] = [] # store the explored cities to avoid duplicates

for region in compass(weather_context.geolocation):
for city in cities:
if city in explored_cities:
continue
else:
explored_cities.append(city)
if country and (country, region, city) in SKIP_CITIES_MAPPING:
# increment since we tried to look up this combo again.
increment_skip_cities_mapping(country, region, city)
return None, True

weather_context.selected_region = region
geolocation.city = city

res = await probe(weather_context)

if res is not None:
return res, is_skipped

return None, is_skipped

Expand Down
48 changes: 39 additions & 9 deletions tests/unit/providers/suggest/weather/backends/test_pathfinder.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,15 @@

"""Unit tests for the Accuweather pathfinder module."""

from typing import Tuple

import pytest

from merino.middleware.geolocation import Location
from merino.providers.suggest.weather.backends.accuweather.pathfinder import (
compass,
set_region_mapping,
clear_region_mapping,
normalize_string,
remove_locality_suffix,
)


Expand All @@ -21,27 +21,27 @@
[
(
Location(country="CA", regions=["BC"], city="Vancouver"),
("BC", "Vancouver"),
"BC",
),
(
Location(country="IT", regions=["MT", "77"], city="Matera"),
("77", "Matera"),
"77",
),
(
Location(country="AR", regions=["B", "5"], city="La Plata"),
("STE", "La Plata"),
"STE",
),
(
Location(country="BR", regions=["DF"], city="Brasilia"),
("DF", "Brasilia"),
"DF",
),
(
Location(country="IE", regions=None, city="Dublin"),
(None, "Dublin"),
None,
),
(
Location(country="CA", regions=["ON"], city="Mitchell/Ontario"),
("ON", "Mitchell"),
"ON",
),
],
ids=[
Expand All @@ -53,9 +53,39 @@
"Corrected City Name",
],
)
def test_compass(location: Location, expected_region_and_city: Tuple) -> None:
def test_compass(location: Location, expected_region_and_city: str) -> None:
"""Test country that returns the most specific region."""
set_region_mapping("AR", "La Plata", "STE")
assert next(compass(location)) == expected_region_and_city

clear_region_mapping()


@pytest.mark.parametrize(
"input_string, expected_output",
[
("Köseköy", "Kosekoy"),
("Kīhei", "Kihei"),
("Llambí Campbell", "Llambi Campbell"),
("Luis Eduardo Magalhães", "Luis Eduardo Magalhaes"),
("México", "Mexico"),
("Minamirokugō", "Minamirokugo"),
("Orléans", "Orleans"),
],
)
def test_normalize_string(input_string, expected_output) -> None:
"""Test the normalization of strings with special characters"""
assert normalize_string(input_string) == expected_output


@pytest.mark.parametrize(
"input_string, expected_output",
[
("Querétaro City", "Querétaro"),
("Centro Municipality", "Centro"),
("Burnaby", "Burnaby"),
],
)
def test_remove_locality_suffix(input_string, expected_output) -> None:
"""Test locality suffix is removed"""
assert remove_locality_suffix(input_string) == expected_output
Loading