diff --git a/requirements-dev.txt b/requirements-dev.txt
index 0196252..3aa9824 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -1,3 +1,3 @@
 -r requirements.txt
-coverage==7.6.4
+coverage==7.6.8
 pytest==8.3.3
diff --git a/requirements.txt b/requirements.txt
index 5597b5b..8b37974 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,5 +4,5 @@ pandas==2.2.3
 parsel==1.9.1
 pyutils @ git+https://github.com/iagocanalejas/pyutils.git@master
 requests==2.32.3
-simplemma==1.1.1
+simplemma==1.1.2
 tables==3.10.1
diff --git a/rscraping/data/checks.py b/rscraping/data/checks.py
index 182413d..dc3ed63 100644
--- a/rscraping/data/checks.py
+++ b/rscraping/data/checks.py
@@ -5,7 +5,11 @@
 
 
 def should_be_time_trial(name: str, date: date) -> bool:
-    return is_play_off(name) or (all(w in name.split() for w in ["TERESA", "HERRERA"]) and date.isoweekday() == 6)
+    return (
+        is_play_off(name)
+        or (all(w in name.split() for w in ["TERESA", "HERRERA"]) and date.isoweekday() == 6)
+        or (all(w in name.split() for w in ["VILLA", "BILBAO"]))
+    )
 
 
 def is_play_off(name: str) -> bool:
diff --git a/rscraping/data/constants.py b/rscraping/data/constants.py
index 896f5b8..d0f6b4e 100644
--- a/rscraping/data/constants.py
+++ b/rscraping/data/constants.py
@@ -101,6 +101,7 @@ def HTTP_HEADERS():
 SYNONYM_DIPUTATION = "DIPUTACION"
 SYNONYM_QUALIFYING = "CLASIFICATORIA"
 SYNONYM_PORT = "PUERTO"
+SYNONYM_BEACH = "PLAYA"
 SYNONYM_BAY = "BAHIA"
 SYNONYM_ROWER = "REMERO"
 SYNONYM_COXWAIN = "PATRON"
@@ -146,8 +147,9 @@ def HTTP_HEADERS():
     SYNONYM_RACE: ["REGATA", "REGATAS", "ESTROPADA", "ESTROPADAK"],
     SYNONYM_DIPUTATION: ["DEPUTACIÓN", "DIPUTACIÓN", "DEPUTACION", "DIPUTACION"],
     SYNONYM_QUALIFYING: ["CLASIFICATORIA", "SAILKAPEN OROKORRA", "CLASIFICACION GENERAL", "ELIMINATORIA"],
-    SYNONYM_PORT: ["PEIRAO COMERCIAL", "PUERTO", "PORTO", "PEIRAO", "MUELLE", "PRAIA"],
+    SYNONYM_PORT: ["PEIRAO COMERCIAL", "PUERTO", "PORTO", "PEIRAO", "MUELLE"],
     SYNONYM_BAY: ["BAIA", "BAHIA"],
+    SYNONYM_BEACH: ["PLAYA", "HONDARTZA", "PRAIA", "PLAGE"],
     SYNONYM_ROWER: ["REMERO", "REMERA", "REMEIRO", "REMEIRA", "REMEROS", "REMERAS", "REMEIRAS", "REMEIROS"],
     SYNONYM_COXWAIN: ["PATRON", "PATRÓN", "PATRONA", "PATROA"],
     SYNONYM_HOMEGROWN: ["CANTERANO", "CANTEIRAN", "CANTEIRÁN", "CANTERANA", "CANTEIRA", "CANTEIRÁ"],
diff --git a/rscraping/data/normalization/lemmatize.py b/rscraping/data/normalization/lemmatize.py
index 0c76b47..430ba7e 100644
--- a/rscraping/data/normalization/lemmatize.py
+++ b/rscraping/data/normalization/lemmatize.py
@@ -1,13 +1,13 @@
 from simplemma.lemmatizer import text_lemmatizer
-from pyutils.strings import normalize_synonyms, remove_conjunctions, remove_symbols, unaccent
+from pyutils.strings import normalize_synonyms, remove_conjunctions, remove_parenthesis, remove_symbols, unaccent
 
 from rscraping.data.constants import SYNONYMS
 
 
 def lemmatize(phrase: str, lang: str = "es") -> list[str]:
     """
     Lemmatize a phrase using the simplemma library. The phrase is preprocessed before lemmatization.
-    Synonyms are normalized, conjunctions are removed, symbols are removed, and accents are removed.
+    Synonyms are normalized, and conjunctions and symbols are removed. Accents are removed after lemmatization.
 
     Parameters:
     - phrase (str): The phrase to lemmatize.
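Note on the lemmatize() change: unaccenting now happens per token after lemmatization instead of on the whole phrase before it. A minimal sketch of why the order matters, assuming simplemma's Spanish data resolves the accented inflection (the example sentence is illustrative, not part of the patch):

    from simplemma.lemmatizer import text_lemmatizer
    from pyutils.strings import unaccent

    # Accented inflections can be resolved to their lemma, so unaccenting must wait:
    lemmas = [unaccent(w) for w in text_lemmatizer("se suspendió la regata", lang="es")]
    # "suspender" is expected in the result, matching the infinitive lemmas
    # that penalty.py now keys on ("anular", "cancelar", "suspender").

    # Unaccenting first loses that information:
    lemmas = [unaccent(w) for w in text_lemmatizer("se suspendio la regata", lang="es")]
    # "suspendio" is expected to pass through unlemmatized, which is why the
    # old preterite entries ("anulo", "cancelo", "suspendio") could be dropped.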
""" phrase = normalize_synonyms(phrase, SYNONYMS) - phrase = unaccent(remove_symbols(remove_conjunctions(phrase))) - return list(set(text_lemmatizer(phrase, lang=lang))) + phrase = remove_symbols(remove_conjunctions(phrase)).replace(".", " ") + phrase = remove_parenthesis(phrase, preserve_content=True) + tokens = [unaccent(w).strip() for w in set(text_lemmatizer(phrase, lang=lang))] + + return tokens diff --git a/rscraping/data/normalization/penalty.py b/rscraping/data/normalization/penalty.py index a921495..72e5c88 100644 --- a/rscraping/data/normalization/penalty.py +++ b/rscraping/data/normalization/penalty.py @@ -23,18 +23,12 @@ _CANCELLED_LEMMAS = [ ["anular", "regata"], - ["anulo", "regata"], ["cancelar", "regata"], - ["cancelo", "regata"], ["suspender", "regata"], - ["suspendio", "regata"], ["anular", "prueba"], - ["anulo", "prueba"], ["cancelar", "prueba"], - ["cancelo", "prueba"], ["suspender", "prueba"], - ["suspendio", "prueba"], - ["tanda", "no", "salio"], + ["tanda", "no", "salir"], ] @@ -146,13 +140,11 @@ def retrieve_penalty_times(note: str) -> dict[str, time]: ["abrir", "molesto"], ["poner", "delante"], ["ponerse", "delante"], - ["salio", "abrir"], + ["salir", "abrir"], ["abordaje"], ["abordar"], ["abordo"], ["chocar"], - ["choco"], - ["colisiono"], ["colisionar"], ["estorbo"], ["estorbar"], @@ -164,10 +156,10 @@ def retrieve_penalty_times(note: str) -> dict[str, time]: ], COXWAIN_WEIGHT_LIMIT: [], DOPING: [["antidoping"], ["doping"], ["positivo"]], - LACK_OF_COMPETITIVENESS: [["falto", "voluntad", "competir"]], + LACK_OF_COMPETITIVENESS: [["faltar", "voluntad", "competir"]], NO_LINE_START: [], - NULL_START: [["nulo", "salida"], ["tarde", "salida"], ["deberia", "baliza", "salida"]], - SINKING: [["hundio"], ["entrar", "agua"]], + NULL_START: [["nulo", "salida"], ["tarde", "salida"], ["deber", "baliza", "salir"]], + SINKING: [["hundir"], ["entrar", "agua"]], WRONG_LINEUP: [["ficha", "remero"], ["remero", "licencia"], ["alineacion", "indebido"], ["juvenil"]], } diff --git a/rscraping/parsers/html/traineras.py b/rscraping/parsers/html/traineras.py index 77f23c4..c4edc57 100644 --- a/rscraping/parsers/html/traineras.py +++ b/rscraping/parsers/html/traineras.py @@ -57,8 +57,8 @@ class TrainerasHtmlParser(HtmlParser): DATASOURCE = Datasource.TRAINERAS _FEMALE = ["SF", "VF", "JF", "F"] - _VETERAN = ["VF", "VM"] _MIX = ["M"] + _VETERAN = ["VF", "VM"] _SCHOOL = ["JM", "JF", "CM", "CF"] @override @@ -343,7 +343,7 @@ def get_distance(self, selector: Selector) -> int | None: return int(part.replace(" metros", "")) if part is not None else None def get_laps(self, participant: Selector) -> list[str]: - laps = [e for e in participant.xpath("//*/td/text()").getall() if any(c in e for c in [":", "."])] + laps = [e for e in participant.xpath("//*/td/text()").getall() if any(c in e for c in [":", ".", ","])] return [t.strftime("%M:%S.%f") for t in [normalize_lap_time(e) for e in laps if e] if t is not None] def is_disqualified(self, participant: Selector) -> bool: @@ -415,8 +415,8 @@ def _get_matching_flag_table(self, gender: str, category: str, selector: Selecto words = ["MIXTO"] titles = selector.xpath("/html/body/main/div/div/div/div[*]/h2/text()").getall() - idx = next((i for i, t in enumerate(titles) if all(w in t for w in words)), 0) - return selector.xpath(f"/html/body/main/div/div/div/div[{idx + 1}]/div/table").get(None) + idx = next((i for i, t in enumerate(titles) if all(w in t for w in words)), -1) + return selector.xpath(f"/html/body/main/div/div/div/div[{idx + 
1}]/div/table").get(None) if idx >= 0 else None @staticmethod def _participants_path(selector: Selector) -> str: diff --git a/tests/normalization/time_normalization_test.py b/tests/normalization/time_normalization_test.py index f5652bd..6cda651 100644 --- a/tests/normalization/time_normalization_test.py +++ b/tests/normalization/time_normalization_test.py @@ -14,6 +14,7 @@ def setUp(self) -> None: "028:24", "00:009", "21.13.66", + "11,10", ] def test_lap_time_normalization(self): @@ -25,6 +26,7 @@ def test_lap_time_normalization(self): "28:24.00", None, "21:13.66", + "11:10.00", ] for idx, lap_time in enumerate(self.TIMES):