fix: add unaccent to lemmatization

iagocanalejas · Mar 15, 2024 · 4f2439d · 4f2439d
1 parent 6bdd6cf
commit 4f2439d
Show file tree

Hide file tree

Showing 3 changed files with 14 additions and 3 deletions.
diff --git a/requirements.txt b/requirements.txt
@@ -4,7 +4,7 @@ numpy==1.26.4
 opencv-python==4.9.0.80
 openpyxl==3.1.2
 pandas==2.2.1
-parsel==1.8.1
+parsel==1.9.0
 pyarrow==15.0.0
 PyMuPDF==1.23.26
 pytesseract==0.3.10

diff --git a/rscraping/__init__.py b/rscraping/__init__.py
@@ -2,7 +2,7 @@
 from typing import Any
 from collections.abc import Generator
 
-from pyutils.strings import normalize_synonyms, remove_conjunctions, remove_symbols
+from pyutils.strings import normalize_synonyms, remove_conjunctions, remove_symbols, unaccent
 from rscraping.data.constants import SYNONYMS
 from rscraping.data.models import Datasource, Lineup, Race
 from simplemma.simplemma import text_lemmatizer
@@ -103,6 +103,16 @@ def find_lineup(race_id: str, datasource: Datasource, is_female: bool) -> Genera
 
 
 def lemmatize(phrase: str, lang: str = "es") -> list[str]:
+    """
+    Lemmatize a phrase using the simplemma library. The phrase is preprocessed before lemmatization.
+    Synonyms are normalized, conjunctions are removed, symbols are removed, and accents are removed.
+
+    Parameters:
+    - phrase (str): The phrase to lemmatize.
+    - lang (str): The language of the phrase (default: "es").
+
+    Returns: list[str]: The unique lemmas found in the phrase (duplicates are dropped; order is not guaranteed).
+    """
     phrase = normalize_synonyms(phrase, SYNONYMS)
-    phrase = remove_symbols(remove_conjunctions(phrase))
+    phrase = unaccent(remove_symbols(remove_conjunctions(phrase)))
     return list(set(text_lemmatizer(phrase, lang=lang)))
diff --git a/rscraping/data/normalization/races.py b/rscraping/data/normalization/races.py
@@ -178,6 +178,7 @@ def amend_race_name(name: str) -> str:
     re.sub(r"(CONCELLO)( DE)?", "CONCELLO DE", name)
     name = name.replace("BILBOKO BANDERA - BANDERA DE BILBAO", "BANDERA DE BILBAO")
     name = name.replace("JESÚS TENORIO", "XESÚS TENORIO")
+    name = name.replace("CCD CESANTES", "CESANTES")
 
     name = name.replace("/", "-").replace("-", " - ")
     for a, b in _MISSPELLINGS: