Skip to content

Commit

Permalink
fix: add unaccent to lemmatization
Browse files Browse the repository at this point in the history
  • Loading branch information
iagocanalejas committed Mar 15, 2024
1 parent 6bdd6cf commit 4f2439d
Show file tree
Hide file tree
Showing 3 changed files with 14 additions and 3 deletions.
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ numpy==1.26.4
opencv-python==4.9.0.80
openpyxl==3.1.2
pandas==2.2.1
parsel==1.8.1
parsel==1.9.0
pyarrow==15.0.0
PyMuPDF==1.23.26
pytesseract==0.3.10
Expand Down
14 changes: 12 additions & 2 deletions rscraping/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from typing import Any
from collections.abc import Generator

from pyutils.strings import normalize_synonyms, remove_conjunctions, remove_symbols
from pyutils.strings import normalize_synonyms, remove_conjunctions, remove_symbols, unaccent
from rscraping.data.constants import SYNONYMS
from rscraping.data.models import Datasource, Lineup, Race
from simplemma.simplemma import text_lemmatizer
Expand Down Expand Up @@ -103,6 +103,16 @@ def find_lineup(race_id: str, datasource: Datasource, is_female: bool) -> Genera


def lemmatize(phrase: str, lang: str = "es") -> list[str]:
    """
    Lemmatize a phrase using the simplemma library.

    The phrase is preprocessed before lemmatization, in this order: synonyms are normalized
    against SYNONYMS, conjunctions are removed, symbols are removed, and accents are removed.

    Parameters:
    - phrase (str): The phrase to lemmatize.
    - lang (str): The language of the phrase (default: "es").

    Returns: list[str]: The distinct lemmas found in the phrase. Duplicates are dropped through a
    set, so the order of the returned list is not guaranteed.
    """
    phrase = normalize_synonyms(phrase, SYNONYMS)
    # One preprocessing pass only: strip conjunctions and symbols, then remove accents so the
    # lemmatizer receives the accent-free form.
    phrase = unaccent(remove_symbols(remove_conjunctions(phrase)))
    return list(set(text_lemmatizer(phrase, lang=lang)))
1 change: 1 addition & 0 deletions rscraping/data/normalization/races.py
Original file line number Diff line number Diff line change
Expand Up @@ -178,6 +178,7 @@ def amend_race_name(name: str) -> str:
re.sub(r"(CONCELLO)( DE)?", "CONCELLO DE", name)
name = name.replace("BILBOKO BANDERA - BANDERA DE BILBAO", "BANDERA DE BILBAO")
name = name.replace("JESÚS TENORIO", "XESÚS TENORIO")
name = name.replace("CCD CESANTES", "CESANTES")

name = name.replace("/", "-").replace("-", " - ")
for a, b in _MISSPELLINGS:
Expand Down

0 comments on commit 4f2439d

Please sign in to comment.