Skip to content

Commit

Permalink
add known races normalization
Browse files Browse the repository at this point in the history
  • Loading branch information
iagocanalejas committed Nov 8, 2023
1 parent bbf2b80 commit 1758d73
Show file tree
Hide file tree
Showing 2 changed files with 17 additions and 3 deletions.
6 changes: 3 additions & 3 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
fillpdf==0.7.2
inquirer==3.1.3
matplotlib==3.8.0
matplotlib==3.8.1
numpy==1.26.1
opencv-python==4.8.1.78
openpyxl==3.1.2
pandas==2.1.1
pandas==2.1.2
parsel==1.8.1
PyMuPDF==1.23.4
PyMuPDF==1.23.6
pytesseract==0.3.10
pyutils @ git+https://github.com/iagocanalejas/pyutils.git@master
reportlab~=4.0.5
Expand Down
14 changes: 14 additions & 0 deletions rscraping/data/normalization/races.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,11 @@
]


# Known race-name synonyms, consumed by normalize_race_known_synonyms().
# Each key is the canonical name that function returns; each value is a list of
# keyword groups. A name is replaced by the key when it is contained in the
# space-joined group or when every keyword of a group occurs in the name.
_NORMALIZED_RACES: dict[str, list[list[str]]] = {
    "DONIBANE ZIBURUKO ESTROPADAK": [["SAN", "JUAN", "LUZ"]],
}


def normalize_name_parts(normalized_name: str) -> list[tuple[str, int | None]]:
parts: list[tuple[str, int | None]] = []
normalized_name = remove_parenthesis(whitespaces_clean(normalized_name))
Expand All @@ -51,6 +56,7 @@ def normalize_race_name(name: str) -> str:
name = amend_race_name(name)
name = remove_league_indicator(name)
name = remove_race_sponsor(name)
name = normalize_race_known_synonyms(name)

return whitespaces_clean(name)

Expand Down Expand Up @@ -111,3 +117,11 @@ def amend_race_name(name: str) -> str:
for a, b in _MISSPELLINGS:
name = name.replace(a, b)
return whitespaces_clean(name)


def normalize_race_known_synonyms(name: str) -> str:
    """Replace a race name with its canonical form when it is a known synonym.

    The lookup table ``_NORMALIZED_RACES`` maps a canonical name to a list of
    keyword groups. A name matches a group when it is a substring of the
    space-joined keywords, or when every keyword of the group occurs in it.

    Args:
        name: the race name to check.

    Returns:
        The canonical name of the first matching entry, or ``name`` unchanged
        when nothing matches or when ``name`` is blank.
    """
    # A blank string is a substring of every string, so without this guard an
    # empty (or whitespace-only) name would be rewritten to the first canonical
    # name in the table.
    if not name.strip():
        return name

    for synonym, values in _NORMALIZED_RACES.items():
        for keywords in values:
            if name in " ".join(keywords) or all(word in name for word in keywords):
                return synonym
    return name

0 comments on commit 1758d73

Please sign in to comment.