Skip to content

Commit

Permalink
add entity & race normalizations
Browse files Browse the repository at this point in the history
  • Loading branch information
iagocanalejas committed Mar 18, 2024
1 parent f776370 commit f38dacd
Show file tree
Hide file tree
Showing 2 changed files with 9 additions and 0 deletions.
3 changes: 3 additions & 0 deletions rscraping/data/normalization/clubs.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@
"BUEU": ["BUEU TECCARSA"],
"ESTEIRANA": ["ESTEIRANA REMO"],
"A CABANA": ["A CABANA FERROL"],
"MUGARDOS - A CABANA": ["MUGARDOS - A CABANA FERROL"],
"RIVEIRA": ["DE RIVEIRA"],
"ZARAUTZ": ["ZARAUTZ GESALAGA-OKELAN", "ZARAUTZ INMOB. ORIO"],
"PASAI DONIBANE KOXTAPE": ["P.DONIBANE IBERDROLA"],
Expand Down Expand Up @@ -130,6 +131,8 @@ def normalize_club_name(name: str) -> str:
def deacronym_club_name(name: str) -> str:
    """Expand known club acronyms and drop the redundant FERROL suffix.

    When any whitespace-separated token of *name* is exactly ``P``, ``D`` or
    ``PD``, every match of the ``P.D.``-style pattern is rewritten to
    ``PASAIA DONIBANE``. Afterwards ``CABANA FERROL`` is shortened to
    ``CABANA``. The result is passed through ``whitespaces_clean``
    (presumably collapsing stray whitespace; confirm against that helper).

    NOTE(review): the acronym check is token-based, but the substitution is
    applied to the whole string once the check passes.
    """
    acronym_tokens = {"P", "D", "PD"}
    if not acronym_tokens.isdisjoint(name.split()):
        name = re.sub(r"P\.? ?D\.?", "PASAIA DONIBANE", name)

    # str.replace leaves the string untouched when the substring is absent,
    # so no explicit membership guard is required here.
    shortened = name.replace("CABANA FERROL", "CABANA")
    return whitespaces_clean(shortened)


Expand Down
6 changes: 6 additions & 0 deletions rscraping/parsers/html/lgt.py
Original file line number Diff line number Diff line change
Expand Up @@ -243,6 +243,12 @@ def _normalize_race_name(name: str, t_date: date) -> str:
if "TERESA HERRERA" in name: # lgt never saves the final
return "TROFEO TERESA HERRERA" if t_date.isoweekday() == 7 else "TROFEO TERESA HERRERA (CLASIFICATORIA)"

if all(n in name for n in ["ILLA", "SAMERTOLAMEU"]) and t_date.year in [2021, 2022, 2023]:
# HACK: this is a weird flag case in which Meira restarted the edition count for its 'B' team.
# We have "III BANDEIRA ILLA DO SAMERTOLAMEU" in 2017 for its main team and
# "III BANDEIRA ILLA DO SAMERTOLAMEU" in 2023 for its 'B' team, so we need to differentiate them.
return "BANDEIRA ILLA DO SAMERTOLAMEU B"

if "PLAY" in name:
return "PLAY-OFF LGT"

Expand Down

0 comments on commit f38dacd

Please sign in to comment.