Skip to content

Commit

Permalink
feat: retrieve race_ids by flag
Browse files Browse the repository at this point in the history
  • Loading branch information
iagocanalejas committed Apr 10, 2024
1 parent 6ef6407 commit 2bfb8f9
Show file tree
Hide file tree
Showing 7 changed files with 105 additions and 9 deletions.
23 changes: 21 additions & 2 deletions rscraping/clients/traineras.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,10 @@ def get_races_url(year: int, page: int = 1, **_) -> str:
def get_search_races_url(name: str) -> str:
    """Build the traineras.es flag-search URL for the given race name.

    Spaces in the name become ``+`` so the name is usable as a query value.
    """
    query = "+".join(name.split(" "))
    return f"https://traineras.es/banderas?nombre={query}"

@staticmethod
def get_flag_url(flag_id: str) -> str:
return f"https://traineras.es/banderas/{flag_id}"

@staticmethod
def get_rower_url(rower_id: str, **_) -> str:
return f"https://traineras.es/personas/{rower_id}"
Expand Down Expand Up @@ -68,6 +72,20 @@ def validate_url(self, url: str):
if not pattern.match(url):
raise ValueError(f"invalid {url=}")

def get_race_ids_by_flag(self, flag_id: str) -> Generator[str, Any, Any]:
    """
    Find the IDs of the race editions for a given flag.

    Args:
        flag_id (str): The ID of the flag.

    Yields:
        str: Race IDs.
    """
    # fetch the flag's detail page and let the HTML parser extract the
    # edition IDs from the gendered table matching self._is_female
    url = self.get_flag_url(flag_id)
    content = Selector(requests.get(url=url, headers=HTTP_HEADERS()).content.decode("utf-8"))
    yield from self._html_parser.parse_flag_race_ids(content, is_female=self._is_female)

@override
def get_race_by_id(self, race_id: str, **kwargs) -> Race | None:
"""
Expand All @@ -87,14 +105,15 @@ def get_race_by_id(self, race_id: str, **kwargs) -> Race | None:
# search the race name in the flags search page
url = self.get_search_races_url(race.name)
content = Selector(requests.get(url=url, headers=HTTP_HEADERS()).content.decode("utf-8"))
flag_urls = self._html_parser.parse_search_flags(content)
flag_urls = self._html_parser.parse_searched_flag_urls(content)

if len(flag_urls) < 1:
return race

# the first flag should be an exact match of the given one, so we can use it to get the editions
content = Selector(requests.get(url=flag_urls[0], headers=HTTP_HEADERS()).content.decode("utf-8"))
edition = next((e for (y, e) in self._html_parser.parse_flag_editions(content) if y == race.year), None)
editions = self._html_parser.parse_flag_editions(content, is_female=self._is_female)
edition = next((e for (y, e) in editions if y == race.year), None)
if edition:
race.normalized_names = [(n[0], edition) for n in race.normalized_names]

Expand Down
3 changes: 2 additions & 1 deletion rscraping/data/normalization/races.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import re

from pyutils.shortcuts import none
from pyutils.strings import (
apply_replaces,
find_roman,
Expand Down Expand Up @@ -90,7 +91,7 @@ def normalize_name_parts(name: str) -> list[tuple[str, int | None]]:
normalized = remove_parenthesis(whitespaces_clean(name))
normalized = f"{normalized} ({'CLASIFICATORIA'})" if "CLASIFICATORIA" in name else normalized

should_split = not any(r in normalized for r in _NORMALIZED_RACES.keys() if " - " in r)
should_split = none(" - " in r in normalized for r in _NORMALIZED_RACES.keys())
name_parts = normalized.split(" - ") if should_split and not is_play_off(normalized) else [normalized]
if not is_play_off(normalized) and len(name_parts) == 1:
editions = [w for w in normalized.split() if find_roman(w) is not None]
Expand Down
3 changes: 2 additions & 1 deletion rscraping/parsers/html/lgt.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

from parsel.selector import Selector

from pyutils.shortcuts import none
from pyutils.strings import whitespaces_clean
from rscraping.data.checks import is_female, is_play_off
from rscraping.data.constants import (
Expand Down Expand Up @@ -61,7 +62,7 @@ def parse_race(self, selector: Selector, *, results_selector: Selector, race_id:
for (n, e) in normalized_names
]
# try to find the edition in the original name before normalizations
if not any(e is not None for (_, e) in normalized_names):
if none(e for (_, e) in normalized_names):
edition = find_edition(name)
normalized_names = [(n, edition) for (n, _) in normalized_names]
logger.info(f"{self.DATASOURCE}: race normalized to {normalized_names=}")
Expand Down
8 changes: 6 additions & 2 deletions rscraping/parsers/html/traineras.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,10 @@ def parse_race_names(
name = " ".join(n for n in name.split() if n != ttype)
yield RaceName(race_id=row.xpath("//*/td[1]/a/@href").get("").split("/")[-1], name=name)

def parse_flag_race_ids(self, selector: Selector, is_female: bool) -> Generator[str, Any, Any]:
    """Yield the race IDs listed in a flag page.

    The page holds two tables: div[1] for male editions and div[2] for
    female ones; the ID is the last path segment of the third column's link.
    """
    column = 2 if is_female else 1
    table_rows = selector.xpath(f"/html/body/main/div/div/div/div[{column}]/div/table/tr").getall()
    # the first <tr> is the table header, so it is skipped
    hrefs = (Selector(row).xpath("//*/td[3]/a/@href").get("") for row in table_rows[1:])
    return (href.split("/")[-1] for href in hrefs)

def parse_club_race_ids(self, selector: Selector) -> Generator[str, Any, Any]:
    """Yield the race IDs found in a club page's results table."""
    # drop the first <tr>, which is the table header
    table_rows = selector.xpath("/html/body/div[1]/div[2]/div/table/tr").getall()[1:]
    hrefs = (Selector(row).xpath("//*/td[1]/a/@href").get("") for row in table_rows)
    return (href.split("/")[-1] for href in hrefs)
Expand All @@ -171,10 +175,10 @@ def parse_rower_race_ids(self, selector: Selector, year: str | None = None) -> G
if year in selector.xpath("//*/td[2]/text()").get("")
)

def parse_search_flags(self, selector: Selector) -> list[str]:
def parse_searched_flag_urls(self, selector: Selector) -> list[str]:
return selector.xpath("/html/body/div[1]/div[2]/div/div/div[*]/div/div/div[2]/h5/a/@href").getall()

def parse_flag_editions(self, selector: Selector, is_female: bool = False) -> Generator[tuple[int, int], Any, Any]:
def parse_flag_editions(self, selector: Selector, is_female: bool) -> Generator[tuple[int, int], Any, Any]:
table = selector.xpath(f"/html/body/main/div/div/div/div[{2 if is_female else 1}]/div/table").get(None)
if table:
for row in Selector(table).xpath("//*/tr").getall()[1:]:
Expand Down
3 changes: 2 additions & 1 deletion rscraping/parsers/pdf/act.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from fitz import Page

from pyutils.lists import flatten
from pyutils.shortcuts import none
from pyutils.strings import whitespaces_clean
from rscraping.data.constants import (
SYNONYM_COXWAIN,
Expand Down Expand Up @@ -60,7 +61,7 @@ def _parse_name(parts: list[str]) -> tuple[str, str]:
return normalize_race_name(race), normalize_club_name(club)

def _parse_rowers(self, rowers: list[str]) -> tuple[str, str, list[str], list[str]]:
rowers = [r for r in rowers if not any(t for t in self._TRASH if t in r.upper())]
rowers = [rower for rower in rowers if none(t in rower.upper() for t in self._TRASH)]

coach, delegate = self._get_coach_and_delegate(rowers)

Expand Down
62 changes: 62 additions & 0 deletions tests/fixtures/html/traineras_flag.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
<!--
  Test fixture for parse_flag_race_ids: a stripped-down traineras.es flag page.
  The first inner block holds the male classification table (race ids 2476,
  2477, 5814) and the second the female one (2508, 5815). Each table's first
  empty <tr> stands in for the header row that the parser skips.
-->
<html>
<body>
<main>
<div>
<div>
<div>
<div>
<div>
<table>
<tr></tr>
<tr>
<td></td>
<td></td>
<td>
<a href="https://traineras.es/clasificaciones/2476"></a>
</td>
</tr>
<tr>
<td></td>
<td></td>
<td>
<a href="https://traineras.es/clasificaciones/2477"></a>
</td>
</tr>
<tr>
<td></td>
<td></td>
<td>
<a href="https://traineras.es/clasificaciones/5814"></a>
</td>
</tr>
</table>
</div>
</div>

<div>
<div>
<table>
<tr></tr>
<tr>
<td></td>
<td></td>
<td>
<a href="https://traineras.es/clasificaciones/2508"></a>
</td>
</tr>
<tr>
<td></td>
<td></td>
<td>
<a href="https://traineras.es/clasificaciones/5815"></a>
</td>
</tr>
</table>
</div>
</div>
</div>
</div>
</div>
</main>
</body>
</html>
12 changes: 10 additions & 2 deletions tests/parsers/html/traineras_parser_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,14 @@ def test_parse_race_ids(self):
ids = self.parser.parse_race_ids(data, is_female=True)
self.assertEqual(list(ids), ["5456"])

def test_parse_flag_race_ids(self):
    """Male and female race ids come from their respective flag tables."""
    fixture_path = os.path.join(self.fixtures, "traineras_flag.html")
    with open(fixture_path) as fixture:
        page = Selector(fixture.read())

    self.assertEqual(list(self.parser.parse_flag_race_ids(page, is_female=False)), ["2476", "2477", "5814"])
    self.assertEqual(list(self.parser.parse_flag_race_ids(page, is_female=True)), ["2508", "5815"])

def test_parse_club_race_ids(self):
with open(os.path.join(self.fixtures, "traineras_club.html")) as file:
ids = self.parser.parse_club_race_ids(Selector(file.read()))
Expand All @@ -81,13 +89,13 @@ def test_parse_rower_race_ids(self):

def test_parse_search_flags(self):
with open(os.path.join(self.fixtures, "traineras_search_flags.html")) as file:
urls = self.parser.parse_search_flags(Selector(file.read()))
urls = self.parser.parse_searched_flag_urls(Selector(file.read()))
self.assertEqual(urls, ["https://traineras.es/banderas/104#SM", "https://traineras.es/banderas/679#SF"])

def test_parse_flag_editions(self):
with open(os.path.join(self.fixtures, "traineras_flag_editions.html")) as file:
content = Selector(file.read())
male_editions = self.parser.parse_flag_editions(content)
male_editions = self.parser.parse_flag_editions(content, is_female=False)
female_editions = self.parser.parse_flag_editions(content, is_female=True)
self.assertEqual(list(male_editions), [(2007, 1), (2008, 2), (2011, 3), (2023, 14)])
self.assertEqual(list(female_editions), [(2016, 1), (2017, 2), (2023, 8)])
Expand Down

0 comments on commit 2bfb8f9

Please sign in to comment.