Skip to content

Commit

Permalink
feat: retrieve race_ids by flag
Browse files Browse the repository at this point in the history
  • Loading branch information
iagocanalejas committed Apr 10, 2024
1 parent 6ef6407 commit 2bfb8f9
Show file tree
Hide file tree
Showing 7 changed files with 105 additions and 9 deletions.
23 changes: 21 additions & 2 deletions rscraping/clients/traineras.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,10 @@ def get_races_url(year: int, page: int = 1, **_) -> str:
def get_search_races_url(name: str) -> str:
    """Build the traineras.es flag-search URL for the given race name.

    Spaces in the name become ``+`` so the name is usable as a query value.
    """
    query = "+".join(name.split(" "))
    return f"https://traineras.es/banderas?nombre={query}"

@staticmethod
def get_flag_url(flag_id: str) -> str:
return f"https://traineras.es/banderas/{flag_id}"

@staticmethod
def get_rower_url(rower_id: str, **_) -> str:
return f"https://traineras.es/personas/{rower_id}"
Expand Down Expand Up @@ -68,6 +72,20 @@ def validate_url(self, url: str):
if not pattern.match(url):
raise ValueError(f"invalid {url=}")

def get_race_ids_by_flag(self, flag_id: str) -> Generator[str, Any, Any]:
    """
    Find the IDs of the race editions for a given flag.

    Args:
        flag_id (str): The ID of the flag.

    Yields:
        str: Race IDs.
    """
    # fetch the flag's detail page and let the HTML parser extract the
    # edition IDs from the gendered table matching self._is_female
    url = self.get_flag_url(flag_id)
    content = Selector(requests.get(url=url, headers=HTTP_HEADERS()).content.decode("utf-8"))
    yield from self._html_parser.parse_flag_race_ids(content, is_female=self._is_female)

@override
def get_race_by_id(self, race_id: str, **kwargs) -> Race | None:
"""
Expand All @@ -87,14 +105,15 @@ def get_race_by_id(self, race_id: str, **kwargs) -> Race | None:
# search the race name in the flags search page
url = self.get_search_races_url(race.name)
content = Selector(requests.get(url=url, headers=HTTP_HEADERS()).content.decode("utf-8"))
flag_urls = self._html_parser.parse_search_flags(content)
flag_urls = self._html_parser.parse_searched_flag_urls(content)

if len(flag_urls) < 1:
return race

# the first flag should be an exact match of the given one, so we can use it to get the editions
content = Selector(requests.get(url=flag_urls[0], headers=HTTP_HEADERS()).content.decode("utf-8"))
edition = next((e for (y, e) in self._html_parser.parse_flag_editions(content) if y == race.year), None)
editions = self._html_parser.parse_flag_editions(content, is_female=self._is_female)
edition = next((e for (y, e) in editions if y == race.year), None)
if edition:
race.normalized_names = [(n[0], edition) for n in race.normalized_names]

Expand Down
3 changes: 2 additions & 1 deletion rscraping/data/normalization/races.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import re

from pyutils.shortcuts import none
from pyutils.strings import (
apply_replaces,
find_roman,
Expand Down Expand Up @@ -90,7 +91,7 @@ def normalize_name_parts(name: str) -> list[tuple[str, int | None]]:
normalized = remove_parenthesis(whitespaces_clean(name))
normalized = f"{normalized} ({'CLASIFICATORIA'})" if "CLASIFICATORIA" in name else normalized

should_split = not any(r in normalized for r in _NORMALIZED_RACES.keys() if " - " in r)
should_split = none(" - " in r in normalized for r in _NORMALIZED_RACES.keys())
name_parts = normalized.split(" - ") if should_split and not is_play_off(normalized) else [normalized]
if not is_play_off(normalized) and len(name_parts) == 1:
editions = [w for w in normalized.split() if find_roman(w) is not None]
Expand Down
3 changes: 2 additions & 1 deletion rscraping/parsers/html/lgt.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

from parsel.selector import Selector

from pyutils.shortcuts import none
from pyutils.strings import whitespaces_clean
from rscraping.data.checks import is_female, is_play_off
from rscraping.data.constants import (
Expand Down Expand Up @@ -61,7 +62,7 @@ def parse_race(self, selector: Selector, *, results_selector: Selector, race_id:
for (n, e) in normalized_names
]
# try to find the edition in the original name before normalizations
if not any(e is not None for (_, e) in normalized_names):
if none(e for (_, e) in normalized_names):
edition = find_edition(name)
normalized_names = [(n, edition) for (n, _) in normalized_names]
logger.info(f"{self.DATASOURCE}: race normalized to {normalized_names=}")
Expand Down
8 changes: 6 additions & 2 deletions rscraping/parsers/html/traineras.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,10 @@ def parse_race_names(
name = " ".join(n for n in name.split() if n != ttype)
yield RaceName(race_id=row.xpath("//*/td[1]/a/@href").get("").split("/")[-1], name=name)

def parse_flag_race_ids(self, selector: Selector, is_female: bool) -> Generator[str, Any, Any]:
    """Yield the race IDs listed in a flag page.

    The page holds two tables: div[1] for male editions and div[2] for
    female ones; the ID is the last path segment of the third column's link.
    """
    column = 2 if is_female else 1
    table_rows = selector.xpath(f"/html/body/main/div/div/div/div[{column}]/div/table/tr").getall()
    # the first <tr> is the table header, so it is skipped
    hrefs = (Selector(row).xpath("//*/td[3]/a/@href").get("") for row in table_rows[1:])
    return (href.split("/")[-1] for href in hrefs)

def parse_club_race_ids(self, selector: Selector) -> Generator[str, Any, Any]:
    """Yield the race IDs found in a club page's results table."""
    # drop the first <tr>, which is the table header
    table_rows = selector.xpath("/html/body/div[1]/div[2]/div/table/tr").getall()[1:]
    hrefs = (Selector(row).xpath("//*/td[1]/a/@href").get("") for row in table_rows)
    return (href.split("/")[-1] for href in hrefs)
Expand All @@ -171,10 +175,10 @@ def parse_rower_race_ids(self, selector: Selector, year: str | None = None) -> G
if year in selector.xpath("//*/td[2]/text()").get("")
)

def parse_search_flags(self, selector: Selector) -> list[str]:
def parse_searched_flag_urls(self, selector: Selector) -> list[str]:
return selector.xpath("/html/body/div[1]/div[2]/div/div/div[*]/div/div/div[2]/h5/a/@href").getall()

def parse_flag_editions(self, selector: Selector, is_female: bool = False) -> Generator[tuple[int, int], Any, Any]:
def parse_flag_editions(self, selector: Selector, is_female: bool) -> Generator[tuple[int, int], Any, Any]:
table = selector.xpath(f"/html/body/main/div/div/div/div[{2 if is_female else 1}]/div/table").get(None)
if table:
for row in Selector(table).xpath("//*/tr").getall()[1:]:
Expand Down
3 changes: 2 additions & 1 deletion rscraping/parsers/pdf/act.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from fitz import Page

from pyutils.lists import flatten
from pyutils.shortcuts import none
from pyutils.strings import whitespaces_clean
from rscraping.data.constants import (
SYNONYM_COXWAIN,
Expand Down Expand Up @@ -60,7 +61,7 @@ def _parse_name(parts: list[str]) -> tuple[str, str]:
return normalize_race_name(race), normalize_club_name(club)

def _parse_rowers(self, rowers: list[str]) -> tuple[str, str, list[str], list[str]]:
rowers = [r for r in rowers if not any(t for t in self._TRASH if t in r.upper())]
rowers = [rower for rower in rowers if none(t in rower.upper() for t in self._TRASH)]

coach, delegate = self._get_coach_and_delegate(rowers)

Expand Down
62 changes: 62 additions & 0 deletions tests/fixtures/html/traineras_flag.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
<!--
  Test fixture for parse_flag_race_ids: a stripped-down traineras.es flag page.
  The first inner block holds the male classification table (race ids 2476,
  2477, 5814) and the second the female one (2508, 5815). Each table's first
  empty <tr> stands in for the header row that the parser skips.
-->
<html>
<body>
<main>
<div>
<div>
<div>
<div>
<div>
<table>
<tr></tr>
<tr>
<td></td>
<td></td>
<td>
<a href="https://traineras.es/clasificaciones/2476"></a>
</td>
</tr>
<tr>
<td></td>
<td></td>
<td>
<a href="https://traineras.es/clasificaciones/2477"></a>
</td>
</tr>
<tr>
<td></td>
<td></td>
<td>
<a href="https://traineras.es/clasificaciones/5814"></a>
</td>
</tr>
</table>
</div>
</div>

<div>
<div>
<table>
<tr></tr>
<tr>
<td></td>
<td></td>
<td>
<a href="https://traineras.es/clasificaciones/2508"></a>
</td>
</tr>
<tr>
<td></td>
<td></td>
<td>
<a href="https://traineras.es/clasificaciones/5815"></a>
</td>
</tr>
</table>
</div>
</div>
</div>
</div>
</div>
</main>
</body>
</html>
12 changes: 10 additions & 2 deletions tests/parsers/html/traineras_parser_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,14 @@ def test_parse_race_ids(self):
ids = self.parser.parse_race_ids(data, is_female=True)
self.assertEqual(list(ids), ["5456"])

def test_parse_flag_race_ids(self):
    """Male and female race ids come from their respective flag tables."""
    fixture_path = os.path.join(self.fixtures, "traineras_flag.html")
    with open(fixture_path) as fixture:
        page = Selector(fixture.read())

    self.assertEqual(list(self.parser.parse_flag_race_ids(page, is_female=False)), ["2476", "2477", "5814"])
    self.assertEqual(list(self.parser.parse_flag_race_ids(page, is_female=True)), ["2508", "5815"])

def test_parse_club_race_ids(self):
with open(os.path.join(self.fixtures, "traineras_club.html")) as file:
ids = self.parser.parse_club_race_ids(Selector(file.read()))
Expand All @@ -81,13 +89,13 @@ def test_parse_rower_race_ids(self):

def test_parse_search_flags(self):
with open(os.path.join(self.fixtures, "traineras_search_flags.html")) as file:
urls = self.parser.parse_search_flags(Selector(file.read()))
urls = self.parser.parse_searched_flag_urls(Selector(file.read()))
self.assertEqual(urls, ["https://traineras.es/banderas/104#SM", "https://traineras.es/banderas/679#SF"])

def test_parse_flag_editions(self):
with open(os.path.join(self.fixtures, "traineras_flag_editions.html")) as file:
content = Selector(file.read())
male_editions = self.parser.parse_flag_editions(content)
male_editions = self.parser.parse_flag_editions(content, is_female=False)
female_editions = self.parser.parse_flag_editions(content, is_female=True)
self.assertEqual(list(male_editions), [(2007, 1), (2008, 2), (2011, 3), (2023, 14)])
self.assertEqual(list(female_editions), [(2016, 1), (2017, 2), (2023, 8)])
Expand Down

0 comments on commit 2bfb8f9

Please sign in to comment.