feat: allow to retrieve last weekend races
iagocanalejas committed Jul 19, 2024
1 parent a9aef2a commit decc085
Showing 14 changed files with 256 additions and 44 deletions.
2 changes: 1 addition & 1 deletion requirements-dev.txt
@@ -1,3 +1,3 @@
-r requirements.txt
coverage==7.5.4
coverage==7.6.0
pytest==8.2.2
18 changes: 17 additions & 1 deletion rscraping/clients/_client.py
@@ -1,5 +1,5 @@
from collections.abc import Generator
from datetime import date
from datetime import date, datetime, timedelta
from typing import Any, override

import requests
@@ -83,6 +83,22 @@ def get_race_ids_by_year(self, year: int, **kwargs) -> Generator[str, Any, Any]:
**kwargs,
)

@override
def get_last_weekend_race_ids(self, **kwargs) -> Generator[str, Any, Any]:
today = datetime.now()
# days back to the most recent Saturday (a full week back if today is a Saturday)
days_to_saturday = (today.weekday() + 2) % 7 or 7

last_saturday = today - timedelta(days=days_to_saturday)
last_sunday = last_saturday + timedelta(days=1)

url = self.get_races_url(today.year, is_female=self.is_female)
yield from self._html_parser.parse_race_ids_by_days(
selector=Selector(requests.get(url=url, headers=HTTP_HEADERS()).text),
is_female=self.is_female,
days=[last_saturday, last_sunday],
**kwargs,
)
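
For illustration (not part of the diff): the weekend window above can be checked in isolation. This standalone sketch reproduces the same computation with datetime.weekday() (Monday == 0) and shows the dates it yields for the commit date.

from datetime import datetime, timedelta


def last_weekend(today: datetime) -> tuple[datetime, datetime]:
    # most recent Saturday before today (a full week back if today is a Saturday)
    days_to_saturday = (today.weekday() + 2) % 7 or 7
    last_saturday = today - timedelta(days=days_to_saturday)
    return last_saturday, last_saturday + timedelta(days=1)


saturday, sunday = last_weekend(datetime(2024, 7, 19))  # the commit date, a Friday
print(saturday.date(), sunday.date())  # 2024-07-13 2024-07-14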

@override
def get_race_names_by_year(self, year: int, **kwargs) -> Generator[RaceName, Any, Any]:
self.validate_year(year)
8 changes: 8 additions & 0 deletions rscraping/clients/_protocol.py
@@ -101,6 +101,14 @@ def get_race_ids_by_club(self, club_id: str, year: int, **kwargs) -> Generator[s
"""
...

def get_last_weekend_race_ids(self, **kwargs) -> Generator[str, Any, Any]:
"""
Find the IDs of the races that took place last weekend.
Yields: str: Race IDs.
"""
...
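
For illustration only (not part of the diff): a usage sketch of this protocol method. The client class name and constructor arguments below are assumptions; only get_last_weekend_race_ids comes from the protocol.

from rscraping.clients import Client  # assumed import path and class name

client = Client(source="act", is_female=False)  # assumed constructor arguments
for race_id in client.get_last_weekend_race_ids():
    print(race_id)  # one ID per race held last Saturday or Sunday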

def get_races_url(self, year: int, **kwargs) -> str:
"""
Return the URL for retrieving races in a specific year.
14 changes: 14 additions & 0 deletions rscraping/parsers/html/_protocol.py
@@ -1,4 +1,5 @@
from collections.abc import Generator
from datetime import datetime
from typing import Any, Protocol

from parsel.selector import Selector
@@ -33,6 +34,19 @@ def parse_race_ids(self, selector: Selector, **kwargs) -> Generator[str, Any, An
"""
...

def parse_race_ids_by_days(self, selector: Selector, days: list[datetime], **kwargs) -> Generator[str, Any, Any]:
"""
Parse the given Selector to retrieve the IDs of the races that took place on the given days.
Args:
selector (Selector): The Selector to parse.
days (list[datetime]): The days to filter by.
**kwargs: Additional keyword arguments.
Yields: str: The IDs of the races.
"""
...
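
Because every HTML parser exposes the same parse_race_ids_by_days contract, calling code can be written once against the protocol. A minimal illustrative sketch (the helper and its name are not part of the diff; the protocol class name is not visible in this hunk, so the parser argument is left untyped):

from collections.abc import Iterator
from datetime import datetime, timedelta

from parsel.selector import Selector


def weekend_race_ids(parser, html: str, saturday: datetime) -> Iterator[str]:
    # parser: any object implementing parse_race_ids_by_days as documented above
    days = [saturday, saturday + timedelta(days=1)]
    yield from parser.parse_race_ids_by_days(Selector(html), days=days)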

def parse_race_names(self, selector: Selector, **kwargs) -> Generator[RaceName, Any, Any]:
"""
Parse the given Selector to retrieve the names of the races.
14 changes: 14 additions & 0 deletions rscraping/parsers/html/act.py
@@ -2,6 +2,7 @@
import os
import re
from collections.abc import Generator
from datetime import datetime
from typing import Any, override

from parsel.selector import Selector
@@ -115,6 +116,19 @@ def parse_race_ids(self, selector: Selector, **_) -> Generator[str, Any, Any]:
urls = selector.xpath('//*[@id="col-a"]/div/section/div[5]/table/tbody/tr[*]/td[*]/a/@href').getall()
return (url_parts[-1] for url_parts in (url.split("r=") for url in urls))

@override
def parse_race_ids_by_days(self, selector: Selector, days: list[datetime], **kwargs) -> Generator[str, Any, Any]:
assert len(days) > 0, "days must have at least one element"
assert all(d.year == days[0].year for d in days), "all days must be from the same year"

rows = selector.xpath('//*[@id="col-a"]/div/section/div[5]/table/tbody/tr[*]').getall()
selectors = [Selector(r) for r in rows]
return (
s.xpath("//*/td[2]/a/@href").get("").split("r=")[-1]
for s in selectors
if datetime.strptime(s.xpath("//*/td[4]/text()").get(""), "%d-%m-%Y") in days
)
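
The ACT filter above keys on the date in the fourth cell of each row. A self-contained illustration of that check on a hand-written row (the href and date values are made up to match the format the parser expects):

from datetime import datetime

from parsel.selector import Selector

row = Selector("<table><tr><td>1</td><td><a href='index.php?r=1616789082'>x</a></td><td>x</td><td>03-07-2021</td></tr></table>")
race_date = datetime.strptime(row.xpath("//*/td[4]/text()").get(""), "%d-%m-%Y")
race_id = row.xpath("//*/td[2]/a/@href").get("").split("r=")[-1]
print(race_date in [datetime(2021, 7, 3)], race_id)  # True 1616789082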

@override
def parse_race_names(self, selector: Selector, **_) -> Generator[RaceName, Any, Any]:
hrefs = selector.xpath('//*[@id="col-a"]/div/section/div[5]/table/tbody/tr[*]/td[*]/a').getall()
20 changes: 19 additions & 1 deletion rscraping/parsers/html/arc.py
@@ -7,7 +7,7 @@

from parsel.selector import Selector

from pyutils.strings import remove_parenthesis, whitespaces_clean
from pyutils.strings import find_date, remove_parenthesis, whitespaces_clean
from rscraping.data.checks import is_play_off
from rscraping.data.constants import (
CATEGORY_ABSOLUT,
@@ -112,6 +112,24 @@ def parse_race_ids(self, selector: Selector, **_) -> Generator[str, Any, Any]:
)
return (url_parts[-2] for url_parts in (url.split("/") for url in urls))

@override
def parse_race_ids_by_days(self, selector: Selector, days: list[datetime], **kwargs) -> Generator[str, Any, Any]:
assert len(days) > 0, "days must have at least one element"
assert all(d.year == days[0].year for d in days), "all days must be from the same year"

def _find_date(s: Selector) -> datetime | None:
maybe_date = f"{whitespaces_clean(s.xpath("//*/td[1]/span/text()").get("")).upper()} {days[0].year}"
found_date = find_date(maybe_date, day_first=True)
return datetime.combine(found_date, datetime.min.time()) if found_date else None

rows = (
selector.xpath('//*[@id="main"]/div[6]/table/tbody/tr[*]').getall()
if selector.xpath('//*[@id="proximas-regatas"]').get()
else selector.xpath('//*[@id="main"]/div[4]/table/tbody/tr[*]').getall()
)
selectors = [Selector(h) for h in rows]
return (s.xpath("//*/td[2]/span/a/@href").get("").split("/")[-2] for s in selectors if _find_date(s) in days)
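
find_date is a helper from the author's pyutils package and is not shown in this diff; what matters here is that ARC rows carry no year, so the row text is completed with the year of the requested days and the parsed calendar date is normalized to midnight before the membership check. A stand-in sketch of that normalization (the parsed value is hard-coded where find_date would supply it):

from datetime import date, datetime

days = [datetime(2024, 7, 13), datetime(2024, 7, 14)]  # last Saturday and Sunday

found_date = date(2024, 7, 13)  # stand-in for the date find_date() would extract from a row
as_datetime = datetime.combine(found_date, datetime.min.time())
print(as_datetime in days)  # True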

@override
def parse_race_names(self, selector: Selector, **_) -> Generator[RaceName, Any, Any]:
hrefs = (
29 changes: 28 additions & 1 deletion rscraping/parsers/html/lgt.py
@@ -8,7 +8,7 @@
from parsel.selector import Selector

from pyutils.shortcuts import none
from pyutils.strings import whitespaces_clean
from pyutils.strings import find_date, whitespaces_clean
from rscraping.data.checks import is_female, is_play_off
from rscraping.data.constants import (
CATEGORY_ABSOLUT,
@@ -126,6 +126,32 @@ def parse_race_ids(self, selector: Selector, **_) -> Generator[str, Any, Any]:
urls = selector.xpath("//*/div/div/div[*]/div/a/@href").getall()
return (u.split("/")[-1].split("-")[0] for u in urls[0:])

@override
def parse_race_ids_by_days(self, selector: Selector, days: list[datetime], **kwargs) -> Generator[str, Any, Any]:
assert len(days) > 0, "days must have at least one element"
assert all(d.year == days[0].year for d in days), "all days must be from the same year"

year, month, day = days[0].year, None, None
divs = [Selector(s) for s in selector.xpath('//*[@id="club"]/div/div/div[3]/div[1]/div/div/div[*]').getall()]

for div in divs:
maybe_month = whitespaces_clean(div.xpath("//*/div/div/text()").get(""))
if maybe_month:
month = whitespaces_clean(maybe_month.upper())
continue

maybe_day = whitespaces_clean(div.xpath("//*/div/div/table/tbody/tr[1]/td[1]/text()").get(""))
if maybe_day:
day = int(maybe_day.upper().replace("D", "").replace("S", ""))
found_date = find_date(f"{day} {month} {year}", day_first=True)

if (
found_date
and datetime.combine(found_date, datetime.min.time()) in days
and div.xpath("//*/div/a/@href").get(None)
):
yield div.xpath("//*/div/a/@href").get("").split("/")[-1].split("-")[0]
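
The LGT calendar stores the month only in a header cell ("Xuño", Galician for June, in the lgt_calendar.html fixture added below) and marks each race cell with a weekday letter plus the day of the month, "S" for sábado (Saturday) and "D" for domingo (Sunday); that is why the loop carries month forward between iterations and strips the letter before rebuilding the date. A tiny sketch of the day-cell normalization, mirroring the fixture values:

# strip the weekday letter ("S" = Saturday, "D" = Sunday) to recover the day of the month
for cell in ("D30", "S06"):
    day = int(cell.upper().replace("D", "").replace("S", ""))
    print(cell, "->", day)  # D30 -> 30, S06 -> 6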

@override
def parse_race_names(self, selector: Selector, **_) -> Generator[RaceName, Any, Any]:
values = [Selector(u) for u in selector.xpath("//*/div/div/div[*]/div").getall()]
@@ -135,6 +161,7 @@ def parse_race_names(self, selector: Selector, **_) -> Generator[RaceName, Any,
name=u.xpath("//*/table/tr/td[2]/text()").get(""),
)
for u in values
if u.xpath("//*/a/@href").get(None)
)

####################################################
13 changes: 13 additions & 0 deletions rscraping/parsers/html/traineras.py
@@ -157,6 +157,19 @@ def parse_race(self, selector: Selector, *, race_id: str, table: int | None = No
def parse_race_ids(self, selector: Selector, **_) -> Generator[str, Any, Any]:
return (race.race_id for race in self.parse_race_names(selector))

@override
def parse_race_ids_by_days(self, selector: Selector, days: list[datetime], **kwargs) -> Generator[str, Any, Any]:
assert len(days) > 0, "days must have at least one element"
assert all(d.year == days[0].year for d in days), "all days must be from the same year"

rows = [Selector(r) for r in selector.xpath("/html/body/div[1]/div[2]/table/tbody/tr").getall()]
for row in rows:
ttype = row.xpath("//*/td[2]/text()").get("")
name = whitespaces_clean(row.xpath("//*/td[1]/a/text()").get("").upper())
name = " ".join(n for n in name.split() if n != ttype)
if datetime.strptime(row.xpath("//*/td[5]/text()").get(""), "%d-%m-%Y") in days:
yield row.xpath("//*/td[1]/a/@href").get("").split("/")[-1]

@override
def parse_race_names(self, selector: Selector, **_) -> Generator[RaceName, Any, Any]:
rows = [Selector(r) for r in selector.xpath("/html/body/div[1]/div[2]/table/tbody/tr").getall()]
64 changes: 64 additions & 0 deletions tests/fixtures/html/lgt_calendar.html
@@ -0,0 +1,64 @@
<html>
<body>
<div id="club">
<div>
<div>
<div></div>
<div></div>
<div>
<div>
<div>
<div>
<div>
<div>Xuño</div>
</div>
<div>
<div>
<a href="principal/regata/193-vi-bandeira-cidade-de-naron">
<table>
<tbody>
<tr>
<td>D30</td>
<td>VI BANDEIRA CIDADE DE NARÓN</td>
</tr>
<tr></tr>
</tbody></table
></a>
</div>
</div>
<div>
<div>
<table>
<tbody>
<tr>
<td>D30</td>
<td></td>
</tr>
<tr></tr>
</tbody>
</table>
</div>
</div>
<div>
<div>
<a href="principal/regata/195-xxxix-bandeira--concello-de-redondela">
<table>
<tbody>
<tr>
<td>S06</td>
<td>XXXIX BANDEIRA CONCELLO DE REDONDELA</td>
</tr>
<tr></tr>
</tbody></table
></a>
</div>
</div>
</div>
</div>
</div>
</div>
</div>
</div>
</div>
</body>
</html>
80 changes: 40 additions & 40 deletions tests/fixtures/html/lgt_races.html
@@ -1,44 +1,44 @@
<div>
<div>
<div></div>
<div>
<div>
<a href="principal/regata/152-i-bandeira-concello-as-pontes-a">
<table>
<tr>
<td>D23</td>
<td>XXVIII BANDEIRA TRAIÑEIRAS CONCELLO DE BUEU</td>
</tr>
<tr />
</table>
</a>
</div>
<div></div>
<div>
<div>
<a href="principal/regata/152-i-bandeira-concello-as-pontes-a">
<table>
<tr>
<td>D23</td>
<td>XXVIII BANDEIRA TRAIÑEIRAS CONCELLO DE BUEU</td>
</tr>
<tr />
</table>
</a>
</div>
</div>
<div>
<div>
<a href="principal/regata/153-i-bandeira-concello-as-pontes-b">
<table>
<tr>
<td>S10</td>
<td>I BANDEIRA CONCELLO AS PONTES B</td>
</tr>
<tr />
</table>
</a>
</div>
</div>
<div>
<div>
<a href="principal/regata/154-i-bandeira-concello-as-pontes-f">
<table>
<tr>
<td>S10</td>
<td>I BANDEIRA CONCELLO AS PONTES F</td>
</tr>
<tr />
</table>
</a>
</div>
</div>
</div>
<div>
<div>
<a href="principal/regata/153-i-bandeira-concello-as-pontes-b">
<table>
<tr>
<td>S10</td>
<td>I BANDEIRA CONCELLO AS PONTES B</td>
</tr>
<tr />
</table>
</a>
</div>
</div>
<div>
<div>
<a href="principal/regata/154-i-bandeira-concello-as-pontes-f">
<table>
<tr>
<td>S10</td>
<td>I BANDEIRA CONCELLO AS PONTES F</td>
</tr>
<tr />
</table>
</a>
</div>
</div>
</div>
</div>
10 changes: 10 additions & 0 deletions tests/parsers/html/act_parser_test.py
@@ -1,5 +1,6 @@
import os
import unittest
from datetime import datetime

from parsel.selector import Selector

@@ -35,6 +36,15 @@ def test_parse_race_ids(self):

self.assertEqual(list(ids), ["1616789082", "1616789390", "1616789420"])

def test_parse_race_ids_by_days(self):
with open(os.path.join(self.fixtures, "act_races.html")) as file:
ids = self.parser.parse_race_ids_by_days(
Selector(file.read()),
days=[datetime.strptime("03-07-2021", "%d-%m-%Y")],
)

self.assertEqual(list(ids), ["1616789082"])

def test_parse_race_names(self):
with open(os.path.join(self.fixtures, "act_races.html")) as file:
race_names = self.parser.parse_race_names(Selector(file.read()), is_female=False)