Skip to content

Commit

Permalink
normalize method names/args
Browse files: browse the repository at this point in the history
  • Loading branch information
iagocanalejas committed Mar 25, 2024
1 parent 0bc25cf commit 186ea50
Show file tree
Hide file tree
Showing 16 changed files with 118 additions and 103 deletions.
2 changes: 1 addition & 1 deletion rscraping/_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ def parse_race_image(
header_data = processor.retrieve_header_data(path=path, header_size=header_size)
df = processor.retrieve_tabular_dataframe(path=path, header_size=header_size)

return parser.parse_races_from(
return parser.parse_races(
data=df,
file_name=os.path.splitext(os.path.basename(path))[0],
header=header_data,
Expand Down
10 changes: 5 additions & 5 deletions rscraping/clients/_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,12 +100,12 @@ def validate_url(self, url: str):

@override
@staticmethod
def get_race_details_url(race_id: str, **kwargs) -> str:
def get_races_url(year: int, **kwargs) -> str:
raise NotImplementedError

@override
@staticmethod
def get_races_url(year: int, **kwargs) -> str:
def get_race_details_url(race_id: str, **kwargs) -> str:
raise NotImplementedError

@override
Expand All @@ -114,13 +114,13 @@ def get_lineup_url(race_id: str, **kwargs) -> str:
raise NotImplementedError

@override
def get_race_ids_by_rower(self, rower_id: str, **kwargs) -> Generator[str, Any, Any]:
def get_race_ids_by_club(self, club_id: str, year: int, **kwargs) -> Generator[str, Any, Any]:
raise NotImplementedError

@override
def get_lineup_by_race_id(self, race_id: str, **kwargs) -> Generator[Lineup, Any, Any]:
def get_race_ids_by_rower(self, rower_id: str, **kwargs) -> Generator[str, Any, Any]:
raise NotImplementedError

@override
def get_race_ids_by_club(self, club_id: str, year: int, **kwargs) -> Generator[str, Any, Any]:
def get_lineup_by_race_id(self, race_id: str, **kwargs) -> Generator[Lineup, Any, Any]:
raise NotImplementedError
22 changes: 11 additions & 11 deletions rscraping/clients/_protocol.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,40 +66,40 @@ def get_race_by_url(self, url: str, race_id: str, **kwargs) -> Race | None:
"""
...

def get_race_ids_by_year(self, year: int, **kwargs) -> Generator[str, Any, Any]:
def get_race_names_by_year(self, year: int, **kwargs) -> Generator[RaceName, Any, Any]:
"""
Find the race IDs for a given year.
Find the race names for a given year.
Args:
year (int): The year for which to generate race IDs.
year (int): The year for which to generate race names.
**kwargs: Additional keyword arguments.
Yields: str: Race IDs.
Yields: RaceName: Race names.
"""
...

def get_race_ids_by_club(self, club_id: str, year: int, **kwargs) -> Generator[str, Any, Any]:
def get_race_ids_by_year(self, year: int, **kwargs) -> Generator[str, Any, Any]:
"""
Find the race IDs for a given club and year.
Find the race IDs for a given year.
Args:
club_id (str): The ID of the club.
year (int): The year for which to generate race IDs.
**kwargs: Additional keyword arguments.
Yields: str: Race IDs.
"""
...

def get_race_names_by_year(self, year: int, **kwargs) -> Generator[RaceName, Any, Any]:
def get_race_ids_by_club(self, club_id: str, year: int, **kwargs) -> Generator[str, Any, Any]:
"""
Find the race names for a given year.
Find the race IDs for a given club and year.
Args:
year (int): The year for which to generate race names.
club_id (str): The ID of the club.
year (int): The year for which to generate race IDs.
**kwargs: Additional keyword arguments.
Yields: RaceName: Race names.
Yields: str: Race IDs.
"""
...

Expand Down
58 changes: 29 additions & 29 deletions rscraping/clients/lgt.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,35 @@ def get_race_by_url(self, url: str, race_id: str, **kwargs):
kwargs["results_selector"] = self.get_results_selector(race_id)
return super().get_race_by_url(url, race_id, **kwargs)

@override
def get_race_names_by_year(self, year: int, **_) -> Generator[RaceName, Any, Any]:
"""
Find the race names for a given year and gender. Uses the unchecked IDs found in get_race_ids_by_year to
find them.
Args:
year (int): The year for which to generate race names.
**kwargs: Additional keyword arguments.
Yields: RaceName: Race names.
"""
today = date.today().year
if today == year:
race_names = self._html_parser.parse_race_names(selector=self.get_calendar_selector())
if race_names:
yield from race_names
return

for id in self.get_race_ids_by_year(year, is_female=self._is_female):
if id in self._excluded_ids:
pass

url = self.get_race_details_url(id)
selector = Selector(requests.get(url=url, headers=HTTP_HEADERS()).content.decode("utf-8"))
if self._html_parser.is_valid_race(selector):
name = self._html_parser.get_name(selector)
yield RaceName(race_id=id, name=whitespaces_clean(name).upper())

@override
def get_race_ids_by_year(self, year: int, **_) -> Generator[str, Any, Any]:
"""
Expand Down Expand Up @@ -192,35 +221,6 @@ def get_race_ids_by_year(self, year: int, **_) -> Generator[str, Any, Any]:

return (str(r) for r in range(lower_race_id, (upper_race_id + 1)) if r not in self._excluded_ids)

@override
def get_race_names_by_year(self, year: int, **_) -> Generator[RaceName, Any, Any]:
"""
Find the race names for a given year and gender. Uses the unchecked IDs found in get_race_ids_by_year to
find them.
Args:
year (int): The year for which to generate race names.
**kwargs: Additional keyword arguments.
Yields: RaceName: Race names.
"""
today = date.today().year
if today == year:
race_names = self._html_parser.parse_race_names(selector=self.get_calendar_selector())
if race_names:
yield from race_names
return

for id in self.get_race_ids_by_year(year, is_female=self._is_female):
if id in self._excluded_ids:
pass

url = self.get_race_details_url(id)
selector = Selector(requests.get(url=url, headers=HTTP_HEADERS()).content.decode("utf-8"))
if self._html_parser.is_valid_race(selector):
name = self._html_parser.get_name(selector)
yield RaceName(race_id=id, name=whitespaces_clean(name).upper())

@override
def get_lineup_by_race_id(self, race_id: str, **_) -> Generator[Lineup, Any, Any]:
if race_id in self._excluded_ids:
Expand Down
14 changes: 7 additions & 7 deletions rscraping/clients/tabular.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,26 +108,26 @@ def validate_url(self, url: str):
if not pattern.match(url):
raise ValueError(f"invalid {url=}")

def get_races(self, **kwargs) -> Generator[Race, Any, Any]:
    """Parse ``self._df`` into races, forwarding the client's gender flag and source URL to the parser."""
    races = self._parser.parse_races(self._df, is_female=self._is_female, url=self._url)
    return races

@override
def get_race_by_id(self, race_id: str, *_, **kwargs) -> Race | None:
race_row = self._df.iloc[int(race_id) - 1]
return self._parser.parse_race_serie(race_row, is_female=self._is_female, url=self._url)
return self._parser.parse_race(race_row, is_female=self._is_female, url=self._url)

@override
def get_race_by_url(self, *_, race_id: str, **kwargs) -> Race | None:
    """Ignore any positional arguments and resolve the race through get_race_by_id using ``race_id``."""
    race = self.get_race_by_id(race_id=race_id, **kwargs)
    return race

def get_races(self, **kwargs) -> Generator[Race, Any, Any]:
    """Parse ``self._df`` into races, forwarding the client's gender flag and source URL to the parser."""
    races = self._parser.parse_races_from(self._df, is_female=self._is_female, url=self._url)
    return races
@override
def get_race_names_by_year(self, year: int, *_, **kwargs) -> Generator[RaceName, Any, Any]:
    """Delegate to the dataframe parser to obtain the race names of ``year`` from ``self._df``."""
    names = self._parser.parse_race_names(self._df, year)
    return names

@override
def get_race_ids_by_year(self, year: int, *_, **kwargs) -> Generator[str, Any, Any]:
    """Delegate to the dataframe parser to obtain the race IDs of ``year`` from ``self._df``."""
    race_ids = self._parser.parse_race_ids(self._df, year)
    return race_ids

@override
def get_race_names_by_year(self, year: int, *_, **kwargs) -> Generator[RaceName, Any, Any]:
    """Delegate to the dataframe parser to obtain the race names of ``year`` from ``self._df``."""
    names = self._parser.parse_race_names(self._df, year)
    return names

################################################
############## PRIVATE METHODS #################
################################################
Expand Down
24 changes: 12 additions & 12 deletions rscraping/clients/traineras.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,23 +64,29 @@ def validate_url(self, url: str):
if not pattern.match(url):
raise ValueError(f"invalid {url=}")

@override
def get_race_names_by_year(self, year: int, **_) -> Generator[RaceName, Any, Any]:
    """
    Yield the race names parsed from every page returned by get_pages for ``year``.

    The year is validated first; each page is parsed with the client's gender and category settings.
    """
    self.validate_year(year)
    for selector in self.get_pages(year):
        yield from self._html_parser.parse_race_names(
            selector,
            is_female=self._is_female,
            category=self._category,
        )

@override
def get_race_ids_by_year(self, year: int, **_) -> Generator[str, Any, Any]:
    """
    Yield the race IDs parsed from every page returned by get_pages for ``year``.

    The year is validated first; each page is parsed with the client's gender and category settings.
    """
    self.validate_year(year)
    for selector in self.get_pages(year):
        yield from self._html_parser.parse_race_ids(
            selector,
            is_female=self._is_female,
            category=self._category,
        )

@override
def get_race_ids_by_club(self, club_id: str, year: int, **kwargs) -> Generator[str, Any, Any]:
    """Download the club's race page for ``year`` and return the race IDs the HTML parser extracts from it."""
    response = requests.get(url=self.get_club_races_url(club_id, year), headers=HTTP_HEADERS())
    html = response.content.decode("utf-8")
    return self._html_parser.parse_club_race_ids(Selector(html))

@override
def get_race_ids_by_rower(self, rower_id: str, year: str | None = None, **_) -> Generator[str, Any, Any]:
    """Download the rower's page and yield the race IDs parsed from it, passing ``year`` on to the parser."""
    response = requests.get(url=self.get_rower_url(rower_id), headers=HTTP_HEADERS())
    selector = Selector(response.content.decode("utf-8"))
    yield from self._html_parser.parse_rower_race_ids(selector, year=year)

@override
def get_race_names_by_year(self, year: int, **_) -> Generator[RaceName, Any, Any]:
    """
    Yield the race names parsed from every page returned by get_pages for ``year``.

    The year is validated first; each page is parsed with the client's gender and category settings.
    """
    self.validate_year(year)
    for selector in self.get_pages(year):
        yield from self._html_parser.parse_race_names(
            selector,
            is_female=self._is_female,
            category=self._category,
        )

@override
def get_lineup_by_race_id(self, race_id: str, **_) -> Generator[Lineup, Any, Any]:
content = requests.get(url=self.get_race_details_url(race_id), headers=HTTP_HEADERS()).content.decode("utf-8")
Expand All @@ -92,12 +98,6 @@ def get_lineup_by_race_id(self, race_id: str, **_) -> Generator[Lineup, Any, Any
lineup = requests.get(url=url, headers=HTTP_HEADERS()).content.decode("utf-8")
yield self._html_parser.parse_lineup(Selector(lineup))

@override
def get_race_ids_by_club(self, club_id: str, year: int, **kwargs) -> Generator[str, Any, Any]:
    """Download the club's race page for ``year`` and return the race IDs the HTML parser extracts from it."""
    response = requests.get(url=self.get_club_races_url(club_id, year), headers=HTTP_HEADERS())
    html = response.content.decode("utf-8")
    return self._html_parser.parse_club_race_ids(Selector(html))

def get_pages(self, year: int) -> Generator[Selector, Any, Any]:
"""
Generate Selector objects for each page of races in a specific year.
Expand Down
10 changes: 9 additions & 1 deletion rscraping/parsers/df/_parser.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
from abc import ABC
from collections.abc import Generator
from typing import Any, override

from rscraping.data.models import Datasource
from pandas import DataFrame

from rscraping.data.models import Datasource, Race

from ._protocol import DataFrameParserProtocol

Expand All @@ -19,3 +23,7 @@ def __new__(cls, source: str, **_) -> "DataFrameParser": # pragma: no cover
final_obj = object.__new__(subclass)

return final_obj

@override
def parse_races(self, data: DataFrame, **kwargs) -> Generator[Race, Any, Any]:
    """Placeholder on the base parser: always raises NotImplementedError."""
    raise NotImplementedError()
2 changes: 1 addition & 1 deletion rscraping/parsers/df/_protocol.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,4 +7,4 @@


class DataFrameParserProtocol(Protocol):
def parse_races_from(self, data: DataFrame, *_, **kwargs) -> Generator[Race, Any, Any]: ...
def parse_races(self, data: DataFrame, **kwargs) -> Generator[Race, Any, Any]: ...
12 changes: 6 additions & 6 deletions rscraping/parsers/df/inforemo.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import re
from collections.abc import Generator
from datetime import date, datetime
from typing import Any
from typing import Any, override

import inquirer
from pandas import DataFrame, Series
Expand Down Expand Up @@ -37,14 +37,14 @@ class InforemoDataFrameParser(DataFrameParser, source=Datasource.INFOREMO):
GENDER_MIX: ["MIXTO"],
}

def parse_races_from(
@override
def parse_races(
self,
data: DataFrame,
*_,
file_name: str,
header: str,
file_name: str = "",
header: str = "",
manual_input: bool = False,
**__,
**_,
) -> Generator[Race, Any, Any]:
name = self._try_find_race_name(header, manual_input=manual_input)
t_date = self._try_find_race_date(header, manual_input=manual_input)
Expand Down
30 changes: 15 additions & 15 deletions rscraping/parsers/df/tabular.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from collections.abc import Generator
from datetime import date
from typing import Any
from typing import Any, override

from pandas import DataFrame, Series

Expand Down Expand Up @@ -41,7 +41,20 @@


class TabularDataFrameParser(DataFrameParserProtocol):
def parse_race_serie(self, row: Series, is_female: bool = False, url: str | None = None) -> Race | None:
@override
def parse_races(
    self,
    data: DataFrame,
    is_female: bool = False,
    url: str | None = None,
    **__,
) -> Generator[Race, Any, Any]:
    """
    Yield one race per dataframe row, dropping rows that parse_race rejects.

    Args:
        data (DataFrame): Rows to convert.
        is_female (bool): Forwarded unchanged to parse_race.
        url (str | None): Forwarded unchanged to parse_race.
    """
    parsed_rows = (self.parse_race(series, is_female=is_female, url=url) for _, series in data.iterrows())
    # Falsy results (e.g. None) are filtered out, exactly as the explicit `if race:` check did.
    yield from (race for race in parsed_rows if race)

def parse_race(self, row: Series, is_female: bool = False, url: str | None = None) -> Race | None:
if not isinstance(row, Series):
return None

Expand Down Expand Up @@ -106,19 +119,6 @@ def parse_race_names(self, data: DataFrame, year: int) -> Generator[RaceName, An
df = data[data[COLUMN_DATE].dt.year == year]
return (RaceName(race_id=str(row.name), name=str(row[COLUMN_NAME])) for _, row in df.iterrows())

def parse_races_from(
    self,
    data: DataFrame,
    *_,
    is_female: bool = False,
    url: str | None = None,
    **__,
) -> Generator[Race, Any, Any]:
    """
    Yield one race per dataframe row, dropping rows that parse_race_serie rejects.

    Args:
        data (DataFrame): Rows to convert.
        is_female (bool): Forwarded unchanged to parse_race_serie.
        url (str | None): Forwarded unchanged to parse_race_serie.
    """
    parsed_rows = (self.parse_race_serie(series, is_female=is_female, url=url) for _, series in data.iterrows())
    # Falsy results (e.g. None) are filtered out, exactly as the explicit `if race:` check did.
    yield from (race for race in parsed_rows if race)

@staticmethod
def _normalize_race_name(name: str, league: str | None, t_date: date) -> str:
if all(n in name for n in ["ILLA", "SAMERTOLAMEU"]) and (
Expand Down
9 changes: 7 additions & 2 deletions rscraping/parsers/html/act.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,8 +110,13 @@ def parse_race_ids(self, selector: Selector, **_) -> Generator[str, Any, Any]:
def parse_race_names(self, selector: Selector, **_) -> Generator[RaceName, Any, Any]:
hrefs = selector.xpath('//*[@id="col-a"]/div/section/div[5]/table/tbody/tr[*]/td[*]/a').getall()
selectors = [Selector(h) for h in hrefs]
pairs = [(s.xpath("//*/@href").get("").split("r=")[-1], s.xpath("//*/text()").get("")) for s in selectors]
return (RaceName(race_id=p[0], name=whitespaces_clean(p[1]).upper()) for p in pairs)
return (
RaceName(
race_id=s.xpath("//*/@href").get("").split("r=")[-1],
name=whitespaces_clean(s.xpath("//*/text()").get("")).upper(),
)
for s in selectors
)

@override
def parse_lineup(self, *_, **__):
Expand Down
9 changes: 7 additions & 2 deletions rscraping/parsers/html/arc.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,8 +112,13 @@ def parse_race_names(self, selector: Selector, **_) -> Generator[RaceName, Any,
else selector.xpath('//*[@id="main"]/div[4]/table/tbody/tr[*]/td[2]/span/a').getall()
)
selectors = [Selector(h) for h in hrefs]
pairs = [(s.xpath("//*/@href").get("").split("/")[-2], s.xpath("//*/text()").get("")) for s in selectors]
return (RaceName(race_id=p[0], name=whitespaces_clean(p[1]).upper()) for p in pairs)
return (
RaceName(
race_id=s.xpath("//*/@href").get("").split("/")[-2],
name=whitespaces_clean(s.xpath("//*/text()").get("")).upper(),
)
for s in selectors
)

def parse_club_ids(self, selector: Selector) -> Generator[str, Any, Any]:
urls = (
Expand Down
Loading

0 comments on commit 186ea50

Please sign in to comment.