Skip to content

Commit

Permalink
fix: re-exporting in rscraping __init__.py
Browse files Browse the repository at this point in the history
  • Loading branch information
iagocanalejas committed Mar 17, 2024
1 parent 0b262a3 commit f776370
Show file tree
Hide file tree
Showing 4 changed files with 126 additions and 120 deletions.
2 changes: 2 additions & 0 deletions findrace.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,8 @@ def _parse_arguments():
def main(race_id: str, datasource: str, is_female: bool, with_lineups: bool, save: bool, day: int | None):
if not Datasource.has_value(datasource):
raise ValueError(f"invalid datasource={datasource}")
if datasource == Datasource.TABULAR.value:
raise ValueError(f"datasource={datasource} is not supported in this script")

race = find_race(
race_id=race_id,
Expand Down
122 changes: 4 additions & 118 deletions rscraping/__init__.py
Original file line number Diff line number Diff line change
@@ -1,118 +1,4 @@
import os
from typing import Any
from collections.abc import Generator

from pyutils.strings import normalize_synonyms, remove_conjunctions, remove_symbols, unaccent
from rscraping.data.constants import SYNONYMS
from rscraping.data.models import Datasource, Lineup, Race
from simplemma.simplemma import text_lemmatizer
from rscraping.clients import Client

from rscraping.ocr import ImageProcessor
from rscraping.parsers.df import DataFrameParser


def find_race(
    race_id: str,
    datasource: Datasource,
    is_female: bool,
    category: str | None = None,
    day: int | None = None,
    with_lineup: bool = False,
) -> Race | None:
    """
    Find a race by its ID, optionally attaching a lineup to each participant.

    Parameters:
    - race_id (str): The ID of the race to find.
    - datasource (Datasource): The data source used to build the client.
    - is_female (bool): Whether the race is for females (True) or not (False).
    - category (Optional[str]): The category of the race (optional).
    - day (Optional[int]): The day of the race (optional).
    - with_lineup (bool): Whether to look up lineups for the participants (default: False).

    Returns:
    - Optional[Race]: Whatever the client returns for the race ID; None when no race is found.

    When with_lineup is True and a race was found, the lineups of the race are retrieved and each
    participant gets the lineup whose club equals the participant's name. A participant with zero
    or several matching lineups gets None.
    If lineup retrieval raises NotImplementedError, the race is returned without touching the
    participants' lineups.
    """
    client = Client(source=datasource, is_female=is_female, category=category)
    race = client.get_race_by_id(race_id, day=day)

    if race is None or not with_lineup:
        return race

    try:
        # Materialize the lineups once: find_lineup annotates this same call as returning a
        # Generator, and a generator is exhausted after the first pass. Re-iterating it for every
        # participant would leave all participants but the first without a lineup.
        # Consuming it inside the try also catches a NotImplementedError raised lazily.
        lineups = list(client.get_lineup_by_race_id(race_id))
    except NotImplementedError:
        return race

    for participant in race.participants:
        matches = [lin for lin in lineups if lin.club == participant.participant]
        # Only an unambiguous (single) match is attached.
        participant.lineup = matches[0] if len(matches) == 1 else None

    return race


def parse_race_image(
    path: str,
    datasource: Datasource,
    header_size: int = 3,
    allow_plot: bool = False,
) -> Generator[Race, Any, Any]:
    """
    Parse the races contained in an image file.

    Parameters:
    - path (str): The path to the image file to be processed.
    - datasource (Datasource): The data source used to build the image processor and the parser.
    - header_size (int): The amount of the image that represents the header (default: 3) -> 1/3.
    - allow_plot (bool): Whether to allow plotting for debugging during processing (default: False).

    Returns:
    - Generator[Race, Any, Any]: The result of DataFrameParser.parse_races_from.

    An ImageProcessor retrieves both the header data and the tabular dataframe from the image,
    using the same header size for both. A DataFrameParser then builds the races from the
    dataframe, the header data and the file name (without directory or extension).
    """
    image_processor = ImageProcessor(source=datasource, allow_plot=allow_plot)  # pyright: ignore
    races_parser = DataFrameParser(source=datasource, allow_plot=allow_plot)  # pyright: ignore

    header = image_processor.retrieve_header_data(path=path, header_size=header_size)
    table = image_processor.retrieve_tabular_dataframe(path=path, header_size=header_size)

    # Strip the directory and the extension: only the bare file name is handed to the parser.
    stem, _extension = os.path.splitext(os.path.basename(path))

    return races_parser.parse_races_from(data=table, file_name=stem, header=header)


def find_lineup(race_id: str, datasource: Datasource, is_female: bool) -> Generator[Lineup, Any, Any]:
    """
    Retrieve the lineups of a race.

    Parameters:
    - race_id (str): The ID of the race whose lineups are requested.
    - datasource (Datasource): The data source used to build the client.
    - is_female (bool): Whether the race is for females (True) or not (False).

    Returns:
    - Generator[Lineup, Any, Any]: The result of the client's get_lineup_by_race_id call.
    """
    return Client(source=datasource, is_female=is_female).get_lineup_by_race_id(race_id)


def lemmatize(phrase: str, lang: str = "es") -> list[str]:
    """
    Lemmatize a phrase using the simplemma library. The phrase is preprocessed before lemmatization.

    Synonyms are normalized, conjunctions are removed, symbols are removed, and accents are removed.

    Parameters:
    - phrase (str): The phrase to lemmatize.
    - lang (str): The language of the phrase (default: "es").

    Returns: list[str]: The unique lemmas of the phrase, in order of first appearance.
    """
    phrase = normalize_synonyms(phrase, SYNONYMS)
    phrase = unaccent(remove_symbols(remove_conjunctions(phrase)))
    # dict.fromkeys de-duplicates while keeping first-seen order. list(set(...)) produced an
    # arbitrary order that could change between interpreter runs (string hash randomization).
    return list(dict.fromkeys(text_lemmatizer(phrase, lang=lang)))
from ._functions import find_race as find_race
from ._functions import parse_race_image as parse_race_image
from ._functions import lemmatize as lemmatize
from ._functions import find_lineup as find_lineup
118 changes: 118 additions & 0 deletions rscraping/_functions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
import os
from collections.abc import Generator
from typing import Any

from simplemma.simplemma import text_lemmatizer

from pyutils.strings import normalize_synonyms, remove_conjunctions, remove_symbols, unaccent
from rscraping.clients import Client
from rscraping.data.constants import SYNONYMS
from rscraping.data.models import Datasource, Lineup, Race
from rscraping.ocr import ImageProcessor
from rscraping.parsers.df import DataFrameParser


def find_race(
    race_id: str,
    datasource: Datasource,
    is_female: bool,
    category: str | None = None,
    day: int | None = None,
    with_lineup: bool = False,
) -> Race | None:
    """
    Find a race by its ID, optionally attaching a lineup to each participant.

    Parameters:
    - race_id (str): The ID of the race to find.
    - datasource (Datasource): The data source used to build the client.
    - is_female (bool): Whether the race is for females (True) or not (False).
    - category (Optional[str]): The category of the race (optional).
    - day (Optional[int]): The day of the race (optional).
    - with_lineup (bool): Whether to look up lineups for the participants (default: False).

    Returns:
    - Optional[Race]: Whatever the client returns for the race ID; None when no race is found.

    When with_lineup is True and a race was found, the lineups of the race are retrieved and each
    participant gets the lineup whose club equals the participant's name. A participant with zero
    or several matching lineups gets None.
    If lineup retrieval raises NotImplementedError, the race is returned without touching the
    participants' lineups.
    """
    client = Client(source=datasource, is_female=is_female, category=category)
    race = client.get_race_by_id(race_id, day=day)

    if race is None or not with_lineup:
        return race

    try:
        # Materialize the lineups once: find_lineup annotates this same call as returning a
        # Generator, and a generator is exhausted after the first pass. Re-iterating it for every
        # participant would leave all participants but the first without a lineup.
        # Consuming it inside the try also catches a NotImplementedError raised lazily.
        lineups = list(client.get_lineup_by_race_id(race_id))
    except NotImplementedError:
        return race

    for participant in race.participants:
        matches = [lin for lin in lineups if lin.club == participant.participant]
        # Only an unambiguous (single) match is attached.
        participant.lineup = matches[0] if len(matches) == 1 else None

    return race


def parse_race_image(
    path: str,
    datasource: Datasource,
    header_size: int = 3,
    allow_plot: bool = False,
) -> Generator[Race, Any, Any]:
    """
    Parse the races contained in an image file.

    Parameters:
    - path (str): The path to the image file to be processed.
    - datasource (Datasource): The data source used to build the image processor and the parser.
    - header_size (int): The amount of the image that represents the header (default: 3) -> 1/3.
    - allow_plot (bool): Whether to allow plotting for debugging during processing (default: False).

    Returns:
    - Generator[Race, Any, Any]: The result of DataFrameParser.parse_races_from.

    An ImageProcessor retrieves both the header data and the tabular dataframe from the image,
    using the same header size for both. A DataFrameParser then builds the races from the
    dataframe, the header data and the file name (without directory or extension).
    """
    image_processor = ImageProcessor(source=datasource, allow_plot=allow_plot)  # pyright: ignore
    races_parser = DataFrameParser(source=datasource, allow_plot=allow_plot)  # pyright: ignore

    header = image_processor.retrieve_header_data(path=path, header_size=header_size)
    table = image_processor.retrieve_tabular_dataframe(path=path, header_size=header_size)

    # Strip the directory and the extension: only the bare file name is handed to the parser.
    stem, _extension = os.path.splitext(os.path.basename(path))

    return races_parser.parse_races_from(data=table, file_name=stem, header=header)


def find_lineup(race_id: str, datasource: Datasource, is_female: bool) -> Generator[Lineup, Any, Any]:
    """
    Retrieve the lineups of a race.

    Parameters:
    - race_id (str): The ID of the race whose lineups are requested.
    - datasource (Datasource): The data source used to build the client.
    - is_female (bool): Whether the race is for females (True) or not (False).

    Returns:
    - Generator[Lineup, Any, Any]: The result of the client's get_lineup_by_race_id call.
    """
    return Client(source=datasource, is_female=is_female).get_lineup_by_race_id(race_id)


def lemmatize(phrase: str, lang: str = "es") -> list[str]:
    """
    Lemmatize a phrase using the simplemma library. The phrase is preprocessed before lemmatization.

    Synonyms are normalized, conjunctions are removed, symbols are removed, and accents are removed.

    Parameters:
    - phrase (str): The phrase to lemmatize.
    - lang (str): The language of the phrase (default: "es").

    Returns: list[str]: The unique lemmas of the phrase, in order of first appearance.
    """
    phrase = normalize_synonyms(phrase, SYNONYMS)
    phrase = unaccent(remove_symbols(remove_conjunctions(phrase)))
    # dict.fromkeys de-duplicates while keeping first-seen order. list(set(...)) produced an
    # arbitrary order that could change between interpreter runs (string hash randomization).
    return list(dict.fromkeys(text_lemmatizer(phrase, lang=lang)))
4 changes: 2 additions & 2 deletions rscraping/clients/tabular.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,9 +151,9 @@ def _load_dataframe(self, config: TabularClientConfig) -> pd.DataFrame:
# remove unused dataframe parts
# 1. remove rows with NaN index
# 2. keep only the first 15 columns
# 3. remove columns "Temp." and "Puesto"
# 3. remove column "Puesto"
# 4. remove rows with empty "Nome" column
df = df[df.index.notnull()].iloc[:, :15].drop(columns=["Temp.", "Puesto"]).loc[df[COLUMN_NAME] != ""]
df = df[df.index.notnull()].iloc[:, :15].drop(columns=["Puesto"]).loc[df[COLUMN_NAME] != ""]

return df

Expand Down

0 comments on commit f776370

Please sign in to comment.