Skip to content

Commit

Permalink
refactor gdrive -> tabular & implement more retrieval methods
Browse files Browse the repository at this point in the history
  • Loading branch information
iagocanalejas committed Mar 2, 2024
1 parent 940dc53 commit c0507dd
Show file tree
Hide file tree
Showing 10 changed files with 83 additions and 140 deletions.
5 changes: 1 addition & 4 deletions findlineup.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,7 @@ def _parse_arguments():
parser.add_argument("datasource", type=str, help="Datasource from where to retrieve.")
parser.add_argument("race_id", type=str, help="Race to find.")
parser.add_argument(
"--female",
action="store_true",
default=False,
help="Specifies if we need to search in the female pages.",
"--female", action="store_true", default=False, help="Specifies if we need to search in the female pages."
)
parser.add_argument("--save", action="store_true", default=False, help="Saves the output to a csv file.")
return parser.parse_args()
Expand Down
21 changes: 10 additions & 11 deletions processcsv.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import os
import sys

from rscraping import find_csv_race, parse_race_csv
from rscraping.clients import Client, TabularDataClient
from rscraping.data.functions import save_csv, sys_print_items
from rscraping.data.models import Datasource

Expand All @@ -17,10 +17,7 @@ def _parse_arguments():
parser.add_argument("sheet_id_or_file_path", type=str, help="Google sheet ID or local file path.")
parser.add_argument("race_id", type=str, nargs="?", help="Race to find.")
parser.add_argument(
"--female",
action="store_true",
default=False,
help="Specifies if we need to search in the female pages.",
"--female", action="store_true", default=False, help="Specifies if we need to search in the female pages."
)
parser.add_argument("--sheet_name", type=str, default=None, help="Sheet name.")
parser.add_argument("--save", action="store_true", default=False, help="Saves the output to a csv file.")
Expand All @@ -37,23 +34,25 @@ def main(
sheet_id = sheet_id_or_file_path if not os.path.isfile(sheet_id_or_file_path) else None
file_path = sheet_id_or_file_path if os.path.isfile(sheet_id_or_file_path) else None

client: TabularDataClient = Client(
source=Datasource.TABULAR, gsheet_id=sheet_id, file_path=file_path, sheet_name=sheet_name
) # type: ignore

if race_id:
race = find_csv_race(
race_id, sheet_id=sheet_id, file_path=file_path, is_female=is_female, sheet_name=sheet_name
)
race = client.get_race_by_id(race_id, is_female=is_female)
if not race:
raise ValueError(f"not found race for race_id={race_id}")

if save:
save_csv([race], file_name=f"race_{race_id}_{Datasource.GDRIVE.value.upper()}")
save_csv([race], file_name=f"race_{race_id}_{Datasource.TABULAR.value.upper()}")

sys_print_items([race])
sys.exit(0)

races = list(parse_race_csv(sheet_id=sheet_id, file_path=file_path, is_female=is_female, sheet_name=sheet_name))
races = list(client.get_races(is_female=is_female))

if save:
save_csv(races, file_name=f"race_{race_id}_{Datasource.GDRIVE.value.upper()}")
save_csv(races, file_name=f"race_{race_id}_{Datasource.TABULAR.value.upper()}")

sys_print_items(races)

Expand Down
61 changes: 4 additions & 57 deletions rscraping/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,23 +3,17 @@
from collections.abc import Generator

from pyutils.strings import normalize_synonyms, remove_conjunctions, remove_symbols
from pyutils.shortcuts import only_one_not_none
from rscraping.clients import Client
from rscraping.clients.gdrive import GoogleDriveClient
from rscraping.data.constants import SYNONYMS
from rscraping.data.models import Datasource, Lineup, Race
from simplemma.simplemma import text_lemmatizer
from rscraping.clients import Client

from rscraping.ocr import ImageProcessor
from rscraping.parsers.df import DataFrameParser


def find_race(
race_id: str,
datasource: Datasource,
is_female: bool,
day: int | None = None,
with_lineup: bool = False,
race_id: str, datasource: Datasource, is_female: bool, day: int | None = None, with_lineup: bool = False
) -> Race | None:
"""
Find a race based on the provided parameters.
Expand All @@ -42,7 +36,7 @@ def find_race(
the function will proceed without adding lineup information.
"""

client = Client(source=datasource) # type: ignore
client = Client(source=datasource)
race = client.get_race_by_id(race_id, day=day, is_female=is_female)

if race is not None and with_lineup:
Expand All @@ -57,55 +51,8 @@ def find_race(
return race


def find_csv_race(
    race_id: str, sheet_id: str | None, file_path: str | None, is_female: bool, sheet_name: str | None = None
) -> Race | None:
    """
    Look up a single race, identified by `race_id`, inside a Google Sheet or a local file.

    Parameters:
    - race_id (str): The ID of the race to look up.
    - sheet_id (Optional[str]): The ID of the Google Sheet to search in.
    - file_path (Optional[str]): The path of the local file to search in.
    - is_female (bool): True to search the female competition data.
    - sheet_name (Optional[str]): The name of the sheet to search in (optional).

    Returns:
    - Optional[Race]: The matching Race object, or None when no race is found.

    Retrieval is delegated to a GoogleDriveClient instance.
    """
    drive_client = GoogleDriveClient()
    return drive_client.get_race_by_id(
        race_id,
        sheet_id=sheet_id,
        file_path=file_path,
        sheet_name=sheet_name,
        is_female=is_female,
    )


def parse_race_csv(
    *_, sheet_id: str | None, file_path: str | None, is_female: bool, sheet_name: str | None
) -> Generator[Race, Any, Any]:
    """
    Lazily retrieve every race stored in a Google Sheet or a local file.

    Parameters:
    - sheet_id (Optional[str]): The ID of the Google Sheet to parse.
    - file_path (Optional[str]): The path of the local file to search in.
    - is_female (bool): True to read the female competition data.
    - sheet_name (Optional[str]): The name of the sheet to parse (optional).

    Returns:
    - Generator[Race, Any, Any]: Yields Race objects as they are parsed.

    Retrieval is delegated to a GoogleDriveClient instance.
    """
    drive_client = GoogleDriveClient()
    return drive_client.get_races(
        sheet_id=sheet_id,
        file_path=file_path,
        sheet_name=sheet_name,
        is_female=is_female,
    )


def parse_race_image(
path: str,
datasource: Datasource,
header_size: int = 3,
allow_plot: bool = False,
path: str, datasource: Datasource, header_size: int = 3, allow_plot: bool = False
) -> Generator[Race, Any, Any]:
"""
Parse race information from an image file using provided data source and parameters.
Expand Down
2 changes: 1 addition & 1 deletion rscraping/clients/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,4 @@
from .arc import ARCClient # pyright: ignore
from .lgt import LGTClient # pyright: ignore
from .traineras import TrainerasClient # pyright: ignore
from .gdrive import GoogleDriveClient # pyright: ignore
from .tabular import TabularDataClient # pyright: ignore
96 changes: 43 additions & 53 deletions rscraping/clients/gdrive.py → rscraping/clients/tabular.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,23 +20,24 @@
COLUMN_ORGANIZER,
COLUMN_TIME,
COLUMN_TYPE,
GoogleDriveDataFrameParser,
TabularDataFrameParser,
)
from rscraping.parsers.html import HtmlParser
from rscraping.parsers.pdf import PdfParser

from ._client import ClientProtocol
from ._client import Client


class GoogleDriveClient(ClientProtocol):
DATASOURCE = Datasource.GDRIVE
class TabularDataClient(Client, source=Datasource.TABULAR):
DATASOURCE = Datasource.TABULAR
FEMALE_START = 2015
MALE_START = 2011

_df: pd.DataFrame
_df_types: dict[Any, Callable] = {
"N": lambda x: int(x) if x else None,
COLUMN_CLUB: lambda x: str(x) if x else None,
COLUMN_DATE: lambda x: pd.to_datetime(x, format="%d/%m/%Y").date() if x and x != "-" else None,
COLUMN_DATE: lambda x: pd.to_datetime(x, format="%d/%m/%Y") if x and x != "-" else None,
COLUMN_LEAGUE: lambda x: str(x) if x and x != "-" else None,
COLUMN_EDITION: lambda x: roman_to_int(x) if x and x != "-" else None,
COLUMN_NAME: lambda x: str(x) if x else None,
Expand All @@ -50,64 +51,61 @@ class GoogleDriveClient(ClientProtocol):
}

@property
def _parser(self) -> GoogleDriveDataFrameParser:
return GoogleDriveDataFrameParser()

@override
def validate_year(self, year: int, is_female: bool):
since = self.FEMALE_START if is_female else self.MALE_START
today = date.today().year
if year < since or year > today:
raise ValueError(f"invalid 'year', available values are [{since}, {today}]")
def _parser(self) -> TabularDataFrameParser:
    """Parser used to turn tabular rows into Race objects (see get_race_by_id / get_races)."""
    return TabularDataFrameParser()

@override
@staticmethod
def get_race_details_url(*_, sheet_id: str, sheet_name: str | None = None, **kwargs) -> str:
url = f"https://docs.google.com/spreadsheets/d/{sheet_id}/gviz/tq?tqx=out:csv"
def get_race_details_url(*_, gsheet_id: str, sheet_name: str | None = None, **kwargs) -> str:
url = f"https://docs.google.com/spreadsheets/d/{gsheet_id}/gviz/tq?tqx=out:csv"
if sheet_name:
url += f"&sheet={sheet_name}"
return url

@override
def get_race_by_id(
self,
race_id: str,
*_,
sheet_id: str | None = None,
file_path: str | None = None,
sheet_name: str | None = None,
is_female: bool = False,
**kwargs,
) -> Race | None:
if not only_one_not_none(sheet_id, file_path):
def __init__(
    self, *_, gsheet_id: str | None = None, file_path: str | None = None, sheet_name: str | None = None, **kwargs
) -> None:
    """
    Build a client backed by tabular data loaded from a Google Sheet or a local file.

    Parameters:
    - gsheet_id (Optional[str]): Google Sheet ID to load the data from.
    - file_path (Optional[str]): Path of a local file to load the data from.
    - sheet_name (Optional[str]): Sheet name inside the spreadsheet (optional).

    Raises:
    - ValueError: When not exactly one of 'gsheet_id' or 'file_path' is given.
    """
    # `only_one_not_none` rejects both the "neither given" and the "both given" cases,
    # so the message states the exact contract. The old message still referenced the
    # pre-rename parameter name "sheet_id" and misdescribed the check as mere exclusivity.
    if not only_one_not_none(gsheet_id, file_path):
        raise ValueError("exactly one of 'gsheet_id' or 'file_path' must be provided")
    self._df = self._load_dataframe(gsheet_id=gsheet_id, file_path=file_path, sheet_name=sheet_name)
    super().__init__(**kwargs)

df = self._load_dataframe(sheet_id=sheet_id, file_path=file_path, sheet_name=sheet_name)
race_row = df.iloc[int(race_id) - 1]
@override
def validate_year(self, year: int, is_female: bool):
    """Raise ValueError unless `year` falls inside the supported range for the given gender."""
    # Female data starts later than male data, hence the gender-dependent lower bound.
    lower_bound = self.FEMALE_START if is_female else self.MALE_START
    upper_bound = date.today().year
    if not (lower_bound <= year <= upper_bound):
        raise ValueError(f"invalid 'year', available values are [{lower_bound}, {upper_bound}]")

@override
def get_race_by_id(self, race_id: str, *_, is_female: bool = False, **kwargs) -> Race | None:
    """Return the race stored at the 1-based position `race_id`, parsed into a Race (or None)."""
    # Race IDs are 1-based positions in the sheet, while iloc is 0-based — hence the -1.
    row = self._df.iloc[int(race_id) - 1]
    return self._parser.parse_race_serie(row, is_female=is_female)

def get_races(
self,
sheet_id: str | None = None,
file_path: str | None = None,
sheet_name: str | None = None,
is_female: bool = False,
**kwargs,
) -> Generator[Race, Any, Any]:
if not only_one_not_none(sheet_id, file_path):
raise ValueError("sheet_id and file_path are mutually exclusive")
def get_races(self, is_female: bool = False, **kwargs) -> Generator[Race, Any, Any]:
    """Yield every race found in the loaded DataFrame as a Race object."""
    parser = self._parser
    return parser.parse_races_from(self._df, is_female=is_female)

def get_race_ids_by_year(self, year: int, *_, **kwargs) -> Generator[str, Any, Any]:
    """Yield the ID (the DataFrame index label) of every race held in `year`."""
    matching = self._df[self._df[COLUMN_DATE].dt.year == year]
    # iterrows yields (index_label, row); the label doubles as the race ID.
    for label, _row in matching.iterrows():
        yield str(label)

def get_race_names_by_year(self, year: int, *_, **kwargs) -> Generator[RaceName, Any, Any]:
    """Yield a RaceName (race ID plus display name) for every race held in `year`."""
    matching = self._df[self._df[COLUMN_DATE].dt.year == year]
    for label, row in matching.iterrows():
        yield RaceName(race_id=str(label), name=str(row[COLUMN_NAME]))

df = self._load_dataframe(sheet_id=sheet_id, file_path=file_path, sheet_name=sheet_name)
return self._parser.parse_races_from(df, is_female=is_female)
################################################
############## PRIVATE METHODS #################
################################################

def _load_dataframe(
self, sheet_id: str | None = None, file_path: str | None = None, sheet_name: str | None = None
self, gsheet_id: str | None = None, file_path: str | None = None, sheet_name: str | None = None
) -> pd.DataFrame:
df = None

if sheet_id:
url = self.get_race_details_url(sheet_id=sheet_id, sheet_name=sheet_name)
if gsheet_id:
url = self.get_race_details_url(gsheet_id=gsheet_id, sheet_name=sheet_name)
df = pd.read_csv(url, header=0, index_col=0, converters=self._df_types).fillna("")

if file_path:
Expand Down Expand Up @@ -139,14 +137,6 @@ def _html_parser(self) -> HtmlParser:
def _pdf_parser(self) -> PdfParser:
raise NotImplementedError

def get_race_ids_by_year(self, year: int, is_female: bool | None = None, **kwargs) -> Generator[str, Any, Any]:
raise NotImplementedError

def get_race_names_by_year(
self, year: int, is_female: bool | None = None, **kwargs
) -> Generator[RaceName, Any, Any]:
raise NotImplementedError

def get_race_ids_by_rower(self, rower_id: str, **kwargs) -> Generator[str, Any, Any]:
raise NotImplementedError

Expand Down
2 changes: 1 addition & 1 deletion rscraping/data/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ class Datasource(StrEnum):
ABE = auto()
TRAINERAS = auto()
INFOREMO = auto()
GDRIVE = auto()
TABULAR = auto()

@classmethod
def has_value(cls, value: str) -> bool:
Expand Down
4 changes: 2 additions & 2 deletions rscraping/parsers/df/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from ._parser import DataFrameParser, DataFrameParserProtocol # pyright: ignore
from .inforemo import InforemoDataFrameParser # pyright: ignore
from .gdrive import (
GoogleDriveDataFrameParser,
from .tabular import (
TabularDataFrameParser,
COLUMN_CLUB,
COLUMN_ORGANIZER,
COLUMN_TYPE,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@
COLUMN_NUMBER_LAPS = "N Largos"


class GoogleDriveDataFrameParser(DataFrameParserProtocol):
class TabularDataFrameParser(DataFrameParserProtocol):
def parse_race_serie(self, row: Series, is_female: bool = False) -> Race | None:
if not isinstance(row, Series):
return None
Expand All @@ -61,7 +61,7 @@ def parse_race_serie(self, row: Series, is_female: bool = False) -> Race | None:
race_id=str(int(row.name)), # pyright: ignore
url=None,
gender=GENDER_FEMALE if is_female else GENDER_MALE,
datasource=Datasource.GDRIVE.value,
datasource=Datasource.TABULAR.value,
cancelled=False,
race_laps=int_or_none(str(row[COLUMN_NUMBER_LAPS])),
race_lanes=int_or_none(str(row[COLUMN_NUMBER_LANES])),
Expand Down
12 changes: 11 additions & 1 deletion tests/clients/client_test.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import unittest

from rscraping.clients import ABEClient, ACTClient, ARCClient, Client, LGTClient, TrainerasClient
from rscraping.clients import ABEClient, ACTClient, ARCClient, Client, LGTClient, TabularDataClient, TrainerasClient
from rscraping.data.models import Datasource


Expand All @@ -11,3 +11,13 @@ def test_client_initialization(self):
self.assertTrue(isinstance(Client(source=Datasource.ARC), ARCClient))
self.assertTrue(isinstance(Client(source=Datasource.ABE), ABEClient))
self.assertTrue(isinstance(Client(source=Datasource.LGT), LGTClient))

with self.assertRaises(ValueError):
TabularDataClient(source=Datasource.TABULAR, gsheet_id="1", file_path="1")

TabularDataClient._load_dataframe = self._load_dataframe # type: ignore
self.assertTrue(isinstance(Client(source=Datasource.TABULAR, gsheet_id="1"), TabularDataClient))

# testing replacement for _load_dataframe
def _load_dataframe(*_, **__):
return None
Loading

0 comments on commit c0507dd

Please sign in to comment.