Skip to content

Commit

Permalink
refactor gdrive -> tabular & implement more retrieval methods
Browse files Browse the repository at this point in the history
  • Loading branch information
iagocanalejas committed Mar 2, 2024
1 parent 940dc53 commit c0507dd
Show file tree
Hide file tree
Showing 10 changed files with 83 additions and 140 deletions.
5 changes: 1 addition & 4 deletions findlineup.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,7 @@ def _parse_arguments():
parser.add_argument("datasource", type=str, help="Datasource from where to retrieve.")
parser.add_argument("race_id", type=str, help="Race to find.")
parser.add_argument(
"--female",
action="store_true",
default=False,
help="Specifies if we need to search in the female pages.",
"--female", action="store_true", default=False, help="Specifies if we need to search in the female pages."
)
parser.add_argument("--save", action="store_true", default=False, help="Saves the output to a csv file.")
return parser.parse_args()
Expand Down
21 changes: 10 additions & 11 deletions processcsv.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import os
import sys

from rscraping import find_csv_race, parse_race_csv
from rscraping.clients import Client, TabularDataClient
from rscraping.data.functions import save_csv, sys_print_items
from rscraping.data.models import Datasource

Expand All @@ -17,10 +17,7 @@ def _parse_arguments():
parser.add_argument("sheet_id_or_file_path", type=str, help="Google sheet ID or local file path.")
parser.add_argument("race_id", type=str, nargs="?", help="Race to find.")
parser.add_argument(
"--female",
action="store_true",
default=False,
help="Specifies if we need to search in the female pages.",
"--female", action="store_true", default=False, help="Specifies if we need to search in the female pages."
)
parser.add_argument("--sheet_name", type=str, default=None, help="Sheet name.")
parser.add_argument("--save", action="store_true", default=False, help="Saves the output to a csv file.")
Expand All @@ -37,23 +34,25 @@ def main(
sheet_id = sheet_id_or_file_path if not os.path.isfile(sheet_id_or_file_path) else None
file_path = sheet_id_or_file_path if os.path.isfile(sheet_id_or_file_path) else None

client: TabularDataClient = Client(
source=Datasource.TABULAR, gsheet_id=sheet_id, file_path=file_path, sheet_name=sheet_name
) # type: ignore

if race_id:
race = find_csv_race(
race_id, sheet_id=sheet_id, file_path=file_path, is_female=is_female, sheet_name=sheet_name
)
race = client.get_race_by_id(race_id, is_female=is_female)
if not race:
raise ValueError(f"not found race for race_id={race_id}")

if save:
save_csv([race], file_name=f"race_{race_id}_{Datasource.GDRIVE.value.upper()}")
save_csv([race], file_name=f"race_{race_id}_{Datasource.TABULAR.value.upper()}")

sys_print_items([race])
sys.exit(0)

races = list(parse_race_csv(sheet_id=sheet_id, file_path=file_path, is_female=is_female, sheet_name=sheet_name))
races = list(client.get_races(is_female=is_female))

if save:
save_csv(races, file_name=f"race_{race_id}_{Datasource.GDRIVE.value.upper()}")
save_csv(races, file_name=f"race_{race_id}_{Datasource.TABULAR.value.upper()}")

sys_print_items(races)

Expand Down
61 changes: 4 additions & 57 deletions rscraping/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,23 +3,17 @@
from collections.abc import Generator

from pyutils.strings import normalize_synonyms, remove_conjunctions, remove_symbols
from pyutils.shortcuts import only_one_not_none
from rscraping.clients import Client
from rscraping.clients.gdrive import GoogleDriveClient
from rscraping.data.constants import SYNONYMS
from rscraping.data.models import Datasource, Lineup, Race
from simplemma.simplemma import text_lemmatizer
from rscraping.clients import Client

from rscraping.ocr import ImageProcessor
from rscraping.parsers.df import DataFrameParser


def find_race(
race_id: str,
datasource: Datasource,
is_female: bool,
day: int | None = None,
with_lineup: bool = False,
race_id: str, datasource: Datasource, is_female: bool, day: int | None = None, with_lineup: bool = False
) -> Race | None:
"""
Find a race based on the provided parameters.
Expand All @@ -42,7 +36,7 @@ def find_race(
the function will proceed without adding lineup information.
"""

client = Client(source=datasource) # type: ignore
client = Client(source=datasource)
race = client.get_race_by_id(race_id, day=day, is_female=is_female)

if race is not None and with_lineup:
Expand All @@ -57,55 +51,8 @@ def find_race(
return race


def find_csv_race(
    race_id: str, sheet_id: str | None, file_path: str | None, is_female: bool, sheet_name: str | None = None
) -> Race | None:
    """
    Look up a single race, identified by `race_id`, inside a Google Sheet or a local file.

    Parameters:
    - race_id (str): The ID of the race to look up.
    - sheet_id (Optional[str]): The ID of the Google Sheet to search in.
    - file_path (Optional[str]): The path of the local file to search in.
    - is_female (bool): True to search the female competition data.
    - sheet_name (Optional[str]): The name of the sheet to search in (optional).

    Returns:
    - Optional[Race]: The matching Race object, or None when no race is found.

    Retrieval is delegated to a GoogleDriveClient instance.
    """
    drive_client = GoogleDriveClient()
    return drive_client.get_race_by_id(
        race_id,
        sheet_id=sheet_id,
        file_path=file_path,
        sheet_name=sheet_name,
        is_female=is_female,
    )


def parse_race_csv(
    *_, sheet_id: str | None, file_path: str | None, is_female: bool, sheet_name: str | None
) -> Generator[Race, Any, Any]:
    """
    Lazily retrieve every race stored in a Google Sheet or a local file.

    Parameters:
    - sheet_id (Optional[str]): The ID of the Google Sheet to parse.
    - file_path (Optional[str]): The path of the local file to search in.
    - is_female (bool): True to read the female competition data.
    - sheet_name (Optional[str]): The name of the sheet to parse (optional).

    Returns:
    - Generator[Race, Any, Any]: Yields Race objects as they are parsed.

    Retrieval is delegated to a GoogleDriveClient instance.
    """
    drive_client = GoogleDriveClient()
    return drive_client.get_races(
        sheet_id=sheet_id,
        file_path=file_path,
        sheet_name=sheet_name,
        is_female=is_female,
    )


def parse_race_image(
path: str,
datasource: Datasource,
header_size: int = 3,
allow_plot: bool = False,
path: str, datasource: Datasource, header_size: int = 3, allow_plot: bool = False
) -> Generator[Race, Any, Any]:
"""
Parse race information from an image file using provided data source and parameters.
Expand Down
2 changes: 1 addition & 1 deletion rscraping/clients/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,4 @@
from .arc import ARCClient # pyright: ignore
from .lgt import LGTClient # pyright: ignore
from .traineras import TrainerasClient # pyright: ignore
from .gdrive import GoogleDriveClient # pyright: ignore
from .tabular import TabularDataClient # pyright: ignore
96 changes: 43 additions & 53 deletions rscraping/clients/gdrive.py → rscraping/clients/tabular.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,23 +20,24 @@
COLUMN_ORGANIZER,
COLUMN_TIME,
COLUMN_TYPE,
GoogleDriveDataFrameParser,
TabularDataFrameParser,
)
from rscraping.parsers.html import HtmlParser
from rscraping.parsers.pdf import PdfParser

from ._client import ClientProtocol
from ._client import Client


class GoogleDriveClient(ClientProtocol):
DATASOURCE = Datasource.GDRIVE
class TabularDataClient(Client, source=Datasource.TABULAR):
DATASOURCE = Datasource.TABULAR
FEMALE_START = 2015
MALE_START = 2011

_df: pd.DataFrame
_df_types: dict[Any, Callable] = {
"N": lambda x: int(x) if x else None,
COLUMN_CLUB: lambda x: str(x) if x else None,
COLUMN_DATE: lambda x: pd.to_datetime(x, format="%d/%m/%Y").date() if x and x != "-" else None,
COLUMN_DATE: lambda x: pd.to_datetime(x, format="%d/%m/%Y") if x and x != "-" else None,
COLUMN_LEAGUE: lambda x: str(x) if x and x != "-" else None,
COLUMN_EDITION: lambda x: roman_to_int(x) if x and x != "-" else None,
COLUMN_NAME: lambda x: str(x) if x else None,
Expand All @@ -50,64 +51,61 @@ class GoogleDriveClient(ClientProtocol):
}

@property
def _parser(self) -> GoogleDriveDataFrameParser:
return GoogleDriveDataFrameParser()

@override
def validate_year(self, year: int, is_female: bool):
since = self.FEMALE_START if is_female else self.MALE_START
today = date.today().year
if year < since or year > today:
raise ValueError(f"invalid 'year', available values are [{since}, {today}]")
def _parser(self) -> TabularDataFrameParser:
    """Parser used to turn tabular rows into Race objects (see get_race_by_id / get_races)."""
    return TabularDataFrameParser()

@override
@staticmethod
def get_race_details_url(*_, sheet_id: str, sheet_name: str | None = None, **kwargs) -> str:
url = f"https://docs.google.com/spreadsheets/d/{sheet_id}/gviz/tq?tqx=out:csv"
def get_race_details_url(*_, gsheet_id: str, sheet_name: str | None = None, **kwargs) -> str:
url = f"https://docs.google.com/spreadsheets/d/{gsheet_id}/gviz/tq?tqx=out:csv"
if sheet_name:
url += f"&sheet={sheet_name}"
return url

@override
def get_race_by_id(
self,
race_id: str,
*_,
sheet_id: str | None = None,
file_path: str | None = None,
sheet_name: str | None = None,
is_female: bool = False,
**kwargs,
) -> Race | None:
if not only_one_not_none(sheet_id, file_path):
def __init__(
    self, *_, gsheet_id: str | None = None, file_path: str | None = None, sheet_name: str | None = None, **kwargs
) -> None:
    """
    Build a client backed by tabular data loaded from a Google Sheet or a local file.

    Parameters:
    - gsheet_id (Optional[str]): Google Sheet ID to load the data from.
    - file_path (Optional[str]): Path of a local file to load the data from.
    - sheet_name (Optional[str]): Sheet name inside the spreadsheet (optional).

    Raises:
    - ValueError: When not exactly one of 'gsheet_id' or 'file_path' is given.
    """
    # `only_one_not_none` rejects both the "neither given" and the "both given" cases,
    # so the message states the exact contract. The old message still referenced the
    # pre-rename parameter name "sheet_id" and misdescribed the check as mere exclusivity.
    if not only_one_not_none(gsheet_id, file_path):
        raise ValueError("exactly one of 'gsheet_id' or 'file_path' must be provided")
    self._df = self._load_dataframe(gsheet_id=gsheet_id, file_path=file_path, sheet_name=sheet_name)
    super().__init__(**kwargs)

df = self._load_dataframe(sheet_id=sheet_id, file_path=file_path, sheet_name=sheet_name)
race_row = df.iloc[int(race_id) - 1]
@override
def validate_year(self, year: int, is_female: bool):
    """Raise ValueError unless `year` falls inside the supported range for the given gender."""
    # Female data starts later than male data, hence the gender-dependent lower bound.
    lower_bound = self.FEMALE_START if is_female else self.MALE_START
    upper_bound = date.today().year
    if not (lower_bound <= year <= upper_bound):
        raise ValueError(f"invalid 'year', available values are [{lower_bound}, {upper_bound}]")

@override
def get_race_by_id(self, race_id: str, *_, is_female: bool = False, **kwargs) -> Race | None:
    """Return the race stored at the 1-based position `race_id`, parsed into a Race (or None)."""
    # Race IDs are 1-based positions in the sheet, while iloc is 0-based — hence the -1.
    row = self._df.iloc[int(race_id) - 1]
    return self._parser.parse_race_serie(row, is_female=is_female)

def get_races(
self,
sheet_id: str | None = None,
file_path: str | None = None,
sheet_name: str | None = None,
is_female: bool = False,
**kwargs,
) -> Generator[Race, Any, Any]:
if not only_one_not_none(sheet_id, file_path):
raise ValueError("sheet_id and file_path are mutually exclusive")
def get_races(self, is_female: bool = False, **kwargs) -> Generator[Race, Any, Any]:
    """Yield every race found in the loaded DataFrame as a Race object."""
    parser = self._parser
    return parser.parse_races_from(self._df, is_female=is_female)

def get_race_ids_by_year(self, year: int, *_, **kwargs) -> Generator[str, Any, Any]:
    """Yield the ID (the DataFrame index label) of every race held in `year`."""
    matching = self._df[self._df[COLUMN_DATE].dt.year == year]
    # iterrows yields (index_label, row); the label doubles as the race ID.
    for label, _row in matching.iterrows():
        yield str(label)

def get_race_names_by_year(self, year: int, *_, **kwargs) -> Generator[RaceName, Any, Any]:
    """Yield a RaceName (race ID plus display name) for every race held in `year`."""
    matching = self._df[self._df[COLUMN_DATE].dt.year == year]
    for label, row in matching.iterrows():
        yield RaceName(race_id=str(label), name=str(row[COLUMN_NAME]))

df = self._load_dataframe(sheet_id=sheet_id, file_path=file_path, sheet_name=sheet_name)
return self._parser.parse_races_from(df, is_female=is_female)
################################################
############## PRIVATE METHODS #################
################################################

def _load_dataframe(
self, sheet_id: str | None = None, file_path: str | None = None, sheet_name: str | None = None
self, gsheet_id: str | None = None, file_path: str | None = None, sheet_name: str | None = None
) -> pd.DataFrame:
df = None

if sheet_id:
url = self.get_race_details_url(sheet_id=sheet_id, sheet_name=sheet_name)
if gsheet_id:
url = self.get_race_details_url(gsheet_id=gsheet_id, sheet_name=sheet_name)
df = pd.read_csv(url, header=0, index_col=0, converters=self._df_types).fillna("")

if file_path:
Expand Down Expand Up @@ -139,14 +137,6 @@ def _html_parser(self) -> HtmlParser:
def _pdf_parser(self) -> PdfParser:
raise NotImplementedError

def get_race_ids_by_year(self, year: int, is_female: bool | None = None, **kwargs) -> Generator[str, Any, Any]:
raise NotImplementedError

def get_race_names_by_year(
self, year: int, is_female: bool | None = None, **kwargs
) -> Generator[RaceName, Any, Any]:
raise NotImplementedError

def get_race_ids_by_rower(self, rower_id: str, **kwargs) -> Generator[str, Any, Any]:
raise NotImplementedError

Expand Down
2 changes: 1 addition & 1 deletion rscraping/data/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ class Datasource(StrEnum):
ABE = auto()
TRAINERAS = auto()
INFOREMO = auto()
GDRIVE = auto()
TABULAR = auto()

@classmethod
def has_value(cls, value: str) -> bool:
Expand Down
4 changes: 2 additions & 2 deletions rscraping/parsers/df/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from ._parser import DataFrameParser, DataFrameParserProtocol # pyright: ignore
from .inforemo import InforemoDataFrameParser # pyright: ignore
from .gdrive import (
GoogleDriveDataFrameParser,
from .tabular import (
TabularDataFrameParser,
COLUMN_CLUB,
COLUMN_ORGANIZER,
COLUMN_TYPE,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@
COLUMN_NUMBER_LAPS = "N Largos"


class GoogleDriveDataFrameParser(DataFrameParserProtocol):
class TabularDataFrameParser(DataFrameParserProtocol):
def parse_race_serie(self, row: Series, is_female: bool = False) -> Race | None:
if not isinstance(row, Series):
return None
Expand All @@ -61,7 +61,7 @@ def parse_race_serie(self, row: Series, is_female: bool = False) -> Race | None:
race_id=str(int(row.name)), # pyright: ignore
url=None,
gender=GENDER_FEMALE if is_female else GENDER_MALE,
datasource=Datasource.GDRIVE.value,
datasource=Datasource.TABULAR.value,
cancelled=False,
race_laps=int_or_none(str(row[COLUMN_NUMBER_LAPS])),
race_lanes=int_or_none(str(row[COLUMN_NUMBER_LANES])),
Expand Down
12 changes: 11 additions & 1 deletion tests/clients/client_test.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import unittest

from rscraping.clients import ABEClient, ACTClient, ARCClient, Client, LGTClient, TrainerasClient
from rscraping.clients import ABEClient, ACTClient, ARCClient, Client, LGTClient, TabularDataClient, TrainerasClient
from rscraping.data.models import Datasource


Expand All @@ -11,3 +11,13 @@ def test_client_initialization(self):
self.assertTrue(isinstance(Client(source=Datasource.ARC), ARCClient))
self.assertTrue(isinstance(Client(source=Datasource.ABE), ABEClient))
self.assertTrue(isinstance(Client(source=Datasource.LGT), LGTClient))

with self.assertRaises(ValueError):
TabularDataClient(source=Datasource.TABULAR, gsheet_id="1", file_path="1")

TabularDataClient._load_dataframe = self._load_dataframe # type: ignore
self.assertTrue(isinstance(Client(source=Datasource.TABULAR, gsheet_id="1"), TabularDataClient))

# testing replacement for _load_dataframe
def _load_dataframe(*_, **__):
return None
Loading

0 comments on commit c0507dd

Please sign in to comment.