Skip to content

Commit

Permalink
Benchmark Supported Data Types (#2206)
Browse files Browse the repository at this point in the history
  • Loading branch information
pvk-developer authored Sep 25, 2024
1 parent 79961a9 commit 2f9c842
Show file tree
Hide file tree
Showing 17 changed files with 1,497 additions and 14 deletions.
80 changes: 80 additions & 0 deletions .github/workflows/dtypes_benchmark.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
name: Data Types Benchmark

on:
  push:
    branches:
      - main

jobs:
  run_dtypes_benchmark:
    runs-on: ubuntu-latest

    strategy:
      matrix:
        python-version: ['3.8', '3.9', '3.10', '3.11', '3.12']

    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v4
        with:
          python-version: ${{ matrix.python-version }}

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          python -m pip install invoke .[test]

      # Pre-create the per-version results file the benchmark writes into
      - name: Create folder and JSON file
        run: |
          mkdir -p results
          touch results/${{ matrix.python-version }}.json

      # Run the benchmarking
      - name: Benchmark Data Types
        env:
          PYDRIVE_CREDENTIALS: ${{ secrets.PYDRIVE_CREDENTIALS }}
        run: |
          invoke benchmark-dtypes

      # Upload the json files as artifacts
      - name: Upload artifacts
        uses: actions/upload-artifact@v3
        with:
          name: results-${{ matrix.python-version }}
          path: results/*.json

  generate_dtypes_report:
    runs-on: ubuntu-latest
    needs: run_dtypes_benchmark

    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      # Set up Python 3.10
      - name: Set up Python 3.10
        uses: actions/setup-python@v4
        with:
          python-version: '3.10'

      - name: Install dependencies for report
        run: |
          python -m pip install --upgrade pip
          python -m pip install .[test]

      # Download the artifacts from every matrix job into results/
      - name: Download artifacts
        uses: actions/download-artifact@v3
        with:
          path: results/

      # Generate the report
      - name: Generate the report
        env:
          PYDRIVE_CREDENTIALS: ${{ secrets.PYDRIVE_CREDENTIALS }}
          SLACK_TOKEN: ${{ secrets.SLACK_TOKEN }}
        run: python -m tests.benchmark.utils
7 changes: 6 additions & 1 deletion .github/workflows/integration.yml
Original file line number Diff line number Diff line change
Expand Up @@ -29,4 +29,9 @@ jobs:
python -m pip install --upgrade pip
python -m pip install invoke .[test]
- name: Run integration tests
run: invoke integration
env:
PYDRIVE_CREDENTIALS: ${{ secrets.PYDRIVE_CREDENTIALS }}

run: |
invoke integration
invoke benchmark-dtypes
5 changes: 5 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,10 @@ test = [
'rundoc>=0.4.3,<0.5',
'pytest-runner >= 2.11.1',
'tomli>=2.0.0,<3',
'pydrive',
'pyarrow',
'gitpython',
'slack-sdk>=3.23,<4.0',
]
pomegranate = ['pomegranate>=0.14.3,<0.15']
dev = [
Expand Down Expand Up @@ -181,6 +185,7 @@ exclude = [
".tox",
".git",
"__pycache__",
"*.ipynb",
".ipynb_checkpoints",
"tasks.py",
]
Expand Down
9 changes: 9 additions & 0 deletions sdv/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from pathlib import Path

import pandas as pd
from pandas.api.types import is_float, is_integer
from pandas.core.tools.datetimes import _guess_datetime_format_for_array
from rdt.transformers.utils import _GENERATORS

Expand Down Expand Up @@ -440,3 +441,11 @@ def get_possible_chars(regex, num_subpatterns=None):
possible_chars += _get_chars_for_option(option, params)

return possible_chars


def _is_numerical(value):
"""Determine if the input is a numerical type or not."""
try:
return is_integer(value) or is_float(value)
except Exception:
return False
20 changes: 7 additions & 13 deletions sdv/constraints/tabular.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@
import numpy as np
import pandas as pd

from sdv._utils import _convert_to_timedelta, _create_unique_name, _is_datetime_type
from sdv._utils import _convert_to_timedelta, _create_unique_name, _is_datetime_type, _is_numerical
from sdv.constraints.base import Constraint
from sdv.constraints.errors import (
AggregateConstraintsError,
Expand Down Expand Up @@ -604,7 +604,7 @@ def _validate_metadata_specific_to_constraint(metadata, **kwargs):
sdtype = metadata.columns.get(column_name, {}).get('sdtype')
value = kwargs.get('value')
if sdtype == 'numerical':
if not isinstance(value, (int, float)):
if not _is_numerical(value):
raise ConstraintMetadataError("'value' must be an int or float.")

elif sdtype == 'datetime':
Expand Down Expand Up @@ -632,7 +632,7 @@ def _validate_init_inputs(column_name, value, relation):
if relation not in ['>', '>=', '<', '<=']:
raise ValueError('`relation` must be one of the following: `>`, `>=`, `<`, `<=`')

if not (isinstance(value, (int, float)) or value_is_datetime):
if not (_is_numerical(value) or value_is_datetime):
raise ValueError('`value` must be a number or a string that represents a datetime.')

if value_is_datetime and not isinstance(value, str):
Expand Down Expand Up @@ -1071,9 +1071,7 @@ def _validate_init_inputs(low_value, high_value):
if values_are_datetimes and not values_are_strings:
raise ValueError('Datetime must be represented as a string.')

values_are_numerical = bool(
isinstance(low_value, (int, float)) and isinstance(high_value, (int, float))
)
values_are_numerical = bool(_is_numerical(low_value) and _is_numerical(high_value))
if not (values_are_numerical or values_are_datetimes):
raise ValueError(
'``low_value`` and ``high_value`` must be a number or a string that '
Expand All @@ -1092,7 +1090,7 @@ def _validate_metadata_specific_to_constraint(metadata, **kwargs):
high_value = kwargs.get('high_value')
low_value = kwargs.get('low_value')
if sdtype == 'numerical':
if not isinstance(high_value, (int, float)) or not isinstance(low_value, (int, float)):
if not _is_numerical(high_value) or not _is_numerical(low_value):
raise ConstraintMetadataError(
"Both 'high_value' and 'low_value' must be ints or floats"
)
Expand Down Expand Up @@ -1187,11 +1185,7 @@ def is_valid(self, table_data):
self._operator(data, self._high_value),
pd.isna(self._high_value),
)

return np.logical_or(
np.logical_and(satisfy_low_bound, satisfy_high_bound),
pd.isna(data),
)
return (satisfy_low_bound & satisfy_high_bound) | pd.isna(data)

def _transform(self, table_data):
"""Transform the table data.
Expand Down Expand Up @@ -1250,7 +1244,7 @@ def _reverse_transform(self, table_data):
table_data[self._column_name] = data.round().astype(self._dtype)

else:
table_data[self._column_name] = data.astype(self._dtype)
table_data[self._column_name] = data.astype(self._dtype, errors='ignore')

table_data = table_data.drop(self._transformed_column, axis=1)
return table_data
Expand Down
5 changes: 5 additions & 0 deletions tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,11 @@ def integration(c):
c.run('python -m pytest ./tests/integration --reruns 3')


@task
def benchmark_dtypes(c):
    """Run the supported-data-types benchmark suite through pytest."""
    c.run('python -m pytest ./tests/benchmark/supported_dtypes_benchmark.py')


def _get_minimum_versions(dependencies, python_version):
min_versions = {}
for dependency in dependencies:
Expand Down
1 change: 1 addition & 0 deletions tests/_external/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
"""External utility functions."""
140 changes: 140 additions & 0 deletions tests/_external/gdrive_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,140 @@
"""Google Drive utils."""

import io
import json
import os
import pathlib
import tempfile
from datetime import date

import git
import pandas as pd
import yaml
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive

PYDRIVE_CREDENTIALS = 'PYDRIVE_CREDENTIALS'


def _generate_filename():
    """Generate a filename with today's date and the commit id."""
    head_sha = git.Repo(search_parent_directories=True).head.object.hexsha
    return f'{date.today()}-{head_sha}.xlsx'


def _get_drive_client():
    """Build an authenticated ``GoogleDrive`` client.

    When the ``PYDRIVE_CREDENTIALS`` environment variable is set, its JSON
    payload is written to a temporary settings file and used for the auth
    flow; otherwise an interactive local-webserver OAuth flow is started.
    """
    tmp_credentials = os.getenv(PYDRIVE_CREDENTIALS)
    if not tmp_credentials:
        gauth = GoogleAuth()
        gauth.LocalWebserverAuth()
    else:
        with tempfile.TemporaryDirectory() as tempdir:
            temp_path = pathlib.Path(tempdir)
            credentials_file_path = temp_path / 'credentials.json'
            credentials_file_path.write_text(tmp_credentials)

            credentials = json.loads(tmp_credentials)
            settings = {
                'client_config_backend': 'settings',
                'client_config': {
                    'client_id': credentials['client_id'],
                    'client_secret': credentials['client_secret'],
                },
                'save_credentials': True,
                'save_credentials_backend': 'file',
                'save_credentials_file': str(credentials_file_path),
                'get_refresh_token': True,
            }
            settings_file = temp_path / 'settings.yaml'
            settings_file.write_text(yaml.safe_dump(settings))

            # NOTE(review): the saved credentials file lives inside the
            # TemporaryDirectory, so it is deleted when this block exits and
            # the refresh token is not persisted across calls — confirm
            # this is intended.
            gauth = GoogleAuth(str(settings_file))
            gauth.LocalWebserverAuth()

    return GoogleDrive(gauth)


def get_latest_file(folder_id):
    """Get the latest file from the given Google Drive folder.

    Args:
        folder (str):
            The string Google Drive folder ID.

    Returns:
        The most recently modified file in the folder, or ``None`` when the
        folder is empty.
    """
    query = {
        'q': f"'{folder_id}' in parents and trashed=False",
        'orderBy': 'modifiedDate desc',
        'maxResults': 1,
    }
    file_list = _get_drive_client().ListFile(query).GetList()
    if file_list:
        return file_list[0]


def read_excel(file_id):
    """Read a file as an XLSX from Google Drive.

    Args:
        file_id (str):
            The ID of the file to load.

    Returns:
        pd.DataFrame or dict[pd.DataFrame]:
            A DataFrame containing the body of file if single sheet else dict
            of DataFrames one for each sheet.
    """
    drive_file = _get_drive_client().CreateFile({'id': file_id})
    drive_file.FetchContent(
        mimetype='application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
    )
    return pd.read_excel(drive_file.content, sheet_name=None)


def _set_column_width(writer, results, sheet_name):
for column in results:
column_width = max(results[column].astype(str).map(len).max(), len(column))
col_idx = results.columns.get_loc(column)
writer.sheets[sheet_name].set_column(col_idx, col_idx, column_width + 2)


def save_to_gdrive(output_folder, results, output_filename=None):
    """Save a ``DataFrame`` to google drive folder as ``xlsx`` (spreadsheet).

    Given the output folder id (google drive folder id), store the given
    ``results`` as ``spreadsheet``. If no ``output_filename`` is given, the
    spreadsheet is saved with the current date and commit as its name.

    Args:
        output_folder (str):
            String representing a google drive folder id.
        results (pd.DataFrame or dict[pd.DataFrame]):
            Dataframe to be stored as ``xlsx``, or dictionary mapping sheet
            names to dataframes for storage in one ``xlsx`` file.
        output_filename (str, optional):
            String representing the filename to be used for the results
            spreadsheet. If None, uses the current date and commit as the
            name. Defaults to None.

    Returns:
        str:
            Google drive file id of uploaded file.
    """
    if not output_filename:
        output_filename = _generate_filename()

    # The documented contract accepts a single DataFrame, but ``.items()``
    # requires a mapping — wrap a bare DataFrame under a default sheet name.
    if isinstance(results, pd.DataFrame):
        results = {'results': results}

    output = io.BytesIO()
    with pd.ExcelWriter(output, engine='xlsxwriter') as writer:  # pylint: disable=E0110
        for sheet_name, data in results.items():
            data.to_excel(writer, sheet_name=sheet_name, index=False)
            _set_column_width(writer, data, sheet_name)

    file_config = {'title': output_filename, 'parents': [{'id': output_folder}]}
    drive = _get_drive_client()
    drive_file = drive.CreateFile(file_config)
    drive_file.content = output
    drive_file.Upload({'convert': True})
    return drive_file['id']
Loading

0 comments on commit 2f9c842

Please sign in to comment.