Skip to content

Commit

Permalink
Benchmark Supported Data Types (#2206)
Browse files Browse the repository at this point in the history
  • Loading branch information
pvk-developer authored Sep 25, 2024
1 parent 79961a9 commit 2f9c842
Show file tree
Hide file tree
Showing 17 changed files with 1,497 additions and 14 deletions.
80 changes: 80 additions & 0 deletions .github/workflows/dtypes_benchmark.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
name: Data Types Benchmark

on:
  push:
    branches:
      - main

jobs:
  run_dtypes_benchmark:
    runs-on: ubuntu-latest

    strategy:
      matrix:
        python-version: ['3.8', '3.9', '3.10', '3.11', '3.12']

    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v4
        with:
          python-version: ${{ matrix.python-version }}

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          python -m pip install invoke .[test]

      # Pre-create the per-version results file the benchmark writes into
      - name: Create folder and JSON file
        run: |
          mkdir -p results
          touch results/${{ matrix.python-version }}.json

      # Run the benchmarking
      - name: Benchmark Data Types
        env:
          PYDRIVE_CREDENTIALS: ${{ secrets.PYDRIVE_CREDENTIALS }}
        run: |
          invoke benchmark-dtypes

      # Upload the json files as artifacts
      - name: Upload artifacts
        uses: actions/upload-artifact@v3
        with:
          name: results-${{ matrix.python-version }}
          path: results/*.json

  generate_dtypes_report:
    runs-on: ubuntu-latest
    needs: run_dtypes_benchmark

    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      # Set up Python 3.10
      - name: Set up Python 3.10
        uses: actions/setup-python@v4
        with:
          python-version: '3.10'

      - name: Install dependencies for report
        run: |
          python -m pip install --upgrade pip
          python -m pip install .[test]

      # Download the artifacts from every matrix job into results/
      - name: Download artifacts
        uses: actions/download-artifact@v3
        with:
          path: results/

      # Generate the report
      - name: Generate the report
        env:
          PYDRIVE_CREDENTIALS: ${{ secrets.PYDRIVE_CREDENTIALS }}
          SLACK_TOKEN: ${{ secrets.SLACK_TOKEN }}
        run: python -m tests.benchmark.utils
7 changes: 6 additions & 1 deletion .github/workflows/integration.yml
Original file line number Diff line number Diff line change
Expand Up @@ -29,4 +29,9 @@ jobs:
python -m pip install --upgrade pip
python -m pip install invoke .[test]
- name: Run integration tests
run: invoke integration
env:
PYDRIVE_CREDENTIALS: ${{ secrets.PYDRIVE_CREDENTIALS }}

run: |
invoke integration
invoke benchmark-dtypes
5 changes: 5 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,10 @@ test = [
'rundoc>=0.4.3,<0.5',
'pytest-runner >= 2.11.1',
'tomli>=2.0.0,<3',
'pydrive',
'pyarrow',
'gitpython',
'slack-sdk>=3.23,<4.0',
]
pomegranate = ['pomegranate>=0.14.3,<0.15']
dev = [
Expand Down Expand Up @@ -181,6 +185,7 @@ exclude = [
".tox",
".git",
"__pycache__",
"*.ipynb",
".ipynb_checkpoints",
"tasks.py",
]
Expand Down
9 changes: 9 additions & 0 deletions sdv/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from pathlib import Path

import pandas as pd
from pandas.api.types import is_float, is_integer
from pandas.core.tools.datetimes import _guess_datetime_format_for_array
from rdt.transformers.utils import _GENERATORS

Expand Down Expand Up @@ -440,3 +441,11 @@ def get_possible_chars(regex, num_subpatterns=None):
possible_chars += _get_chars_for_option(option, params)

return possible_chars


def _is_numerical(value):
"""Determine if the input is a numerical type or not."""
try:
return is_integer(value) or is_float(value)
except Exception:
return False
20 changes: 7 additions & 13 deletions sdv/constraints/tabular.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@
import numpy as np
import pandas as pd

from sdv._utils import _convert_to_timedelta, _create_unique_name, _is_datetime_type
from sdv._utils import _convert_to_timedelta, _create_unique_name, _is_datetime_type, _is_numerical
from sdv.constraints.base import Constraint
from sdv.constraints.errors import (
AggregateConstraintsError,
Expand Down Expand Up @@ -604,7 +604,7 @@ def _validate_metadata_specific_to_constraint(metadata, **kwargs):
sdtype = metadata.columns.get(column_name, {}).get('sdtype')
value = kwargs.get('value')
if sdtype == 'numerical':
if not isinstance(value, (int, float)):
if not _is_numerical(value):
raise ConstraintMetadataError("'value' must be an int or float.")

elif sdtype == 'datetime':
Expand Down Expand Up @@ -632,7 +632,7 @@ def _validate_init_inputs(column_name, value, relation):
if relation not in ['>', '>=', '<', '<=']:
raise ValueError('`relation` must be one of the following: `>`, `>=`, `<`, `<=`')

if not (isinstance(value, (int, float)) or value_is_datetime):
if not (_is_numerical(value) or value_is_datetime):
raise ValueError('`value` must be a number or a string that represents a datetime.')

if value_is_datetime and not isinstance(value, str):
Expand Down Expand Up @@ -1071,9 +1071,7 @@ def _validate_init_inputs(low_value, high_value):
if values_are_datetimes and not values_are_strings:
raise ValueError('Datetime must be represented as a string.')

values_are_numerical = bool(
isinstance(low_value, (int, float)) and isinstance(high_value, (int, float))
)
values_are_numerical = bool(_is_numerical(low_value) and _is_numerical(high_value))
if not (values_are_numerical or values_are_datetimes):
raise ValueError(
'``low_value`` and ``high_value`` must be a number or a string that '
Expand All @@ -1092,7 +1090,7 @@ def _validate_metadata_specific_to_constraint(metadata, **kwargs):
high_value = kwargs.get('high_value')
low_value = kwargs.get('low_value')
if sdtype == 'numerical':
if not isinstance(high_value, (int, float)) or not isinstance(low_value, (int, float)):
if not _is_numerical(high_value) or not _is_numerical(low_value):
raise ConstraintMetadataError(
"Both 'high_value' and 'low_value' must be ints or floats"
)
Expand Down Expand Up @@ -1187,11 +1185,7 @@ def is_valid(self, table_data):
self._operator(data, self._high_value),
pd.isna(self._high_value),
)

return np.logical_or(
np.logical_and(satisfy_low_bound, satisfy_high_bound),
pd.isna(data),
)
return (satisfy_low_bound & satisfy_high_bound) | pd.isna(data)

def _transform(self, table_data):
"""Transform the table data.
Expand Down Expand Up @@ -1250,7 +1244,7 @@ def _reverse_transform(self, table_data):
table_data[self._column_name] = data.round().astype(self._dtype)

else:
table_data[self._column_name] = data.astype(self._dtype)
table_data[self._column_name] = data.astype(self._dtype, errors='ignore')

table_data = table_data.drop(self._transformed_column, axis=1)
return table_data
Expand Down
5 changes: 5 additions & 0 deletions tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,11 @@ def integration(c):
c.run('python -m pytest ./tests/integration --reruns 3')


@task
def benchmark_dtypes(c):
    """Run the supported-data-types benchmark suite through pytest."""
    c.run('python -m pytest ./tests/benchmark/supported_dtypes_benchmark.py')


def _get_minimum_versions(dependencies, python_version):
min_versions = {}
for dependency in dependencies:
Expand Down
1 change: 1 addition & 0 deletions tests/_external/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
"""External utility functions."""
140 changes: 140 additions & 0 deletions tests/_external/gdrive_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,140 @@
"""Google Drive utils."""

import io
import json
import os
import pathlib
import tempfile
from datetime import date

import git
import pandas as pd
import yaml
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive

PYDRIVE_CREDENTIALS = 'PYDRIVE_CREDENTIALS'


def _generate_filename():
    """Generate a filename with today's date and the commit id."""
    head_sha = git.Repo(search_parent_directories=True).head.object.hexsha
    return f'{date.today()}-{head_sha}.xlsx'


def _get_drive_client():
    """Build an authenticated ``GoogleDrive`` client.

    When the ``PYDRIVE_CREDENTIALS`` environment variable is set, its JSON
    payload is written to a temporary settings file and used for the auth
    flow; otherwise an interactive local-webserver OAuth flow is started.
    """
    tmp_credentials = os.getenv(PYDRIVE_CREDENTIALS)
    if not tmp_credentials:
        gauth = GoogleAuth()
        gauth.LocalWebserverAuth()
    else:
        with tempfile.TemporaryDirectory() as tempdir:
            temp_path = pathlib.Path(tempdir)
            credentials_file_path = temp_path / 'credentials.json'
            credentials_file_path.write_text(tmp_credentials)

            credentials = json.loads(tmp_credentials)
            settings = {
                'client_config_backend': 'settings',
                'client_config': {
                    'client_id': credentials['client_id'],
                    'client_secret': credentials['client_secret'],
                },
                'save_credentials': True,
                'save_credentials_backend': 'file',
                'save_credentials_file': str(credentials_file_path),
                'get_refresh_token': True,
            }
            settings_file = temp_path / 'settings.yaml'
            settings_file.write_text(yaml.safe_dump(settings))

            # NOTE(review): the saved credentials file lives inside the
            # TemporaryDirectory, so it is deleted when this block exits and
            # the refresh token is not persisted across calls — confirm
            # this is intended.
            gauth = GoogleAuth(str(settings_file))
            gauth.LocalWebserverAuth()

    return GoogleDrive(gauth)


def get_latest_file(folder_id):
    """Get the latest file from the given Google Drive folder.

    Args:
        folder (str):
            The string Google Drive folder ID.

    Returns:
        The most recently modified file in the folder, or ``None`` when the
        folder is empty.
    """
    query = {
        'q': f"'{folder_id}' in parents and trashed=False",
        'orderBy': 'modifiedDate desc',
        'maxResults': 1,
    }
    file_list = _get_drive_client().ListFile(query).GetList()
    if file_list:
        return file_list[0]


def read_excel(file_id):
    """Read a file as an XLSX from Google Drive.

    Args:
        file_id (str):
            The ID of the file to load.

    Returns:
        pd.DataFrame or dict[pd.DataFrame]:
            A DataFrame containing the body of file if single sheet else dict
            of DataFrames one for each sheet.
    """
    drive_file = _get_drive_client().CreateFile({'id': file_id})
    drive_file.FetchContent(
        mimetype='application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
    )
    return pd.read_excel(drive_file.content, sheet_name=None)


def _set_column_width(writer, results, sheet_name):
for column in results:
column_width = max(results[column].astype(str).map(len).max(), len(column))
col_idx = results.columns.get_loc(column)
writer.sheets[sheet_name].set_column(col_idx, col_idx, column_width + 2)


def save_to_gdrive(output_folder, results, output_filename=None):
    """Save a ``DataFrame`` to google drive folder as ``xlsx`` (spreadsheet).

    Given the output folder id (google drive folder id), store the given
    ``results`` as ``spreadsheet``. If no ``output_filename`` is given, the
    spreadsheet is saved with the current date and commit as its name.

    Args:
        output_folder (str):
            String representing a google drive folder id.
        results (pd.DataFrame or dict[pd.DataFrame]):
            Dataframe to be stored as ``xlsx``, or dictionary mapping sheet
            names to dataframes for storage in one ``xlsx`` file.
        output_filename (str, optional):
            String representing the filename to be used for the results
            spreadsheet. If None, uses the current date and commit as the
            name. Defaults to None.

    Returns:
        str:
            Google drive file id of uploaded file.
    """
    if not output_filename:
        output_filename = _generate_filename()

    # The documented contract accepts a single DataFrame, but ``.items()``
    # requires a mapping — wrap a bare DataFrame under a default sheet name.
    if isinstance(results, pd.DataFrame):
        results = {'results': results}

    output = io.BytesIO()
    with pd.ExcelWriter(output, engine='xlsxwriter') as writer:  # pylint: disable=E0110
        for sheet_name, data in results.items():
            data.to_excel(writer, sheet_name=sheet_name, index=False)
            _set_column_width(writer, data, sheet_name)

    file_config = {'title': output_filename, 'parents': [{'id': output_folder}]}
    drive = _get_drive_client()
    drive_file = drive.CreateFile(file_config)
    drive_file.content = output
    drive_file.Upload({'convert': True})
    return drive_file['id']
Loading

0 comments on commit 2f9c842

Please sign in to comment.