-
Notifications
You must be signed in to change notification settings - Fork 317
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Benchmark Supported Data Types (#2206)
- Loading branch information
1 parent
79961a9
commit 2f9c842
Showing
17 changed files
with
1,497 additions
and
14 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,80 @@ | ||
# Benchmarks the supported data types on every push to main, once per Python
# version, then aggregates the per-version JSON results into a single report.
name: Data Types Benchmark

on:
  push:
    branches:
      - main

jobs:
  run_dtypes_benchmark:
    runs-on: ubuntu-latest

    strategy:
      matrix:
        python-version: ['3.8', '3.9', '3.10', '3.11', '3.12']

    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          python -m pip install invoke .[test]

      - name: Create folder and JSON file
        run: |
          mkdir -p results
          touch results/${{ matrix.python-version }}.json

      # Run the benchmarking
      - name: Benchmark Data Types
        env:
          PYDRIVE_CREDENTIALS: ${{ secrets.PYDRIVE_CREDENTIALS }}
        run: |
          invoke benchmark-dtypes

      # Upload the json files as artifacts.
      # upload-artifact@v4 requires a unique artifact name per job, which the
      # matrix python-version suffix already guarantees.
      - name: Upload artifacts
        uses: actions/upload-artifact@v4
        with:
          name: results-${{ matrix.python-version }}
          path: results/*.json

  generate_dtypes_report:
    runs-on: ubuntu-latest
    needs: run_dtypes_benchmark

    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      # Set up Python 3.10
      - name: Set up Python 3.10
        uses: actions/setup-python@v5
        with:
          python-version: '3.10'

      - name: Install dependencies for report
        run: |
          python -m pip install --upgrade pip
          python -m pip install .[test]

      # Download the artifacts.
      # With no `name` given, every artifact of the run is downloaded into its
      # own sub-directory of `results/` (named after the artifact).
      - name: Download artifacts
        uses: actions/download-artifact@v4
        with:
          path: results/

      # Generate the report
      - name: Generate the report
        env:
          PYDRIVE_CREDENTIALS: ${{ secrets.PYDRIVE_CREDENTIALS }}
          SLACK_TOKEN: ${{ secrets.SLACK_TOKEN }}
        run: python -m tests.benchmark.utils
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
"""External utility functions.""" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,140 @@ | ||
"""Google Drive utils.""" | ||
|
||
import io | ||
import json | ||
import os | ||
import pathlib | ||
import tempfile | ||
from datetime import date | ||
|
||
import git | ||
import pandas as pd | ||
import yaml | ||
from pydrive.auth import GoogleAuth | ||
from pydrive.drive import GoogleDrive | ||
|
||
PYDRIVE_CREDENTIALS = 'PYDRIVE_CREDENTIALS' | ||
|
||
|
||
def _generate_filename():
    """Build the default spreadsheet name: ``<today>-<HEAD commit sha>.xlsx``.

    The commit hash is read from the git repository that GitPython locates with
    ``search_parent_directories=True``; the date is today's date in ISO format.

    Returns:
        str:
            The generated file name, including the ``.xlsx`` extension.
    """
    head_sha = git.Repo(search_parent_directories=True).head.object.hexsha
    return f'{date.today()}-{head_sha}.xlsx'
|
||
|
||
def _get_drive_client():
    """Authenticate with PyDrive and return a ``GoogleDrive`` client.

    If the environment variable named by ``PYDRIVE_CREDENTIALS`` is unset or empty,
    ``GoogleAuth`` is created with its default configuration. Otherwise the variable's
    value is expected to be a JSON document containing at least ``client_id`` and
    ``client_secret``; it is written to a temporary ``credentials.json`` and a matching
    temporary ``settings.yaml`` is generated for ``GoogleAuth`` to load.

    Returns:
        GoogleDrive:
            A client built from the authenticated ``GoogleAuth`` object.
    """
    raw_credentials = os.getenv(PYDRIVE_CREDENTIALS)
    if not raw_credentials:
        auth = GoogleAuth()
        auth.LocalWebserverAuth()
        return GoogleDrive(auth)

    with tempfile.TemporaryDirectory() as workdir:
        tmp_root = pathlib.Path(workdir)

        # Persist the raw credentials so PyDrive can use them as its saved-credentials file.
        saved_credentials = tmp_root / 'credentials.json'
        saved_credentials.write_text(raw_credentials)

        parsed = json.loads(raw_credentials)
        client_config = {
            'client_id': parsed['client_id'],
            'client_secret': parsed['client_secret'],
        }
        pydrive_settings = {
            'client_config_backend': 'settings',
            'client_config': client_config,
            'save_credentials': True,
            'save_credentials_backend': 'file',
            'save_credentials_file': str(saved_credentials),
            'get_refresh_token': True,
        }
        settings_path = tmp_root / 'settings.yaml'
        settings_path.write_text(yaml.safe_dump(pydrive_settings))

        # Authentication has to happen while the temporary files still exist.
        auth = GoogleAuth(str(settings_path))
        auth.LocalWebserverAuth()

    return GoogleDrive(auth)
|
||
|
||
def get_latest_file(folder_id):
    """Return the most recently modified, non-trashed file of a Google Drive folder.

    Args:
        folder_id (str):
            The string Google Drive folder ID.

    Returns:
        The first entry of the folder listing ordered by ``modifiedDate`` descending,
        or ``None`` when the listing is empty.
    """
    client = _get_drive_client()
    listing_params = {
        'q': f"'{folder_id}' in parents and trashed=False",
        'orderBy': 'modifiedDate desc',
        'maxResults': 1,
    }
    matches = client.ListFile(listing_params).GetList()
    if len(matches) == 0:
        return None

    return matches[0]
|
||
|
||
def read_excel(file_id):
    """Fetch a Google Drive file as XLSX and parse all of its sheets.

    Args:
        file_id (str):
            The ID of the file to load.

    Returns:
        dict[str, pd.DataFrame]:
            The result of ``pd.read_excel(..., sheet_name=None)``, which pandas documents
            as a dictionary mapping each sheet name to its DataFrame.
    """
    remote_file = _get_drive_client().CreateFile({'id': file_id})
    # Request the content with the XLSX MIME type.
    remote_file.FetchContent(
        mimetype='application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
    )
    return pd.read_excel(remote_file.content, sheet_name=None)
|
||
|
||
def _set_column_width(writer, results, sheet_name):
    """Size every worksheet column to fit its header and its longest cell.

    The width of each column is the larger of the header length and the longest
    string representation among its cells, plus 2 characters of padding.

    Args:
        writer:
            Excel writer exposing a ``sheets`` mapping whose values provide
            ``set_column(first_col, last_col, width)``.
        results (pd.DataFrame):
            The data that was written to the sheet.
        sheet_name (str):
            Name of the sheet whose columns are resized.
    """
    # Iterate by position so duplicated column labels are still handled one by one.
    for col_idx, column in enumerate(results.columns):
        cell_widths = results.iloc[:, col_idx].astype(str).map(len)
        # ``max()`` of an empty Series is NaN, which would turn into a NaN width.
        content_width = 0 if cell_widths.empty else int(cell_widths.max())
        # Column labels are not necessarily strings (e.g. integer labels).
        header_width = len(str(column))
        column_width = max(content_width, header_width)
        writer.sheets[sheet_name].set_column(col_idx, col_idx, column_width + 2)
|
||
|
||
def save_to_gdrive(output_folder, results, output_filename=None):
    """Save a ``DataFrame`` to a Google Drive folder as ``xlsx`` (spreadsheet).

    Given the output folder id (Google Drive folder id), store the given ``results`` as a
    spreadsheet. If no ``output_filename`` is given, the spreadsheet is saved with the
    current date and commit as its name.

    Args:
        output_folder (str):
            String representing a Google Drive folder id.
        results (pd.DataFrame or dict[str, pd.DataFrame]):
            Dataframe to be stored as ``xlsx``, or dictionary mapping sheet names to
            dataframes for storage in one ``xlsx`` file. A single dataframe is written to
            a sheet named ``Sheet1``.
        output_filename (str, optional):
            String representing the filename to be used for the results spreadsheet. If
            None, uses the current date and commit as the name. Defaults to None.

    Returns:
        str:
            Google Drive file id of the uploaded file.
    """
    if not output_filename:
        output_filename = _generate_filename()

    # ``DataFrame.items()`` iterates over columns, not sheets, so a bare DataFrame must be
    # wrapped into a one-sheet mapping before the loop below.
    if isinstance(results, pd.DataFrame):
        results = {'Sheet1': results}

    output = io.BytesIO()

    with pd.ExcelWriter(output, engine='xlsxwriter') as writer:  # pylint: disable=E0110
        for sheet_name, data in results.items():
            data.to_excel(writer, sheet_name=sheet_name, index=False)
            _set_column_width(writer, data, sheet_name)

    file_config = {'title': output_filename, 'parents': [{'id': output_folder}]}
    drive = _get_drive_client()
    drive_file = drive.CreateFile(file_config)
    # The in-memory workbook is handed to PyDrive as the file content.
    drive_file.content = output
    # NOTE(review): ``convert`` presumably asks Drive to convert the upload to a native
    # Google spreadsheet - confirm against the Drive API parameters.
    drive_file.Upload({'convert': True})
    return drive_file['id']
Oops, something went wrong.