
Commit

resolve conflicts
Hanna Imshenetska authored and Hanna Imshenetska committed Oct 23, 2024
2 parents 65280be + ac1fd9c commit dae6045
Showing 12 changed files with 242 additions and 20 deletions.
66 changes: 66 additions & 0 deletions .github/workflows/databricks-test.yml
@@ -0,0 +1,66 @@
name: Databricks-compatibility
on: [push]

jobs:
  Databricks-compatibility-test:
    runs-on: ubuntu-22.04
    steps:
      - uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.11'

      - name: Display Python version
        run: python -c "import sys; print(sys.version)"

      - name: Install dependencies
        run: |
          cp -n databricks/requirements-databricks-15.4-LTS.txt .
          python -m pip install --upgrade pip
          pip install flake8 pytest
          if [ -f requirements-databricks-15.4-LTS.txt ]; then pip install -r requirements-databricks-15.4-LTS.txt; fi
      - name: Lint with flake8
        run: |
          # stop the build if there are Python syntax errors or undefined names
          flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
          # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
          flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
      - name: Test with pytest
        run: |
          export PYTHONPATH=$PYTHONPATH:${{ github.workspace }}/src
          pytest src/tests
      - name: Build test Docker image
        run: |
          cp -n databricks/databricks.dockerfile .
          docker build -t databricks-test-image -f databricks.dockerfile .
  build-and-publish:
    needs: Databricks-compatibility-test
    runs-on: ubuntu-22.04
    steps:
      - uses: actions/checkout@v4
      - name: Set up Python 3.11
        uses: actions/setup-python@v5
        with:
          python-version: "3.11"
      - name: Install build dependencies
        run: |
          cp -n databricks/setup.cfg .
          cp -n databricks/pyproject.toml .
          sed -i 's/$/rc1+dbx/' src/syngen/VERSION
          cat src/syngen/VERSION
          python -m pip install --upgrade pip
          pip install build
      - name: Build Package
        run: python -m build .
      - name: Publish package
        if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags')
        uses: pypa/gh-action-pypi-publish@release/v1
        with:
          password: ${{ secrets.PYPI_TEST_TOKEN }} # For release: use secrets.PYPI_TOKEN
          verbose: true
3 changes: 3 additions & 0 deletions databricks/README
@@ -0,0 +1,3 @@
# These files relate to the Databricks-compatible library

For testing purposes only
23 changes: 23 additions & 0 deletions databricks/databricks.dockerfile
@@ -0,0 +1,23 @@
# syntax=docker/dockerfile:1

# Build the initial docker image:
FROM databricksruntime/standard:15.4-LTS AS builder

# Set a build argument to select the proper pip install command, since an rc version of syngen is present in the requirements file
# For local tests, use the following parameter to pass build argument:
# --build-arg PIP_INSTALL_CMD="pip install --index-url https://test.pypi.org/simple/ --extra-index-url https://pypi.org/simple/ --use-pep517 --no-cache-dir -r requirements.txt"

ARG PIP_INSTALL_CMD="pip install --use-pep517 --no-cache-dir -r requirements-databricks-15.4-LTS.txt"

# Minimize the number of RUN commands and clean up cache and temporary files
RUN apt-get update && \
apt-get install -y gcc g++ ccache build-essential curl && \
apt-get autoremove -y && \
apt-get clean && \
rm -rf /var/lib/{apt,dpkg,cache,log}
COPY src /src
COPY requirements-databricks-15.4-LTS.txt /requirements-databricks-15.4-LTS.txt
RUN /databricks/python3/bin/${PIP_INSTALL_CMD}
ENV MPLCONFIGDIR=/tmp
ENV PYTHONPATH="${PYTHONPATH}:/src"
WORKDIR /src
3 changes: 3 additions & 0 deletions databricks/pyproject.toml
@@ -0,0 +1,3 @@
[build-system]
requires = ["setuptools>=68.0.0", "wheel"]
build-backend = "setuptools.build_meta"
37 changes: 37 additions & 0 deletions databricks/requirements-databricks-15.4-LTS.txt
@@ -0,0 +1,37 @@
aiohttp>=3.9.0
attrs
avro
base32-crockford
boto3
category_encoders==2.6.3
click
Jinja2
keras==2.15.*
lazy==1.4
loguru
MarkupSafe==2.1.1
marshmallow==3.19.*
matplotlib==3.7.*
mlflow-skinny==2.11.*
numpy==1.23.*
openpyxl
pandas==2.2.*
pandavro==1.8.*
pathos==0.2.*
pillow==9.4.*
psutil
py-ulid
pytest
pytest-reportportal
python-slugify[unidecode]>=7.0.0
PyYAML==6.*
reportportal-client
scikit_learn==1.3.*
scipy==1.11.*
seaborn==0.12.*
setuptools==68.*
tensorflow==2.15.*
tqdm==4.66.3
Werkzeug==3.0.3
xlrd
xlwt
82 changes: 82 additions & 0 deletions databricks/setup.cfg
@@ -0,0 +1,82 @@
[metadata]
name = syngen
version = file: src/syngen/VERSION
description = file: DESCRIPTION
long_description = file: README.md
long_description_content_type = text/markdown
url = https://github.com/tdspora/syngen
author = EPAM Systems, Inc.
maintainer = Pavel Bobyrev
license = GPLv3 License
keywords = data, generation, synthetic, vae, tabular
classifiers =
    Development Status :: 5 - Production/Stable
    Operating System :: POSIX :: Linux
    Operating System :: Microsoft :: Windows
    License :: OSI Approved :: GNU General Public License v3 (GPLv3)
    Programming Language :: Python :: 3.10
    Programming Language :: Python :: 3.11


[options]
package_dir =
    = src
packages = find:
include_package_data = True
python_requires = >3.10, <3.12
install_requires =
    aiohttp>=3.9.0
    attrs
    avro
    base32-crockford
    boto3
    category_encoders==2.6.3
    click
    Jinja2
    keras==2.15.*
    lazy==1.4
    loguru
    MarkupSafe==2.1.1
    marshmallow==3.19.*
    matplotlib==3.7.*
    mlflow-skinny==2.11.*
    numpy==1.23.*
    openpyxl
    pandas==2.2.*
    pandavro==1.8.*
    pathos==0.2.*
    pillow==9.4.*
    psutil
    py-ulid
    pytest
    pytest-reportportal
    python-slugify[unidecode]>=7.0.0
    PyYAML==6.*
    reportportal-client
    scikit_learn==1.3.*
    scipy==1.11.*
    seaborn==0.12.*
    setuptools==68.*
    tensorflow==2.15.*
    tqdm==4.66.3
    Werkzeug==3.0.3
    xlrd
    xlwt


[options.extras_require]
ui =
    streamlit==1.31.*
    streamlit_option_menu


[options.packages.find]
where = src
[options.package_data]
* = *.py, *.html, *.ttf, *.svg, *.css, *.js

[options.entry_points]
console_scripts =
    train = syngen.train:launch_train
    infer = syngen.infer:launch_infer
    syngen = syngen:main
2 changes: 1 addition & 1 deletion src/syngen/VERSION
@@ -1 +1 @@
0.9.46rc10
0.9.46rc11
1 change: 1 addition & 0 deletions src/syngen/ml/data_loaders/data_loaders.py
@@ -318,6 +318,7 @@ def _save_data(self, df: pd.DataFrame, schema: Optional[Dict]):

def save_data(self, df: pd.DataFrame, schema: Optional[Dict] = None, **kwargs):
if schema is not None:
logger.trace(f"The data will be saved with the schema: {schema}")
preprocessed_schema = (
self._get_preprocessed_schema(schema) if schema is not None else schema
)
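Note: loguru's TRACE level sits below DEBUG and is filtered out by the default sink, so the new logger.trace call only produces output when a sink accepting TRACE is configured. A minimal sketch, assuming a plain loguru setup (the schema dict below is illustrative, not syngen's actual schema format):

    import sys
    from loguru import logger

    # The default sink logs DEBUG and above; re-add one that accepts TRACE.
    logger.remove()
    logger.add(sys.stderr, level="TRACE")

    schema = {"format": "Avro", "fields": {"id": "int"}}  # hypothetical schema payload
    logger.trace(f"The data will be saved with the schema: {schema}")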
5 changes: 0 additions & 5 deletions src/syngen/ml/handlers/handlers.py
@@ -501,11 +501,6 @@ def handle(self, **kwargs):
)
generated_data = generated_data[self.dataset.order_of_columns]

if self.original_schema:
logger.trace(
f"The synthetic data of the table - '{self.table_name}' "
f"will be saved with the schema: {self.original_schema}"
)
if generated_data is None:
DataLoader(self.paths["path_to_merged_infer"]).save_data(
prepared_data,
5 changes: 4 additions & 1 deletion src/syngen/ml/metrics/accuracy_test/accuracy_test.py
@@ -114,9 +114,11 @@ def __init__(
paths: dict,
table_name: str,
infer_config: Dict,
columns_nan_labels: Dict,
):
super().__init__(original, synthetic, paths, table_name, infer_config)
self.reports_path = f"{self.paths['reports_path']}/accuracy"
self.columns_nan_labels = columns_nan_labels
self.univariate = UnivariateMetric(
self.original,
self.synthetic,
@@ -127,7 +129,8 @@ def __init__(
self.original,
self.synthetic,
self.plot_exists,
self.reports_path
self.reports_path,
self.columns_nan_labels
)
self.correlations = Correlations(
self.original,
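The new columns_nan_labels argument threads the per-column NaN labels from the reporter down to the heatmap tick formatting. A hypothetical example of the mapping shape the plotting code expects (column name to missing-value label; the real mapping comes from syngen's configuration, and the column names below are invented):

    from typing import Dict

    # Hypothetical mapping from column name to its missing-value label.
    columns_nan_labels: Dict[str, str] = {
        "income": "missing",
        "signup_date": "unknown",
    }

    # Columns without an entry fall back to the literal "NaN",
    # mirroring columns_nan_labels.get(feature, 'NaN') in metrics.py.
    print(columns_nan_labels.get("age", "NaN"))  # -> NaN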
34 changes: 21 additions & 13 deletions src/syngen/ml/metrics/metrics_classes/metrics.py
@@ -338,9 +338,11 @@ def __init__(
synthetic: pd.DataFrame,
plot: bool,
reports_path: str,
columns_nan_labels: Dict,
):
super().__init__(original, synthetic, plot, reports_path)
self.cmap = LinearSegmentedColormap.from_list("rg", ["#0D5598", "#3E92E0", "#E8F4FF"])
self.columns_nan_labels = columns_nan_labels

@staticmethod
def _format_date_labels(heatmap_orig_data, heatmap_synthetic_data, axis):
@@ -442,9 +444,15 @@ def calculate_all(
heatmap_orig_data, heatmap_synthetic_data, "y"
)

self._plot_heatmap(heatmap_orig_data, 0, heatmap_min, heatmap_max, cbar=False)
self._plot_heatmap(heatmap_orig_data, plt_index=0,
vrange=(heatmap_min, heatmap_max),
features=(first_col, second_col),
cbar=False)

self._plot_heatmap(heatmap_synthetic_data, 1, heatmap_min, heatmap_max, cbar=True)
self._plot_heatmap(heatmap_synthetic_data, plt_index=1,
vrange=(heatmap_min, heatmap_max),
features=(first_col, second_col),
cbar=True)
# first_col is x axis, second_col is y axis
title = f"{first_col} vs. {second_col}"
path_to_image = (
@@ -463,31 +471,31 @@ def get_common_min_max(original, synthetic):
return vmin, vmax

@staticmethod
def __format_float_tick_labels(labels: List) -> List:
def __format_float_tick_labels(labels: List, nan_label: str = 'NaN') -> List:
labels = [nan_label if pd.isna(l) else l for l in labels]
if all([isinstance(i, float) for i in labels]) and (
max(labels) > 1e5 or min(labels) < 1e-03
):
labels = [f"{label:.4e}" for label in labels]
return labels
return [f"{label:.4e}" for label in labels]
if all([isinstance(i, float) for i in labels]):
labels = [f"{round(i, 4)}" for i in labels]
return labels
else:
return labels
return [f"{round(i, 4)}" for i in labels]
return labels

def _plot_heatmap(
self,
heatmap_data: List,
plt_index: int,
vmin: float,
vmax: float,
vrange: tuple[float],
features: tuple[str],
cbar=True,
):
vmin, vmax = vrange
xfeature, yfeature = features
ax = self._axes.flat[plt_index]
ax.tick_params(labelsize=14)
heatmap, x_tick_labels, y_tick_labels = heatmap_data
x_tick_labels = self.__format_float_tick_labels(x_tick_labels)
y_tick_labels = self.__format_float_tick_labels(y_tick_labels)
x_tick_labels = self.__format_float_tick_labels(x_tick_labels, self.columns_nan_labels.get(xfeature, 'NaN'))
y_tick_labels = self.__format_float_tick_labels(y_tick_labels, self.columns_nan_labels.get(yfeature, 'NaN'))
ax = sns.heatmap(
heatmap,
xticklabels=x_tick_labels,
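For reference, the updated tick-label formatting can be reproduced as a standalone function. This is a sketch mirroring the diff above (restructured with nested conditions, not the module itself), useful for checking the NaN-label substitution and float rounding behaviour:

    from typing import List

    import pandas as pd


    def format_float_tick_labels(labels: List, nan_label: str = "NaN") -> List:
        # Replace missing values with the column's NaN label first.
        labels = [nan_label if pd.isna(label) else label for label in labels]
        # Purely float labels get scientific notation at extreme magnitudes,
        # otherwise they are rounded to four decimals; mixed labels pass through.
        if all(isinstance(label, float) for label in labels):
            if max(labels) > 1e5 or min(labels) < 1e-03:
                return [f"{label:.4e}" for label in labels]
            return [f"{round(label, 4)}" for label in labels]
        return labels


    print(format_float_tick_labels([0.123456, 250000.0]))            # scientific notation
    print(format_float_tick_labels([0.123456, 2.5]))                 # rounded to 4 decimals
    print(format_float_tick_labels([1.5, float("nan")], "missing"))  # NaN label substituted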
1 change: 1 addition & 0 deletions src/syngen/ml/reporters/reporters.py
@@ -288,6 +288,7 @@ def report(self):
self.paths,
self.table_name,
self.config,
self.columns_nan_labels,
)
accuracy_test.report(
cont_columns=list(float_columns | int_columns),
