diff --git a/.github/workflows/databricks-test.yml b/.github/workflows/databricks-test.yml new file mode 100644 index 00000000..238d8111 --- /dev/null +++ b/.github/workflows/databricks-test.yml @@ -0,0 +1,66 @@ +name: Databricks-compatibility +on: [push] + +jobs: + Databricks-compatibility-test: + runs-on: ubuntu-22.04 + steps: + - uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.11' + + - name: Display Python version + run: python -c "import sys; print(sys.version)" + + - name: Install dependencies + run: | + cp -n databricks/requirements-databricks-15.4-LTS.txt . + python -m pip install --upgrade pip + pip install flake8 pytest + if [ -f requirements-databricks-15.4-LTS.txt ]; then pip install -r requirements-databricks-15.4-LTS.txt; fi + + - name: Lint with flake8 + run: | + # stop the build if there are Python syntax errors or undefined names + flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics + # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide + flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics + + - name: Test with pytest + run: | + export PYTHONPATH=$PYTHONPATH:${{ github.workspace }}/src + pytest src/tests + + - name: Build test Docker image + run: | + cp -n databricks/databricks.dockerfile . + docker build -t databricks-test-image -f databricks.dockerfile . + + build-and-publish: + needs: Databricks-compatibility-test + runs-on: ubuntu-22.04 + steps: + - uses: actions/checkout@v4 + - name: Set up Python 3.11 + uses: actions/setup-python@v5 + with: + python-version: "3.11" + - name: Install build dependencies + run: | + cp -n databricks/setup.cfg . + cp -n databricks/pyproject.toml . + sed -i 's/$/rc1+dbx/' src/syngen/VERSION + cat src/syngen/VERSION + python -m pip install --upgrade pip + pip install build + - name: Build Package + run: python -m build . 
+ - name: Publish package + if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags') + uses: pypa/gh-action-pypi-publish@release/v1 + with: + password: ${{ secrets.PYPI_TEST_TOKEN }} # For release: use secrets.PYPI_TOKEN + verbose: true diff --git a/databricks/README b/databricks/README new file mode 100644 index 00000000..75cebae4 --- /dev/null +++ b/databricks/README @@ -0,0 +1,3 @@ +# These files are related to the Databricks-compatible library + +For testing purposes only diff --git a/databricks/databricks.dockerfile b/databricks/databricks.dockerfile new file mode 100644 index 00000000..fbb7981d --- /dev/null +++ b/databricks/databricks.dockerfile @@ -0,0 +1,23 @@ +# syntax=docker/dockerfile:1 + +# Build the initial docker image: + FROM databricksruntime/standard:15.4-LTS AS builder + + # Set arguments to handle proper pip install command due to syngen rc version present in requirements file + # For local tests, use the following parameter to pass build argument: + # --build-arg PIP_INSTALL_CMD="pip install --index-url https://test.pypi.org/simple/ --extra-index-url https://pypi.org/simple/ --use-pep517 --no-cache-dir -r requirements.txt" + + ARG PIP_INSTALL_CMD="pip install --use-pep517 --no-cache-dir -r requirements-databricks-15.4-LTS.txt" + + # Minimize the number of RUN commands and clean up cache and temporary files + RUN apt-get update && \ + apt-get install -y gcc g++ ccache build-essential curl && \ + apt-get autoremove -y && \ + apt-get clean && \ + rm -rf /var/lib/{apt,dpkg,cache,log} + COPY src /src + COPY requirements-databricks-15.4-LTS.txt /requirements-databricks-15.4-LTS.txt + RUN /databricks/python3/bin/${PIP_INSTALL_CMD} + ENV MPLCONFIGDIR=/tmp + ENV PYTHONPATH="${PYTHONPATH}:/src" + WORKDIR /src diff --git a/databricks/pyproject.toml b/databricks/pyproject.toml new file mode 100644 index 00000000..6433992c --- /dev/null +++ b/databricks/pyproject.toml @@ -0,0 +1,3 @@ +[build-system] +requires = ["setuptools>=68.0.0", "wheel"] 
+build-backend = "setuptools.build_meta" \ No newline at end of file diff --git a/databricks/requirements-databricks-15.4-LTS.txt b/databricks/requirements-databricks-15.4-LTS.txt new file mode 100644 index 00000000..ecae5982 --- /dev/null +++ b/databricks/requirements-databricks-15.4-LTS.txt @@ -0,0 +1,37 @@ +aiohttp>=3.9.0 +attrs +avro +base32-crockford +boto3 +category_encoders==2.6.3 +click +Jinja2 +keras==2.15.* +lazy==1.4 +loguru +MarkupSafe==2.1.1 +marshmallow==3.19.* +matplotlib==3.7.* +mlflow-skinny==2.11.* +numpy==1.23.* +openpyxl +pandas==2.2.* +pandavro==1.8.* +pathos==0.2.* +pillow==9.4.* +psutil +py-ulid +pytest +pytest-reportportal +python-slugify[unidecode]>=7.0.0 +PyYAML==6.* +reportportal-client +scikit_learn==1.3.* +scipy==1.11.* +seaborn==0.12.* +setuptools==68.* +tensorflow==2.15.* +tqdm==4.66.3 +Werkzeug==3.0.3 +xlrd +xlwt \ No newline at end of file diff --git a/databricks/setup.cfg b/databricks/setup.cfg new file mode 100644 index 00000000..31485683 --- /dev/null +++ b/databricks/setup.cfg @@ -0,0 +1,82 @@ +[metadata] +name = syngen +version = file: src/syngen/VERSION +description = file: DESCRIPTION +long_description = file: README.md +long_description_content_type = text/markdown +url = https://github.com/tdspora/syngen +author = EPAM Systems, Inc. 
+maintainer = Pavel Bobyrev +license = GPLv3 License +keywords = data, generation, synthetic, vae, tabular +classifiers = + Development Status :: 5 - Production/Stable + Operating System :: POSIX :: Linux + Operating System :: Microsoft :: Windows + License :: OSI Approved :: GNU General Public License v3 (GPLv3) + Programming Language :: Python :: 3.10 + Programming Language :: Python :: 3.11 + + +[options] +package_dir = + = src +packages = find: +include_package_data = True +python_requires = >3.10, <3.12 +install_requires = + aiohttp>=3.9.0 + attrs + avro + base32-crockford + boto3 + category_encoders==2.6.3 + click + Jinja2 + keras==2.15.* + lazy==1.4 + loguru + MarkupSafe==2.1.1 + marshmallow==3.19.* + matplotlib==3.7.* + mlflow-skinny==2.11.* + numpy==1.23.* + openpyxl + pandas==2.2.* + pandavro==1.8.* + pathos==0.2.* + pillow==9.4.* + psutil + py-ulid + pytest + pytest-reportportal + python-slugify[unidecode]>=7.0.0 + PyYAML==6.* + reportportal-client + scikit_learn==1.3.* + scipy==1.11.* + seaborn==0.12.* + setuptools==68.* + tensorflow==2.15.* + tqdm==4.66.3 + Werkzeug==3.0.3 + xlrd + xlwt + + +[options.extras_require] +ui = + streamlit==1.31.* + streamlit_option_menu + + +[options.packages.find] +where = src +[options.package_data] +* = *.py, *.html, *.ttf, *.svg, *.css, *.js + +[options.entry_points] +console_scripts = + train = syngen.train:launch_train + infer = syngen.infer:launch_infer + syngen = syngen:main \ No newline at end of file diff --git a/src/syngen/VERSION b/src/syngen/VERSION index ae6b7d64..5b3e7b1e 100644 --- a/src/syngen/VERSION +++ b/src/syngen/VERSION @@ -1 +1 @@ -0.9.46rc10 +0.9.46rc11 diff --git a/src/syngen/ml/data_loaders/data_loaders.py b/src/syngen/ml/data_loaders/data_loaders.py index 6b40c41e..d377029b 100644 --- a/src/syngen/ml/data_loaders/data_loaders.py +++ b/src/syngen/ml/data_loaders/data_loaders.py @@ -318,6 +318,7 @@ def _save_data(self, df: pd.DataFrame, schema: Optional[Dict]): def save_data(self, df: pd.DataFrame, 
schema: Optional[Dict] = None, **kwargs): if schema is not None: + logger.trace(f"The data will be saved with the schema: {schema}") preprocessed_schema = ( self._get_preprocessed_schema(schema) if schema is not None else schema ) diff --git a/src/syngen/ml/handlers/handlers.py b/src/syngen/ml/handlers/handlers.py index 890d3bf2..69e315b4 100644 --- a/src/syngen/ml/handlers/handlers.py +++ b/src/syngen/ml/handlers/handlers.py @@ -501,11 +501,6 @@ def handle(self, **kwargs): ) generated_data = generated_data[self.dataset.order_of_columns] - if self.original_schema: - logger.trace( - f"The synthetic data of the table - '{self.table_name}' " - f"will be saved with the schema: {self.original_schema}" - ) if generated_data is None: DataLoader(self.paths["path_to_merged_infer"]).save_data( prepared_data, diff --git a/src/syngen/ml/metrics/accuracy_test/accuracy_test.py b/src/syngen/ml/metrics/accuracy_test/accuracy_test.py index 9ecc8270..b92e5ae2 100644 --- a/src/syngen/ml/metrics/accuracy_test/accuracy_test.py +++ b/src/syngen/ml/metrics/accuracy_test/accuracy_test.py @@ -114,9 +114,11 @@ def __init__( paths: dict, table_name: str, infer_config: Dict, + columns_nan_labels: Dict, ): super().__init__(original, synthetic, paths, table_name, infer_config) self.reports_path = f"{self.paths['reports_path']}/accuracy" + self.columns_nan_labels = columns_nan_labels self.univariate = UnivariateMetric( self.original, self.synthetic, @@ -127,7 +129,8 @@ def __init__( self.original, self.synthetic, self.plot_exists, - self.reports_path + self.reports_path, + self.columns_nan_labels ) self.correlations = Correlations( self.original, diff --git a/src/syngen/ml/metrics/metrics_classes/metrics.py b/src/syngen/ml/metrics/metrics_classes/metrics.py index db8c5511..3a36a34a 100644 --- a/src/syngen/ml/metrics/metrics_classes/metrics.py +++ b/src/syngen/ml/metrics/metrics_classes/metrics.py @@ -338,9 +338,11 @@ def __init__( synthetic: pd.DataFrame, plot: bool, reports_path: str, + 
columns_nan_labels: Dict, ): super().__init__(original, synthetic, plot, reports_path) self.cmap = LinearSegmentedColormap.from_list("rg", ["#0D5598", "#3E92E0", "#E8F4FF"]) + self.columns_nan_labels = columns_nan_labels @staticmethod def _format_date_labels(heatmap_orig_data, heatmap_synthetic_data, axis): @@ -442,9 +444,15 @@ def calculate_all( heatmap_orig_data, heatmap_synthetic_data, "y" ) - self._plot_heatmap(heatmap_orig_data, 0, heatmap_min, heatmap_max, cbar=False) + self._plot_heatmap(heatmap_orig_data, plt_index=0, + vrange=(heatmap_min, heatmap_max), + features=(first_col, second_col), + cbar=False) - self._plot_heatmap(heatmap_synthetic_data, 1, heatmap_min, heatmap_max, cbar=True) + self._plot_heatmap(heatmap_synthetic_data, plt_index=1, + vrange=(heatmap_min, heatmap_max), + features=(first_col, second_col), + cbar=True) # first_col is x axis, second_col is y axis title = f"{first_col} vs. {second_col}" path_to_image = ( @@ -463,31 +471,31 @@ def get_common_min_max(original, synthetic): return vmin, vmax @staticmethod - def __format_float_tick_labels(labels: List) -> List: + def __format_float_tick_labels(labels: List, nan_label: str = 'NaN') -> List: + labels = [nan_label if pd.isna(l) else l for l in labels] if all([isinstance(i, float) for i in labels]) and ( max(labels) > 1e5 or min(labels) < 1e-03 ): - labels = [f"{label:.4e}" for label in labels] - return labels + return [f"{label:.4e}" for label in labels] if all([isinstance(i, float) for i in labels]): - labels = [f"{round(i, 4)}" for i in labels] - return labels - else: - return labels + return [f"{round(i, 4)}" for i in labels] + return labels def _plot_heatmap( self, heatmap_data: List, plt_index: int, - vmin: float, - vmax: float, + vrange: tuple[float], + features: tuple[str], cbar=True, ): + vmin, vmax = vrange + xfeature, yfeature = features ax = self._axes.flat[plt_index] ax.tick_params(labelsize=14) heatmap, x_tick_labels, y_tick_labels = heatmap_data - x_tick_labels = 
self.__format_float_tick_labels(x_tick_labels) - y_tick_labels = self.__format_float_tick_labels(y_tick_labels) + x_tick_labels = self.__format_float_tick_labels(x_tick_labels, self.columns_nan_labels.get(xfeature, 'NaN')) + y_tick_labels = self.__format_float_tick_labels(y_tick_labels, self.columns_nan_labels.get(yfeature, 'NaN')) ax = sns.heatmap( heatmap, xticklabels=x_tick_labels, diff --git a/src/syngen/ml/reporters/reporters.py b/src/syngen/ml/reporters/reporters.py index 7f85a140..6aee0d87 100644 --- a/src/syngen/ml/reporters/reporters.py +++ b/src/syngen/ml/reporters/reporters.py @@ -288,6 +288,7 @@ def report(self): self.paths, self.table_name, self.config, + self.columns_nan_labels, ) accuracy_test.report( cont_columns=list(float_columns | int_columns),