
Commit

resolve conflicts
Hanna Imshenetska authored and Hanna Imshenetska committed Oct 23, 2024
2 parents 65280be + ac1fd9c commit dae6045
Showing 12 changed files with 242 additions and 20 deletions.
66 changes: 66 additions & 0 deletions .github/workflows/databricks-test.yml
@@ -0,0 +1,66 @@
name: Databricks-compatibility
on: [push]

jobs:
  Databricks-compatibility-test:
    runs-on: ubuntu-22.04
    steps:
      - uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.11'

      - name: Display Python version
        run: python -c "import sys; print(sys.version)"

      - name: Install dependencies
        run: |
          cp -n databricks/requirements-databricks-15.4-LTS.txt .
          python -m pip install --upgrade pip
          pip install flake8 pytest
          if [ -f requirements-databricks-15.4-LTS.txt ]; then pip install -r requirements-databricks-15.4-LTS.txt; fi
      - name: Lint with flake8
        run: |
          # stop the build if there are Python syntax errors or undefined names
          flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
          # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
          flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
      - name: Test with pytest
        run: |
          export PYTHONPATH=$PYTHONPATH:${{ github.workspace }}/src
          pytest src/tests
      - name: Build test Docker image
        run: |
          cp -n databricks/databricks.dockerfile .
          docker build -t databricks-test-image -f databricks.dockerfile .
  build-and-publish:
    needs: Databricks-compatibility-test
    runs-on: ubuntu-22.04
    steps:
      - uses: actions/checkout@v4
      - name: Set up Python 3.11
        uses: actions/setup-python@v5
        with:
          python-version: "3.11"
      - name: Install build dependencies
        run: |
          cp -n databricks/setup.cfg .
          cp -n databricks/pyproject.toml .
          sed -i 's/$/rc1+dbx/' src/syngen/VERSION
          cat src/syngen/VERSION
          python -m pip install --upgrade pip
          pip install build
      - name: Build Package
        run: python -m build .
      - name: Publish package
        if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags')
        uses: pypa/gh-action-pypi-publish@release/v1
        with:
          password: ${{ secrets.PYPI_TEST_TOKEN }} # For release: use secrets.PYPI_TOKEN
          verbose: true
3 changes: 3 additions & 0 deletions databricks/README
@@ -0,0 +1,3 @@
# These files relate to the Databricks-compatible library

For testing purposes only
23 changes: 23 additions & 0 deletions databricks/databricks.dockerfile
@@ -0,0 +1,23 @@
# syntax=docker/dockerfile:1

# Build the initial docker image:
FROM databricksruntime/standard:15.4-LTS AS builder

# Set a build argument to select the proper pip install command, since an rc version of syngen is present in the requirements file
# For local tests, use the following parameter to pass build argument:
# --build-arg PIP_INSTALL_CMD="pip install --index-url https://test.pypi.org/simple/ --extra-index-url https://pypi.org/simple/ --use-pep517 --no-cache-dir -r requirements.txt"

ARG PIP_INSTALL_CMD="pip install --use-pep517 --no-cache-dir -r requirements-databricks-15.4-LTS.txt"

# Minimize the number of RUN commands and clean up cache and temporary files
RUN apt-get update && \
apt-get install -y gcc g++ ccache build-essential curl && \
apt-get autoremove -y && \
apt-get clean && \
rm -rf /var/lib/{apt,dpkg,cache,log}
COPY src /src
COPY requirements-databricks-15.4-LTS.txt /requirements-databricks-15.4-LTS.txt
RUN /databricks/python3/bin/${PIP_INSTALL_CMD}
ENV MPLCONFIGDIR=/tmp
ENV PYTHONPATH="${PYTHONPATH}:/src"
WORKDIR /src
3 changes: 3 additions & 0 deletions databricks/pyproject.toml
@@ -0,0 +1,3 @@
[build-system]
requires = ["setuptools>=68.0.0", "wheel"]
build-backend = "setuptools.build_meta"
37 changes: 37 additions & 0 deletions databricks/requirements-databricks-15.4-LTS.txt
@@ -0,0 +1,37 @@
aiohttp>=3.9.0
attrs
avro
base32-crockford
boto3
category_encoders==2.6.3
click
Jinja2
keras==2.15.*
lazy==1.4
loguru
MarkupSafe==2.1.1
marshmallow==3.19.*
matplotlib==3.7.*
mlflow-skinny==2.11.*
numpy==1.23.*
openpyxl
pandas==2.2.*
pandavro==1.8.*
pathos==0.2.*
pillow==9.4.*
psutil
py-ulid
pytest
pytest-reportportal
python-slugify[unidecode]>=7.0.0
PyYAML==6.*
reportportal-client
scikit_learn==1.3.*
scipy==1.11.*
seaborn==0.12.*
setuptools==68.*
tensorflow==2.15.*
tqdm==4.66.3
Werkzeug==3.0.3
xlrd
xlwt
82 changes: 82 additions & 0 deletions databricks/setup.cfg
@@ -0,0 +1,82 @@
[metadata]
name = syngen
version = file: src/syngen/VERSION
description = file: DESCRIPTION
long_description = file: README.md
long_description_content_type = text/markdown
url = https://github.com/tdspora/syngen
author = EPAM Systems, Inc.
maintainer = Pavel Bobyrev
license = GPLv3 License
keywords = data, generation, synthetic, vae, tabular
classifiers =
    Development Status :: 5 - Production/Stable
    Operating System :: POSIX :: Linux
    Operating System :: Microsoft :: Windows
    License :: OSI Approved :: GNU General Public License v3 (GPLv3)
    Programming Language :: Python :: 3.10
    Programming Language :: Python :: 3.11


[options]
package_dir =
    = src
packages = find:
include_package_data = True
python_requires = >3.10, <3.12
install_requires =
    aiohttp>=3.9.0
    attrs
    avro
    base32-crockford
    boto3
    category_encoders==2.6.3
    click
    Jinja2
    keras==2.15.*
    lazy==1.4
    loguru
    MarkupSafe==2.1.1
    marshmallow==3.19.*
    matplotlib==3.7.*
    mlflow-skinny==2.11.*
    numpy==1.23.*
    openpyxl
    pandas==2.2.*
    pandavro==1.8.*
    pathos==0.2.*
    pillow==9.4.*
    psutil
    py-ulid
    pytest
    pytest-reportportal
    python-slugify[unidecode]>=7.0.0
    PyYAML==6.*
    reportportal-client
    scikit_learn==1.3.*
    scipy==1.11.*
    seaborn==0.12.*
    setuptools==68.*
    tensorflow==2.15.*
    tqdm==4.66.3
    Werkzeug==3.0.3
    xlrd
    xlwt


[options.extras_require]
ui =
    streamlit==1.31.*
    streamlit_option_menu


[options.packages.find]
where = src
[options.package_data]
* = *.py, *.html, *.ttf, *.svg, *.css, *.js

[options.entry_points]
console_scripts =
    train = syngen.train:launch_train
    infer = syngen.infer:launch_infer
    syngen = syngen:main
2 changes: 1 addition & 1 deletion src/syngen/VERSION
@@ -1 +1 @@
0.9.46rc10
0.9.46rc11
1 change: 1 addition & 0 deletions src/syngen/ml/data_loaders/data_loaders.py
@@ -318,6 +318,7 @@ def _save_data(self, df: pd.DataFrame, schema: Optional[Dict]):

def save_data(self, df: pd.DataFrame, schema: Optional[Dict] = None, **kwargs):
if schema is not None:
logger.trace(f"The data will be saved with the schema: {schema}")
preprocessed_schema = (
self._get_preprocessed_schema(schema) if schema is not None else schema
)
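Note: loguru's TRACE level sits below DEBUG and is filtered out by the default sink, so the new logger.trace call only produces output when a sink accepting TRACE is configured. A minimal sketch, assuming a plain loguru setup (the schema dict below is illustrative, not syngen's actual schema format):

    import sys
    from loguru import logger

    # The default sink logs DEBUG and above; re-add one that accepts TRACE.
    logger.remove()
    logger.add(sys.stderr, level="TRACE")

    schema = {"format": "Avro", "fields": {"id": "int"}}  # hypothetical schema payload
    logger.trace(f"The data will be saved with the schema: {schema}")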
5 changes: 0 additions & 5 deletions src/syngen/ml/handlers/handlers.py
@@ -501,11 +501,6 @@ def handle(self, **kwargs):
)
generated_data = generated_data[self.dataset.order_of_columns]

if self.original_schema:
logger.trace(
f"The synthetic data of the table - '{self.table_name}' "
f"will be saved with the schema: {self.original_schema}"
)
if generated_data is None:
DataLoader(self.paths["path_to_merged_infer"]).save_data(
prepared_data,
5 changes: 4 additions & 1 deletion src/syngen/ml/metrics/accuracy_test/accuracy_test.py
@@ -114,9 +114,11 @@ def __init__(
paths: dict,
table_name: str,
infer_config: Dict,
columns_nan_labels: Dict,
):
super().__init__(original, synthetic, paths, table_name, infer_config)
self.reports_path = f"{self.paths['reports_path']}/accuracy"
self.columns_nan_labels = columns_nan_labels
self.univariate = UnivariateMetric(
self.original,
self.synthetic,
@@ -127,7 +129,8 @@ def __init__(
self.original,
self.synthetic,
self.plot_exists,
self.reports_path
self.reports_path,
self.columns_nan_labels
)
self.correlations = Correlations(
self.original,
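The new columns_nan_labels argument threads the per-column NaN labels from the reporter down to the heatmap tick formatting. A hypothetical example of the mapping shape the plotting code expects (column name to missing-value label; the real mapping comes from syngen's configuration, and the column names below are invented):

    from typing import Dict

    # Hypothetical mapping from column name to its missing-value label.
    columns_nan_labels: Dict[str, str] = {
        "income": "missing",
        "signup_date": "unknown",
    }

    # Columns without an entry fall back to the literal "NaN",
    # mirroring columns_nan_labels.get(feature, 'NaN') in metrics.py.
    print(columns_nan_labels.get("age", "NaN"))  # -> NaN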
34 changes: 21 additions & 13 deletions src/syngen/ml/metrics/metrics_classes/metrics.py
@@ -338,9 +338,11 @@ def __init__(
synthetic: pd.DataFrame,
plot: bool,
reports_path: str,
columns_nan_labels: Dict,
):
super().__init__(original, synthetic, plot, reports_path)
self.cmap = LinearSegmentedColormap.from_list("rg", ["#0D5598", "#3E92E0", "#E8F4FF"])
self.columns_nan_labels = columns_nan_labels

@staticmethod
def _format_date_labels(heatmap_orig_data, heatmap_synthetic_data, axis):
@@ -442,9 +444,15 @@ def calculate_all(
heatmap_orig_data, heatmap_synthetic_data, "y"
)

self._plot_heatmap(heatmap_orig_data, 0, heatmap_min, heatmap_max, cbar=False)
self._plot_heatmap(heatmap_orig_data, plt_index=0,
vrange=(heatmap_min, heatmap_max),
features=(first_col, second_col),
cbar=False)

self._plot_heatmap(heatmap_synthetic_data, 1, heatmap_min, heatmap_max, cbar=True)
self._plot_heatmap(heatmap_synthetic_data, plt_index=1,
vrange=(heatmap_min, heatmap_max),
features=(first_col, second_col),
cbar=True)
# first_col is x axis, second_col is y axis
title = f"{first_col} vs. {second_col}"
path_to_image = (
@@ -463,31 +471,31 @@ def get_common_min_max(original, synthetic):
return vmin, vmax

@staticmethod
def __format_float_tick_labels(labels: List) -> List:
def __format_float_tick_labels(labels: List, nan_label: str = 'NaN') -> List:
labels = [nan_label if pd.isna(l) else l for l in labels]
if all([isinstance(i, float) for i in labels]) and (
max(labels) > 1e5 or min(labels) < 1e-03
):
labels = [f"{label:.4e}" for label in labels]
return labels
return [f"{label:.4e}" for label in labels]
if all([isinstance(i, float) for i in labels]):
labels = [f"{round(i, 4)}" for i in labels]
return labels
else:
return labels
return [f"{round(i, 4)}" for i in labels]
return labels

def _plot_heatmap(
self,
heatmap_data: List,
plt_index: int,
vmin: float,
vmax: float,
vrange: tuple[float],
features: tuple[str],
cbar=True,
):
vmin, vmax = vrange
xfeature, yfeature = features
ax = self._axes.flat[plt_index]
ax.tick_params(labelsize=14)
heatmap, x_tick_labels, y_tick_labels = heatmap_data
x_tick_labels = self.__format_float_tick_labels(x_tick_labels)
y_tick_labels = self.__format_float_tick_labels(y_tick_labels)
x_tick_labels = self.__format_float_tick_labels(x_tick_labels, self.columns_nan_labels.get(xfeature, 'NaN'))
y_tick_labels = self.__format_float_tick_labels(y_tick_labels, self.columns_nan_labels.get(yfeature, 'NaN'))
ax = sns.heatmap(
heatmap,
xticklabels=x_tick_labels,
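For reference, the updated tick-label formatting can be reproduced as a standalone function. This is a sketch mirroring the diff above (restructured with nested conditions, not the module itself), useful for checking the NaN-label substitution and float rounding behaviour:

    from typing import List

    import pandas as pd


    def format_float_tick_labels(labels: List, nan_label: str = "NaN") -> List:
        # Replace missing values with the column's NaN label first.
        labels = [nan_label if pd.isna(label) else label for label in labels]
        # Purely float labels get scientific notation at extreme magnitudes,
        # otherwise they are rounded to four decimals; mixed labels pass through.
        if all(isinstance(label, float) for label in labels):
            if max(labels) > 1e5 or min(labels) < 1e-03:
                return [f"{label:.4e}" for label in labels]
            return [f"{round(label, 4)}" for label in labels]
        return labels


    print(format_float_tick_labels([0.123456, 250000.0]))            # scientific notation
    print(format_float_tick_labels([0.123456, 2.5]))                 # rounded to 4 decimals
    print(format_float_tick_labels([1.5, float("nan")], "missing"))  # NaN label substituted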
1 change: 1 addition & 0 deletions src/syngen/ml/reporters/reporters.py
@@ -288,6 +288,7 @@ def report(self):
self.paths,
self.table_name,
self.config,
self.columns_nan_labels,
)
accuracy_test.report(
cont_columns=list(float_columns | int_columns),
