Merge branch 'main' into main
ankatiyar authored Sep 24, 2024
2 parents 49d3a88 + 4b75db7 commit 38f3338
Showing 23 changed files with 507 additions and 176 deletions.
28 changes: 10 additions & 18 deletions .gitpod.yml
@@ -1,6 +1,4 @@
# Learn more from ready-to-use templates: https://www.gitpod.io/docs/introduction/getting-started/quickstart
image: gitpod/workspace-python-3.10:2023-04-20-16-32-37

image: gitpod/workspace-python-3.11

tasks:
# We want packages installed during the pre-build init steps to go to /workspace
@@ -12,22 +10,16 @@ tasks:
echo PIP_USER=no >> ~/.bashrc && export PIP_USER=no
init: |
make sign-off
pip install uv
uv venv
echo source .venv/bin/activate >> ~/.bashrc
source ~/.bashrc
make install-test-requirements plugin=kedro-datasets
command: |
pre-commit install --install-hooks
clear
github:
prebuilds:
# enable for the master/default branch (defaults to true)
master: true
# enable for all branches in this repo (defaults to false)
branches: true
# enable for pull requests coming from this repo (defaults to true)
pullRequests: true
# enable for pull requests coming from forks (defaults to false)
pullRequestsFromForks: true
# add a "Review in Gitpod" button as a comment to pull requests (defaults to true)
addComment: false
# add a "Review in Gitpod" button to pull requests (defaults to false)
addBadge: true
- name: system
init: |
sudo apt-get update && sudo apt-get install -y --no-install-recommends libgl1 make
sudo apt-get install -y --no-install-recommends libatk-bridge2.0-0 libcups2 ca-certificates fonts-liberation libasound2 libatk-bridge2.0-0 libatk1.0-0 libc6 libcairo2 libcups2 libdbus-1-3 libexpat1 libfontconfig1 libgbm1 libgcc1 libglib2.0-0 libgtk-3-0 libnspr4 libnss3 libpango-1.0-0 libpangocairo-1.0-0 libstdc++6 libx11-6 libx11-xcb1 libxcb1 libxcomposite1 libxcursor1 libxdamage1 libxext6 libxfixes3 libxi6 libxrandr2 libxrender1 libxss1 libxtst6 lsb-release wget xdg-utils
68 changes: 26 additions & 42 deletions Makefile
@@ -5,13 +5,6 @@ package:
rm -Rf dist;\
python -m build

pypi:
python -m pip install twine -U
python -m twine upload $(plugin)/dist/*

install: package
cd $(plugin) && pip install -U dist/*.whl

install-pip-setuptools:
python -m pip install -U pip setuptools wheel

@@ -25,46 +18,14 @@ mypy:
test:
cd $(plugin) && pytest tests --cov-config pyproject.toml --numprocesses 4 --dist loadfile

# Run test_tensorflow_model_dataset separately, because these tests are flaky when run as part of the full test-suite
dataset-tests: dataset-doctests
cd kedro-datasets && pytest tests --cov-config pyproject.toml --numprocesses 4 --dist loadfile --ignore tests/tensorflow
cd kedro-datasets && pytest tests/tensorflow/test_tensorflow_model_dataset.py --no-cov

extra_pytest_args-no-spark=--ignore kedro_datasets/databricks --ignore kedro_datasets/spark
extra_pytest_args=
dataset-doctest%:
if [ "${*}" != 's-no-spark' ] && [ "${*}" != 's' ]; then \
echo "make: *** No rule to make target \`${@}\`. Stop."; \
exit 2; \
fi; \
\
# The ignored datasets below require complicated setup with cloud/database clients which is overkill for the doctest examples.
cd kedro-datasets && pytest kedro_datasets --doctest-modules --doctest-continue-on-failure --no-cov \
--ignore kedro_datasets/pandas/gbq_dataset.py \
--ignore kedro_datasets/partitions/partitioned_dataset.py \
--ignore kedro_datasets/redis/redis_dataset.py \
--ignore kedro_datasets/snowflake/snowpark_dataset.py \
--ignore kedro_datasets/spark/spark_hive_dataset.py \
--ignore kedro_datasets/spark/spark_jdbc_dataset.py \
$(extra_pytest_arg${*})

test-sequential:
cd $(plugin) && pytest tests --cov-config pyproject.toml

e2e-tests:
cd $(plugin) && behave

secret-scan:
trufflehog --max_depth 1 --exclude_paths trufflehog-ignore.txt .

clean:
cd $(plugin);\
rm -rf build dist pip-wheel-metadata .pytest_cache;\
find . -regex ".*/__pycache__" -exec rm -rf {} +;\
find . -regex ".*\.egg-info" -exec rm -rf {} +;\

install-test-requirements:
cd $(plugin) && pip install ".[test]"
cd $(plugin) && uv pip install ".[test]"

install-pre-commit:
pre-commit install --install-hooks
@@ -79,12 +40,12 @@ sign-off:
echo '--in-place "$$1"' >> .git/hooks/commit-msg
chmod +x .git/hooks/commit-msg

## kedro-datasets specific

# kedro-datasets related only
test-no-spark: dataset-doctests-no-spark
cd kedro-datasets && pytest tests --no-cov --ignore tests/spark --ignore tests/databricks --numprocesses 4 --dist loadfile

test-no-spark-sequential: dataset-doctests-no-spark
cd kedro-datasets && pytest tests --no-cov --ignore tests/spark --ignore tests/databricks

# kedro-datasets/snowflake tests skipped from default scope
test-snowflake-only:
@@ -93,3 +54,26 @@ test-snowflake-only:

check-datasets-docs:
cd kedro-datasets && python -m sphinx -WETan -j auto -D language=en -b linkcheck -d _build/doctrees docs/source _build/linkcheck

# Run test_tensorflow_model_dataset separately, because these tests are flaky when run as part of the full test-suite
dataset-tests: dataset-doctests
cd kedro-datasets && pytest tests --cov-config pyproject.toml --numprocesses 4 --dist loadfile --ignore tests/tensorflow
cd kedro-datasets && pytest tests/tensorflow/test_tensorflow_model_dataset.py --no-cov

extra_pytest_args-no-spark=--ignore kedro_datasets/databricks --ignore kedro_datasets/spark
extra_pytest_args=
dataset-doctest%:
if [ "${*}" != 's-no-spark' ] && [ "${*}" != 's' ]; then \
echo "make: *** No rule to make target \`${@}\`. Stop."; \
exit 2; \
fi; \
\
# The ignored datasets below require complicated setup with cloud/database clients which is overkill for the doctest examples.
cd kedro-datasets && pytest kedro_datasets --doctest-modules --doctest-continue-on-failure --no-cov \
--ignore kedro_datasets/pandas/gbq_dataset.py \
--ignore kedro_datasets/partitions/partitioned_dataset.py \
--ignore kedro_datasets/redis/redis_dataset.py \
--ignore kedro_datasets/snowflake/snowpark_dataset.py \
--ignore kedro_datasets/spark/spark_hive_dataset.py \
--ignore kedro_datasets/spark/spark_jdbc_dataset.py \
$(extra_pytest_arg${*})
28 changes: 19 additions & 9 deletions kedro-datasets/pyproject.toml
@@ -33,8 +33,9 @@ api = ["kedro-datasets[api-apidataset]"]
biosequence-biosequencedataset = ["biopython~=1.73"]
biosequence = ["kedro-datasets[biosequence-biosequencedataset]"]

dask-csvdataset = ["dask[dataframe]>=2021.10"]
dask-parquetdataset = ["dask[complete]>=2021.10", "triad>=0.6.7, <1.0"]
dask = ["kedro-datasets[dask-parquetdataset]"]
dask = ["kedro-datasets[dask-parquetdataset, dask-csvdataset]"]

databricks-managedtabledataset = ["kedro-datasets[spark-base,pandas-base,delta-base,hdfs-base,s3fs-base]"]
databricks = ["kedro-datasets[databricks-managedtabledataset]"]
@@ -92,7 +93,7 @@ pandas-featherdataset = ["kedro-datasets[pandas-base]"]
pandas-gbqtabledataset = ["kedro-datasets[pandas-base]", "pandas-gbq>=0.12.0"]
pandas-gbqquerydataset = ["kedro-datasets[pandas-base]", "pandas-gbq>=0.12.0"]
pandas-genericdataset = ["kedro-datasets[pandas-base]"]
pandas-hdfdataset = ["kedro-datasets[pandas-base]", "tables~=3.6"]
pandas-hdfdataset = ["kedro-datasets[pandas-base]", "tables>=3.6"]
pandas-jsondataset = ["kedro-datasets[pandas-base]"]
pandas-parquetdataset = ["kedro-datasets[pandas-base]", "pyarrow>=6.0"]
pandas-sqltabledataset = ["kedro-datasets[pandas-base]", "SQLAlchemy>=1.4, <3.0"]
@@ -127,9 +128,12 @@ plotly = ["kedro-datasets[plotly-htmldataset,plotly-jsondataset,plotly-plotlydat

polars-csvdataset = ["kedro-datasets[polars-base]"]
polars-eagerpolarsdataset = ["kedro-datasets[polars-base]", "pyarrow>=4.0", "xlsx2csv>=0.8.0", "deltalake >= 0.6.2"]
polars-genericdataset = ["kedro-datasets[polars-base]", "pyarrow>=4.0", "xlsx2csv>=0.8.0", "deltalake >= 0.6.2"]
polars-lazypolarsdataset = ["kedro-datasets[polars-base]", "pyarrow>=4.0", "deltalake >= 0.6.2"]
polars = ["kedro-datasets[polars-genericdataset]"]
polars = [
"""kedro-datasets[polars-csvdataset,\
polars-eagerpolarsdataset,\
polars-lazypolarsdataset]"""
]

redis-pickledataset = ["redis~=4.1"]
redis = ["kedro-datasets[redis-pickledataset]"]
@@ -140,8 +144,15 @@ snowflake = ["kedro-datasets[snowflake-snowparktabledataset]"]
spark-deltatabledataset = ["kedro-datasets[spark-base,hdfs-base,s3fs-base,delta-base]"]
spark-sparkdataset = ["kedro-datasets[spark-base,hdfs-base,s3fs-base]"]
spark-sparkhivedataset = ["kedro-datasets[spark-base,hdfs-base,s3fs-base]"]
spark-sparkjdbcdataset = ["kedro-datasets[spark-base,hdfs-base,s3fs-base]"]
spark = ["kedro-datasets[spark-deltatabledataset]"]
spark-sparkjdbcdataset = ["kedro-datasets[spark-base]"]
spark-sparkstreamingdataset = ["kedro-datasets[spark-base,hdfs-base,s3fs-base]"]
spark = [
"""kedro-datasets[spark-deltatabledataset,\
spark-sparkdataset,\
spark-sparkhivedataset,\
spark-sparkjdbcdataset,\
spark-sparkstreamingdataset]"""
]

svmlight-svmlightdataset = ["scikit-learn>=1.0.2", "scipy~=1.7.3"]
svmlight = ["kedro-datasets[svmlight-svmlightdataset]"]
@@ -211,7 +222,7 @@ test = [
"ibis-framework[duckdb,examples]",
"import-linter[toml]==1.2.6",
"ipython>=7.31.1, <8.0",
"Jinja2<3.1.0",
"Jinja2<3.2.0",
"joblib>=0.14",
"jupyterlab>=3.0",
"jupyter~=1.0",
@@ -250,8 +261,7 @@ test = [
"scipy>=1.7.3",
"packaging",
"SQLAlchemy>=1.2",
"tables>=3.8.0; platform_system == 'Windows'", # Import issues with python 3.8 with pytables pinning to 3.8.0 fixes this https://github.com/PyTables/PyTables/issues/933#issuecomment-1555917593
"tables~=3.6; platform_system != 'Windows'",
"tables>=3.6",
"tensorflow-macos~=2.0; platform_system == 'Darwin' and platform_machine == 'arm64'",
"tensorflow~=2.0; platform_system != 'Darwin' or platform_machine != 'arm64'",
"triad>=0.6.7, <1.0",
2 changes: 1 addition & 1 deletion kedro-telemetry/kedro_telemetry/__init__.py
@@ -4,4 +4,4 @@

import logging

logging.getLogger(__name__).setLevel(logging.INFO)
logging.getLogger(__name__).setLevel(logging.DEBUG)
27 changes: 10 additions & 17 deletions kedro-telemetry/kedro_telemetry/masking.py
@@ -1,7 +1,7 @@
"""Module containing command masking functionality."""
from __future__ import annotations

from typing import Any, Iterator
from typing import Any

import click

@@ -81,16 +81,19 @@ def _get_cli_structure(
return output


def _mask_kedro_cli(
cli_struct: dict[str | None, Any], command_args: list[str]
) -> list[str]:
def _mask_kedro_cli(cli: click.CommandCollection, command_args: list[str]) -> list[str]:
"""Takes a dynamic vocabulary (based on `KedroCLI`) and returns
a masked CLI input"""
output = []

# Preserve the initial part of the command until parameters sections begin
arg_index = 0
current_CLI = cli_struct.get("kedro", {})
cmd = command_args[0] if command_args else ""
if cmd in {"--help", "--version", "-h", "-v", ""}:
return command_args
click_cmd = cli.get_command(ctx=None, cmd_name=cmd) # type: ignore
if click_cmd is None:
return [MASK]

current_CLI = _get_cli_structure(click_cmd)
while (
arg_index < len(command_args)
and not command_args[arg_index].startswith("-")
@@ -116,13 +119,3 @@
output.append(MASK)

return output


def _recursive_items(dictionary: dict[Any, Any]) -> Iterator[Any]:
for key, value in dictionary.items():
if isinstance(value, dict):
yield key
yield from _recursive_items(value)
else:
yield key
yield value
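
For context, a minimal sketch of how the simplified helper is called after this change, mirroring `before_command_run` in plugin.py below; the working directory and command arguments are placeholders.

# Minimal sketch, assuming a Kedro project in the current working directory.
# KedroCLI and the _mask_kedro_cli signature are taken from this commit;
# the example arguments are hypothetical.
from pathlib import Path

from kedro.framework.cli.cli import KedroCLI
from kedro_telemetry.masking import _mask_kedro_cli

cli = KedroCLI(project_path=Path.cwd())

# Known vocabulary (command and option names) is preserved; free-form values
# are replaced with the mask token, e.g. ["run", "--pipeline", "*****"].
print(_mask_kedro_cli(cli, command_args=["run", "--pipeline", "my_pipeline"]))

# Bare --help/--version/-h/-v invocations are passed through unmasked,
# and unknown commands collapse to the mask token.
print(_mask_kedro_cli(cli, command_args=["--help"]))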
29 changes: 14 additions & 15 deletions kedro-telemetry/kedro_telemetry/plugin.py
@@ -26,7 +26,7 @@
from kedro.pipeline import Pipeline

from kedro_telemetry import __version__ as TELEMETRY_VERSION
from kedro_telemetry.masking import _get_cli_structure, _mask_kedro_cli
from kedro_telemetry.masking import _mask_kedro_cli

HEAP_APPID_PROD = "2388822444"
HEAP_ENDPOINT = "https://heapanalytics.com/api/track"
@@ -49,6 +49,7 @@
CONFIG_FILENAME = "telemetry.toml"
PYPROJECT_CONFIG_NAME = "pyproject.toml"
UNDEFINED_PACKAGE_NAME = "undefined_package_name"
MISSING_USER_IDENTITY = "missing_user_identity"

logger = logging.getLogger(__name__)

@@ -78,7 +79,7 @@ def _get_or_create_uuid() -> str:
return new_uuid

except Exception as e:
logging.error(f"Failed to retrieve UUID: {e}")
logging.debug(f"Failed to retrieve UUID: {e}")
return ""


@@ -104,7 +105,7 @@ def _get_or_create_project_id(pyproject_path: Path) -> str | None:
file.write(toml_string)
return project_id
except KeyError:
logging.error(
logging.debug(
f"Failed to retrieve project id or save project id: "
f"{str(pyproject_path)} does not contain a [tool.kedro] section"
)
@@ -148,7 +149,7 @@ def _generate_new_uuid(full_path: str) -> str:

return new_uuid
except Exception as e:
logging.error(f"Failed to create UUID: {e}")
logging.debug(f"Failed to create UUID: {e}")
return ""


@@ -176,10 +177,7 @@ def before_command_run(

# get KedroCLI and its structure from actual project root
cli = KedroCLI(project_path=project_path if project_path else Path.cwd())
cli_struct = _get_cli_structure(cli_obj=cli, get_help=False)
masked_command_args = _mask_kedro_cli(
cli_struct=cli_struct, command_args=command_args
)
masked_command_args = _mask_kedro_cli(cli, command_args=command_args)

self._user_uuid = _get_or_create_uuid()

@@ -200,13 +198,15 @@ def after_command_run(self):

@hook_impl
def after_context_created(self, context):
"""Hook implementation to send project statistics data to Heap"""
"""Hook implementation to read metadata"""

self._consent = _check_for_telemetry_consent(context.project_path)
self._project_path = context.project_path

@hook_impl
def after_catalog_created(self, catalog):
"""Hook implementation to send project statistics data to Heap"""

if self._consent is False:
return

@@ -241,12 +241,12 @@ def _send_telemetry_heap_event(self, event_name: str):
try:
_send_heap_event(
event_name=event_name,
identity=self._user_uuid,
identity=self._user_uuid if self._user_uuid else MISSING_USER_IDENTITY,
properties=self._event_properties,
)
self._sent = True
except Exception as exc:
logger.warning(
logger.debug(
"Something went wrong in hook implementation to send command run data to Heap. "
"Exception: %s",
exc,
@@ -324,22 +324,21 @@ def _send_heap_event(
"event": event_name,
"timestamp": datetime.now().strftime(TIMESTAMP_FORMAT),
"properties": properties or {},
"identity": identity,
}
if identity:
data["identity"] = identity

try:
resp = requests.post(
url=HEAP_ENDPOINT, headers=HEAP_HEADERS, data=json.dumps(data), timeout=10
)
if resp.status_code != 200: # noqa: PLR2004
logger.warning(
logger.debug(
"Failed to send data to Heap. Response code returned: %s, Response reason: %s",
resp.status_code,
resp.reason,
)
except requests.exceptions.RequestException as exc:
logger.warning(
logger.debug(
"Failed to send data to Heap. Exception of type '%s' was raised.",
type(exc).__name__,
)
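
A condensed sketch of the behaviour change in `_send_heap_event`: the identity field is now attached only when a non-empty identity was resolved, with callers such as `_send_telemetry_heap_event` substituting MISSING_USER_IDENTITY when no user UUID is available. Field names below come from the hunk above; the timestamp format and example values are assumptions.

# Sketch only: payload assembly mirroring the hunk above.
import json
from datetime import datetime

def build_heap_payload(event_name: str, identity: str, properties: dict | None) -> dict:
    data = {
        "event": event_name,
        "timestamp": datetime.now().isoformat(),  # assumption: real code uses TIMESTAMP_FORMAT
        "properties": properties or {},
    }
    if identity:  # attach identity only when it could be resolved
        data["identity"] = identity
    return data

# With a resolved UUID the identity key is present; otherwise it is omitted entirely.
print(json.dumps(build_heap_payload("CLI command", "some-user-uuid", {"command": "kedro run"})))
print(json.dumps(build_heap_payload("CLI command", "", None)))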