Merge branch 'main' into feat/datasets/delay-connection
deepyaman authored Oct 13, 2023
2 parents 54f12b3 + b946eec commit 86c92af
Showing 60 changed files with 1,951 additions and 218 deletions.
9 changes: 9 additions & 0 deletions .pre-commit-config.yaml
@@ -15,6 +15,15 @@ repos:
- id: check-merge-conflict # Check for files that contain merge conflict strings.
- id: debug-statements # Check for debugger imports and py37+ `breakpoint()` calls in python source.

- repo: https://github.com/adamchainz/blacken-docs
rev: 1.16.0
hooks:
- id: blacken-docs
args:
- "--rst-literal-blocks"
additional_dependencies:
- black==22.12.0

- repo: local
hooks:
- id: ruff-kedro-datasets
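The new `blacken-docs` hook runs Black over Python examples embedded in docstrings and documentation, and the `--rst-literal-blocks` flag extends that to examples introduced with a bare `::`. Much of the docstring reformatting further down in this commit (double quotes, trailing commas, reflowed call arguments) matches the style this hook enforces. A minimal, hypothetical docstring showing the formatting it settles on (the class and file path below are made up for illustration):

```python
class ExampleDataset:
    """Hypothetical dataset used only to illustrate blacken-docs output.

    Example:

    .. code-block:: pycon

        >>> dataset = ExampleDataset(
        ...     filepath="data/01_raw/example.csv",
        ...     load_args={"sep": ","},
        ...     save_args={"index": False},
        ... )
        >>> data = dataset.load()
    """
```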
1 change: 1 addition & 0 deletions kedro-airflow/RELEASE.md
@@ -2,6 +2,7 @@
* Added support for Python 3.11
* Added the `--all` CLI argument to `kedro-airflow` to convert all registered pipelines at once.
* Simplified the output of the `kedro airflow create` command.
* Fixed compatibility of `kedro-airflow` with older versions of the config loaders (`kedro<=0.18.2`).

## Community contributions
Many thanks to the following Kedroids for contributing PRs to this release:
24 changes: 12 additions & 12 deletions kedro-airflow/features/steps/cli_steps.py
@@ -20,27 +20,27 @@ def init_airflow(context, home_dir):
def prepare_old_catalog(context):
config = {
"example_train_x": {
"type": "PickleLocalDataSet",
"type": "PickleLocalDataset",
"filepath": "data/02_intermediate/example_train_x.pkl",
},
"example_train_y": {
"type": "PickleLocalDataSet",
"type": "PickleLocalDataset",
"filepath": "data/02_intermediate/example_train_y.pkl",
},
"example_test_x": {
"type": "PickleLocalDataSet",
"type": "PickleLocalDataset",
"filepath": "data/02_intermediate/example_test_x.pkl",
},
"example_test_y": {
"type": "PickleLocalDataSet",
"type": "PickleLocalDataset",
"filepath": "data/02_intermediate/example_test_y.pkl",
},
"example_model": {
"type": "PickleLocalDataSet",
"type": "PickleLocalDataset",
"filepath": "data/02_intermediate/example_model.pkl",
},
"example_predictions": {
"type": "PickleLocalDataSet",
"type": "PickleLocalDataset",
"filepath": "data/02_intermediate/example_predictions.pkl",
},
}
@@ -53,27 +53,27 @@ def prepare_old_catalog(context):
def prepare_catalog(context):
config = {
"example_train_x": {
"type": "pickle.PickleDataSet",
"type": "pickle.PickleDataset",
"filepath": "data/02_intermediate/example_train_x.pkl",
},
"example_train_y": {
"type": "pickle.PickleDataSet",
"type": "pickle.PickleDataset",
"filepath": "data/02_intermediate/example_train_y.pkl",
},
"example_test_x": {
"type": "pickle.PickleDataSet",
"type": "pickle.PickleDataset",
"filepath": "data/02_intermediate/example_test_x.pkl",
},
"example_test_y": {
"type": "pickle.PickleDataSet",
"type": "pickle.PickleDataset",
"filepath": "data/02_intermediate/example_test_y.pkl",
},
"example_model": {
"type": "pickle.PickleDataSet",
"type": "pickle.PickleDataset",
"filepath": "data/02_intermediate/example_model.pkl",
},
"example_predictions": {
"type": "pickle.PickleDataSet",
"type": "pickle.PickleDataset",
"filepath": "data/02_intermediate/example_predictions.pkl",
},
}
13 changes: 9 additions & 4 deletions kedro-airflow/kedro_airflow/plugin.py
@@ -36,17 +36,22 @@ def airflow_commands():


def _load_config(context: KedroContext) -> dict[str, Any]:
# Backwards compatibility for ConfigLoader that does not support `config_patterns`
config_loader = context.config_loader
if not hasattr(config_loader, "config_patterns"):
return config_loader.get("airflow*", "airflow/**")

# Set the default pattern for `airflow` if not provided in `settings.py`
if "airflow" not in context.config_loader.config_patterns.keys():
context.config_loader.config_patterns.update( # pragma: no cover
if "airflow" not in config_loader.config_patterns.keys():
config_loader.config_patterns.update( # pragma: no cover
{"airflow": ["airflow*", "airflow/**"]}
)

assert "airflow" in context.config_loader.config_patterns.keys()
assert "airflow" in config_loader.config_patterns.keys()

# Load the config
try:
return context.config_loader["airflow"]
return config_loader["airflow"]
except MissingConfigException:
# File does not exist
return {}
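The fallback above covers `ConfigLoader` instances from `kedro<=0.18.2`, which predate `config_patterns`. On newer Kedro versions a project can declare the `airflow` pattern itself in `settings.py`, so the plugin does not have to patch `config_patterns` at runtime. A sketch, assuming a recent Kedro 0.18.x release with `OmegaConfigLoader` available (loader class and module path are illustrative, not part of this commit):

```python
# src/<package_name>/settings.py -- illustrative sketch only
from kedro.config import OmegaConfigLoader

CONFIG_LOADER_CLASS = OmegaConfigLoader
CONFIG_LOADER_ARGS = {
    # Lets `context.config_loader["airflow"]` pick up conf/*/airflow*.yml files
    # without kedro-airflow having to inject the pattern at runtime.
    "config_patterns": {"airflow": ["airflow*", "airflow/**"]},
}
```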
24 changes: 23 additions & 1 deletion kedro-airflow/tests/test_plugin.py
@@ -5,8 +5,11 @@

import pytest
import yaml
from kedro.config import ConfigLoader
from kedro.framework.context import KedroContext
from pluggy import PluginManager

from kedro_airflow.plugin import commands
from kedro_airflow.plugin import _load_config, commands


@pytest.mark.parametrize(
@@ -264,3 +267,22 @@ def test_create_airflow_all_and_pipeline(cli_runner, metadata):
"Error: Invalid value: The `--all` and `--pipeline` option are mutually exclusive."
in result.stdout
)


def test_config_loader_backwards_compatibility(cli_runner, metadata):
# Emulate ConfigLoader in kedro <= 0.18.2
conf_source = Path.cwd() / "conf"
config_loader = ConfigLoader(conf_source=conf_source)
del config_loader.config_patterns
context = KedroContext(
config_loader=config_loader,
hook_manager=PluginManager(project_name=metadata.project_name),
package_name=metadata.package_name,
project_path=metadata.project_path,
)

config = _load_config(context)
assert config == {
"default": {"owner": "again someone else"},
"ds": {"owner": "finally someone else"},
}
12 changes: 10 additions & 2 deletions kedro-datasets/RELEASE.md
@@ -1,7 +1,17 @@
# Upcoming Release
## Major features and improvements
* Moved `PartitionedDataSet` and `IncrementalDataSet` from the core Kedro repo to `kedro-datasets` and renamed them to `PartitionedDataset` and `IncrementalDataset`.

## Bug fixes and other changes
* Fixed an erroneous warning when using a cloud protocol file path with `SparkDataSet` on Databricks.
* Updated `PickleDataset` to explicitly mention `cloudpickle` support.

## Upcoming deprecations for Kedro-Datasets 2.0.0
## Community contributions
Many thanks to the following Kedroids for contributing PRs to this release:
* [PtrBld](https://github.com/PtrBld)
* [Alistair McKelvie](https://github.com/alamastor)
* [Felix Wittmann](https://github.com/hfwittmann)

# Release 1.7.1
## Bug fixes and other changes
@@ -10,8 +20,6 @@
## Upcoming deprecations for Kedro-Datasets 2.0.0
* Renamed dataset and error classes, in accordance with the [Kedro lexicon](https://github.com/kedro-org/kedro/wiki/Kedro-documentation-style-guide#kedro-lexicon). Dataset classes ending with "DataSet" are deprecated and will be removed in 2.0.0.

## Community contributions

# Release 1.7.0:
## Major features and improvements
* Added `polars.GenericDataSet`, a `GenericDataSet` backed by [polars](https://www.pola.rs/), a lightning fast dataframe package built entirely using Rust.
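The upcoming-release notes above call out `cloudpickle` support in `PickleDataset`. A minimal sketch of selecting that backend through the dataset's `backend` argument (the file path is illustrative, and `cloudpickle` must be installed separately):

```python
from kedro_datasets.pickle import PickleDataset

# cloudpickle can serialise objects the stdlib pickle rejects,
# e.g. lambdas and locally defined functions.
dataset = PickleDataset(
    filepath="data/06_models/model.pkl",
    backend="cloudpickle",
)

dataset.save(lambda x: x + 1)
loaded = dataset.load()
assert loaded(1) == 2
```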
2 changes: 2 additions & 0 deletions kedro-datasets/docs/source/kedro_datasets.rst
@@ -59,6 +59,8 @@ kedro_datasets
kedro_datasets.pandas.SQLTableDataset
kedro_datasets.pandas.XMLDataSet
kedro_datasets.pandas.XMLDataset
kedro_datasets.partitions.IncrementalDataset
kedro_datasets.partitions.PartitionedDataset
kedro_datasets.pickle.PickleDataSet
kedro_datasets.pickle.PickleDataset
kedro_datasets.pillow.ImageDataSet
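With `PartitionedDataset` and `IncrementalDataset` now shipped as `kedro_datasets.partitions` (see the release notes above), Python-API usage looks roughly like the following sketch; the folder layout, filename suffix, and underlying `CSVDataset` are assumptions for illustration:

```python
from kedro_datasets.pandas import CSVDataset
from kedro_datasets.partitions import PartitionedDataset

# One CSV file per partition underneath the given folder (local or fsspec path).
dataset = PartitionedDataset(
    path="data/01_raw/daily_sales",
    dataset=CSVDataset,
    filename_suffix=".csv",
)

partitions = dataset.load()  # {partition_id: callable that loads that partition}
for partition_id, load_partition in sorted(partitions.items()):
    df = load_partition()  # each partition is read only when its callable is invoked
```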
14 changes: 7 additions & 7 deletions kedro-datasets/kedro_datasets/api/api_dataset.py
@@ -37,7 +37,8 @@ class APIDataset(AbstractDataset[None, requests.Response]):
Example usage for the
`Python API <https://kedro.readthedocs.io/en/stable/data/\
advanced_data_catalog_usage.html>`_:
::
.. code-block:: pycon
>>> from kedro_datasets.api import APIDataset
>>>
@@ -51,23 +52,22 @@ class APIDataset(AbstractDataset[None, requests.Response]):
... "commodity_desc": "CORN",
... "statisticcat_des": "YIELD",
... "agg_level_desc": "STATE",
... "year": 2000
... "year": 2000,
... }
... },
... credentials=("username", "password")
... credentials=("username", "password"),
... )
>>> data = dataset.load()
``APIDataset`` can also be used to save output on a remote server using HTTP(S)
methods.
::
.. code-block:: pycon
>>> example_table = '{"col1":["val1", "val2"], "col2":["val3", "val4"]}'
>>>
>>> dataset = APIDataset(
... method = "POST",
... url = "url_of_remote_server",
... save_args = {"chunk_size":1}
... method="POST", url="url_of_remote_server", save_args={"chunk_size": 1}
... )
>>> dataset.save(example_table)
12 changes: 8 additions & 4 deletions kedro-datasets/kedro_datasets/biosequence/biosequence_dataset.py
@@ -18,7 +18,8 @@ class BioSequenceDataset(AbstractDataset[List, List]):
r"""``BioSequenceDataset`` loads and saves data to a sequence file.
Example:
::
.. code-block:: pycon
>>> from kedro_datasets.biosequence import BioSequenceDataset
>>> from io import StringIO
@@ -28,10 +29,13 @@ class BioSequenceDataset(AbstractDataset[List, List]):
>>> raw_data = []
>>> for record in SeqIO.parse(StringIO(data), "fasta"):
... raw_data.append(record)
...
>>>
>>> dataset = BioSequenceDataset(filepath="ls_orchid.fasta",
... load_args={"format": "fasta"},
... save_args={"format": "fasta"})
>>> dataset = BioSequenceDataset(
... filepath="ls_orchid.fasta",
... load_args={"format": "fasta"},
... save_args={"format": "fasta"},
... )
>>> dataset.save(raw_data)
>>> sequence_list = dataset.load()
>>>
Expand Down
14 changes: 7 additions & 7 deletions kedro-datasets/kedro_datasets/dask/parquet_dataset.py
@@ -37,25 +37,25 @@ class ParquetDataset(AbstractDataset[dd.DataFrame, dd.DataFrame]):
Example usage for the
`Python API <https://kedro.readthedocs.io/en/stable/data/\
advanced_data_catalog_usage.html>`_:
::
.. code-block:: pycon
>>> from kedro.extras.datasets.dask import ParquetDataset
>>> import pandas as pd
>>> import dask.dataframe as dd
>>>
>>> data = pd.DataFrame({'col1': [1, 2], 'col2': [4, 5],
... 'col3': [[5, 6], [7, 8]]})
>>> data = pd.DataFrame({"col1": [1, 2], "col2": [4, 5], "col3": [[5, 6], [7, 8]]})
>>> ddf = dd.from_pandas(data, npartitions=2)
>>>
>>> dataset = ParquetDataset(
... filepath="s3://bucket_name/path/to/folder",
... credentials={
... 'client_kwargs':{
... 'aws_access_key_id': 'YOUR_KEY',
... 'aws_secret_access_key': 'YOUR SECRET',
... "client_kwargs": {
... "aws_access_key_id": "YOUR_KEY",
... "aws_secret_access_key": "YOUR SECRET",
... }
... },
... save_args={"compression": "GZIP"}
... save_args={"compression": "GZIP"},
... )
>>> dataset.save(ddf)
>>> reloaded = dataset.load()
kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py
@@ -176,12 +176,13 @@ class ManagedTableDataset(AbstractVersionedDataset):
.. code-block:: python
from pyspark.sql import SparkSession
from pyspark.sql.types import (StructField, StringType,
IntegerType, StructType)
from pyspark.sql.types import StructField, StringType, IntegerType, StructType
from kedro_datasets.databricks import ManagedTableDataset
schema = StructType([StructField("name", StringType(), True),
StructField("age", IntegerType(), True)])
data = [('Alex', 31), ('Bob', 12), ('Clarke', 65), ('Dave', 29)]
schema = StructType(
[StructField("name", StringType(), True), StructField("age", IntegerType(), True)]
)
data = [("Alex", 31), ("Bob", 12), ("Clarke", 65), ("Dave", 29)]
spark_df = SparkSession.builder.getOrCreate().createDataFrame(data, schema)
dataset = ManagedTableDataset(table="names_and_ages")
dataset.save(spark_df)
3 changes: 2 additions & 1 deletion kedro-datasets/kedro_datasets/email/message_dataset.py
@@ -26,7 +26,8 @@ class EmailMessageDataset(AbstractVersionedDataset[Message, Message]):
Note that ``EmailMessageDataset`` doesn't handle sending email messages.
Example:
::
.. code-block:: pycon
>>> from email.message import EmailMessage
>>>
9 changes: 6 additions & 3 deletions kedro-datasets/kedro_datasets/geopandas/geojson_dataset.py
@@ -26,14 +26,17 @@ class GeoJSONDataset(
allowed geopandas (pandas) options for loading and saving GeoJSON files.
Example:
::
.. code-block:: pycon
>>> import geopandas as gpd
>>> from shapely.geometry import Point
>>> from kedro_datasets.geopandas import GeoJSONDataset
>>>
>>> data = gpd.GeoDataFrame({'col1': [1, 2], 'col2': [4, 5],
... 'col3': [5, 6]}, geometry=[Point(1,1), Point(2,4)])
>>> data = gpd.GeoDataFrame(
... {"col1": [1, 2], "col2": [4, 5], "col3": [5, 6]},
... geometry=[Point(1, 1), Point(2, 4)],
... )
>>> dataset = GeoJSONDataset(filepath="test.geojson", save_args=None)
>>> dataset.save(data)
>>> reloaded = dataset.load()
3 changes: 2 additions & 1 deletion kedro-datasets/kedro_datasets/holoviews/holoviews_writer.py
@@ -21,7 +21,8 @@ class HoloviewsWriter(AbstractVersionedDataset[HoloViews, NoReturn]):
filesystem (e.g. local, S3, GCS).
Example:
::
.. code-block:: pycon
>>> import holoviews as hv
>>> from kedro_datasets.holoviews import HoloviewsWriter
5 changes: 3 additions & 2 deletions kedro-datasets/kedro_datasets/json/json_dataset.py
@@ -34,11 +34,12 @@ class JSONDataset(AbstractVersionedDataset[Any, Any]):
Example usage for the
`Python API <https://kedro.readthedocs.io/en/stable/data/\
advanced_data_catalog_usage.html>`_:
::
.. code-block:: pycon
>>> from kedro_datasets.json import JSONDataset
>>>
>>> data = {'col1': [1, 2], 'col2': [4, 5], 'col3': [5, 6]}
>>> data = {"col1": [1, 2], "col2": [4, 5], "col3": [5, 6]}
>>>
>>> dataset = JSONDataset(filepath="test.json")
>>> dataset.save(data)