
Commit

Merge branch 'main' into doc/slicing-pipeline-kedro-viz
merelcht authored Sep 20, 2024
2 parents 794c478 + 13ff99e commit 4d0f38d
Showing 29 changed files with 606 additions and 544 deletions.
2 changes: 2 additions & 0 deletions .github/styles/Kedro/ignore.txt
@@ -44,3 +44,5 @@ transcoding
transcode
Claypot
ethanknights
Aneira
Printify
9 changes: 9 additions & 0 deletions RELEASE.md
@@ -1,12 +1,19 @@
# Upcoming Release

## Major features and improvements
* Implemented a `Protocol` abstraction for the current `DataCatalog` to support adding new catalog implementations.
* Refactored `kedro run` and `kedro catalog` commands.
* Moved pattern resolution logic from `DataCatalog` to a separate component, `CatalogConfigResolver`, and updated `DataCatalog` to use it internally (see the sketch after this list).
* Made packaged Kedro projects return the `session.run()` output so it can be used when running them in an interactive environment.
* Enhanced `OmegaConfigLoader` configuration validation to detect duplicate keys at all parameter levels, ensuring comprehensive nested key checking.
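
The moved pattern-resolution component can be exercised directly. A minimal sketch (illustrative only; it assumes a project whose `catalog.yml` defines a `"{name}_data"` dataset factory, and uses only resolver methods that appear in the diff below — `list_patterns`, `match_pattern`, `resolve_dataset_pattern`):

```python
from pathlib import Path

from kedro.framework.session import KedroSession
from kedro.framework.startup import bootstrap_project

bootstrap_project(Path.cwd())  # run from the project root
with KedroSession.create() as session:
    catalog = session.load_context().catalog
    resolver = catalog.config_resolver  # the new CatalogConfigResolver

    print(resolver.list_patterns())                  # e.g. ["{name}_data"]
    print(resolver.match_pattern("companies_data"))  # e.g. "{name}_data"
    # Resolve the factory pattern into a concrete dataset config dict:
    print(resolver.resolve_dataset_pattern("companies_data"))
```
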
## Bug fixes and other changes
* Fixed bug where using dataset factories breaks with `ThreadRunner`.
* Fixed template projects example tests.
* Made credentials loading consistent between `KedroContext._get_catalog()` and `resolve_patterns` so that both use `_get_config_credentials()`.

## Breaking changes to the API
* Removed `ShelveStore` to address a security vulnerability.

## Documentation changes
* Fixed the logo on the PyPI page.
@@ -15,6 +22,8 @@
## Community contributions
* [Puneet](https://github.com/puneeter)
* [ethanknights](https://github.com/ethanknights)
* [Manezki](https://github.com/Manezki)
* [MigQ2](https://github.com/MigQ2)

# Release 0.19.8


This file was deleted.

4 changes: 4 additions & 0 deletions docs/source/conf.py
@@ -127,8 +127,10 @@
"typing.Type",
"typing.Set",
"kedro.config.config.ConfigLoader",
"kedro.io.catalog_config_resolver.CatalogConfigResolver",
"kedro.io.core.AbstractDataset",
"kedro.io.core.AbstractVersionedDataset",
"kedro.io.core.CatalogProtocol",
"kedro.io.core.DatasetError",
"kedro.io.core.Version",
"kedro.io.data_catalog.DataCatalog",
@@ -168,6 +170,8 @@
"D[k] if k in D, else d. d defaults to None.",
"None. Update D from mapping/iterable E and F.",
"Patterns",
"CatalogConfigResolver",
"CatalogProtocol",
),
"py:data": (
"typing.Any",
4 changes: 2 additions & 2 deletions docs/source/contribution/technical_steering_committee.md
@@ -61,10 +61,10 @@ We look for commitment markers who can do the following:
| [Huong Nguyen](https://github.com/Huongg) | [QuantumBlack, AI by McKinsey](https://www.mckinsey.com/capabilities/quantumblack) |
| [Ivan Danov](https://github.com/idanov) | [QuantumBlack, AI by McKinsey](https://www.mckinsey.com/capabilities/quantumblack) |
| [Jitendra Gundaniya](https://github.com/jitu5) | [QuantumBlack, AI by McKinsey](https://www.mckinsey.com/capabilities/quantumblack) |
| [Joel Schwarzmann](https://github.com/datajoely) | [QuantumBlack, AI by McKinsey](https://www.mckinsey.com/capabilities/quantumblack) |
| [Joel Schwarzmann](https://github.com/datajoely) | [Aneira Health](https://www.aneira.health) |
| [Juan Luis Cano](https://github.com/astrojuanlu) | [QuantumBlack, AI by McKinsey](https://www.mckinsey.com/capabilities/quantumblack) |
| [Laura Couto](https://github.com/lrcouto) | [QuantumBlack, AI by McKinsey](https://www.mckinsey.com/capabilities/quantumblack) |
| [Marcin Zabłocki](https://github.com/marrrcin) | [Printify, Inc.](https://printify.com/) |
| [Merel Theisen](https://github.com/merelcht) | [QuantumBlack, AI by McKinsey](https://www.mckinsey.com/capabilities/quantumblack) |
| [Nok Lam Chan](https://github.com/noklam) | [QuantumBlack, AI by McKinsey](https://www.mckinsey.com/capabilities/quantumblack) |
| [Rashida Kanchwala](https://github.com/rashidakanchwala) | [QuantumBlack, AI by McKinsey](https://www.mckinsey.com/capabilities/quantumblack) |
10 changes: 4 additions & 6 deletions docs/source/starters/new_project_tools.md
@@ -44,7 +44,7 @@ To skip this step in future use --tools
To find out more: https://docs.kedro.org/en/stable/starters/new_project_tools.html
Tools
1) Lint: Basic linting with Black and Ruff
1) Lint: Basic linting with Ruff
2) Test: Basic testing with pytest
3) Log: Additional, environment-specific logging options
4) Docs: A Sphinx documentation setup
@@ -65,8 +65,7 @@ A list of available tools can also be accessed by running `kedro new --help`
Tools
1) Linting: Provides a basic linting setup with Black and Ruff
1) Linting: Provides a basic linting setup with Ruff
2) Testing: Provides basic testing setup with pytest
@@ -165,7 +164,7 @@ The available tools include: [linting](#linting), [testing](#testing), [custom l

### Linting

The Kedro linting tool introduces [`black`](https://black.readthedocs.io/en/stable/index.html) and [`ruff`](https://docs.astral.sh/ruff/) as dependencies in your new project's requirements. After project creation, make sure these are installed by running the following command from the project root:
The Kedro linting tool introduces [`ruff`](https://docs.astral.sh/ruff/) as a dependency in your new project's requirements. After project creation, make sure it is installed by running the following command from the project root:

```bash
pip install -r requirements.txt
@@ -175,7 +174,6 @@ The linting tool will configure `ruff` with the following settings by default:
```toml
#pyproject.toml

[tool.ruff]
line-length = 88
show-fixes = true
select = [
@@ -187,7 +185,7 @@
"PL", # Pylint
"T201", # Print Statement
]
ignore = ["E501"] # Black takes care of line-too-long
ignore = ["E501"] # Ruff format takes care of line-too-long
```

With these installed, you can then make use of the following commands to format and lint your code:
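
With the settings above, Ruff covers both formatting and linting; a typical pair of commands (an illustrative assumption, since the exact commands are not shown here) is:

```bash
ruff format src/   # format code; replaces Black in the default setup
ruff check src/    # lint against the rules selected in pyproject.toml
```
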
85 changes: 30 additions & 55 deletions kedro/framework/cli/catalog.py
@@ -2,9 +2,8 @@

from __future__ import annotations

import copy
from collections import defaultdict
from itertools import chain
from itertools import chain, filterfalse
from typing import TYPE_CHECKING, Any

import click
@@ -28,6 +27,11 @@ def _create_session(package_name: str, **kwargs: Any) -> KedroSession:
    return KedroSession.create(**kwargs)


def is_parameter(dataset_name: str) -> bool:
    """Check if dataset is a parameter."""
    return dataset_name.startswith("params:") or dataset_name == "parameters"
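
# Illustrative behaviour of is_parameter (dataset names here are hypothetical):
#   is_parameter("params:model_options")  -> True
#   is_parameter("parameters")            -> True
#   is_parameter("companies")             -> False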


@click.group(name="Kedro")
def catalog_cli() -> None: # pragma: no cover
    pass
@@ -88,21 +92,15 @@ def list_datasets(metadata: ProjectMetadata, pipeline: str, env: str) -> None:

        # resolve any factory datasets in the pipeline
        factory_ds_by_type = defaultdict(list)

        for ds_name in default_ds:
            matched_pattern = data_catalog._match_pattern(
                data_catalog._dataset_patterns, ds_name
            ) or data_catalog._match_pattern(data_catalog._default_pattern, ds_name)
            if matched_pattern:
                ds_config_copy = copy.deepcopy(
                    data_catalog._dataset_patterns.get(matched_pattern)
                    or data_catalog._default_pattern.get(matched_pattern)
                    or {}
                )
                ds_config = data_catalog._resolve_config(
                    ds_name, matched_pattern, ds_config_copy
                )
                factory_ds_by_type[ds_config["type"]].append(ds_name)

        for ds_name in default_ds:
            if data_catalog.config_resolver.match_pattern(ds_name):
                ds_config = data_catalog.config_resolver.resolve_dataset_pattern(
                    ds_name
                )
                factory_ds_by_type[ds_config.get("type", "DefaultDataset")].append(
                    ds_name
                )

        default_ds = default_ds - set(chain.from_iterable(factory_ds_by_type.values()))

@@ -128,12 +126,10 @@ def _map_type_to_datasets(
datasets of the specific type as a value.
"""
    mapping = defaultdict(list)  # type: ignore[var-annotated]
    for dataset in datasets:
        is_param = dataset.startswith("params:") or dataset == "parameters"
        if not is_param:
            ds_type = datasets_meta[dataset].__class__.__name__
            if dataset not in mapping[ds_type]:
                mapping[ds_type].append(dataset)
    for dataset_name in filterfalse(is_parameter, datasets):
        ds_type = datasets_meta[dataset_name].__class__.__name__
        if dataset_name not in mapping[ds_type]:
            mapping[ds_type].append(dataset_name)
    return mapping
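
# Illustrative shape of the returned mapping (hypothetical dataset names):
#   {"CSVDataset": ["companies", "reviews"], "MemoryDataset": ["model_input"]}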


@@ -170,20 +166,12 @@ def create_catalog(metadata: ProjectMetadata, pipeline_name: str, env: str) -> N
f"'{pipeline_name}' pipeline not found! Existing pipelines: {existing_pipelines}"
)

    pipe_datasets = {
        ds_name
        for ds_name in pipeline.datasets()
        if not ds_name.startswith("params:") and ds_name != "parameters"
    }
    pipeline_datasets = set(filterfalse(is_parameter, pipeline.datasets()))

    catalog_datasets = {
        ds_name
        for ds_name in context.catalog._datasets.keys()
        if not ds_name.startswith("params:") and ds_name != "parameters"
    }
    catalog_datasets = set(filterfalse(is_parameter, context.catalog.list()))

    # Datasets that are missing in Data Catalog
    missing_ds = sorted(pipe_datasets - catalog_datasets)
    missing_ds = sorted(pipeline_datasets - catalog_datasets)
    if missing_ds:
        catalog_path = (
            context.project_path
@@ -221,12 +209,9 @@ def rank_catalog_factories(metadata: ProjectMetadata, env: str) -> None:
    session = _create_session(metadata.package_name, env=env)
    context = session.load_context()

    catalog_factories = {
        **context.catalog._dataset_patterns,
        **context.catalog._default_pattern,
    }
    catalog_factories = context.catalog.config_resolver.list_patterns()
    if catalog_factories:
        click.echo(yaml.dump(list(catalog_factories.keys())))
        click.echo(yaml.dump(catalog_factories))
    else:
        click.echo("There are no dataset factories in the catalog.")

@@ -242,43 +227,33 @@ def resolve_patterns(metadata: ProjectMetadata, env: str) -> None:
    context = session.load_context()

    catalog_config = context.config_loader["catalog"]
    credentials_config = context.config_loader.get("credentials", None)
    credentials_config = context._get_config_credentials()
    data_catalog = DataCatalog.from_config(
        catalog=catalog_config, credentials=credentials_config
    )

    explicit_datasets = {
        ds_name: ds_config
        for ds_name, ds_config in catalog_config.items()
        if not data_catalog._is_pattern(ds_name)
        if not data_catalog.config_resolver.is_pattern(ds_name)
    }

    target_pipelines = pipelines.keys()
    datasets = set()
    pipeline_datasets = set()

    for pipe in target_pipelines:
        pl_obj = pipelines.get(pipe)
        if pl_obj:
            datasets.update(pl_obj.datasets())
            pipeline_datasets.update(pl_obj.datasets())

    for ds_name in datasets:
        is_param = ds_name.startswith("params:") or ds_name == "parameters"
        if ds_name in explicit_datasets or is_param:
            continue

        matched_pattern = data_catalog._match_pattern(
            data_catalog._dataset_patterns, ds_name
        ) or data_catalog._match_pattern(data_catalog._default_pattern, ds_name)
        if matched_pattern:
            ds_config_copy = copy.deepcopy(
                data_catalog._dataset_patterns.get(matched_pattern)
                or data_catalog._default_pattern.get(matched_pattern)
                or {}
            )
            ds_config = data_catalog._resolve_config(
                ds_name, matched_pattern, ds_config_copy
            )

    for ds_name in pipeline_datasets:
        if ds_name in explicit_datasets or is_parameter(ds_name):
            continue

        ds_config = data_catalog.config_resolver.resolve_dataset_pattern(ds_name)

        # Exclude MemoryDatasets not set in the catalog explicitly
        if ds_config:
            explicit_datasets[ds_name] = ds_config

    secho(yaml.dump(explicit_datasets))
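
Both reworked commands can be exercised from a project root; a quick smoke test (assuming a standard Kedro project) might look like this:

```bash
kedro catalog rank      # print dataset factory patterns in matching order
kedro catalog resolve   # resolve factory patterns against pipeline datasets
```
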
20 changes: 10 additions & 10 deletions kedro/framework/context/context.py
@@ -14,7 +14,7 @@

from kedro.config import AbstractConfigLoader, MissingConfigException
from kedro.framework.project import settings
from kedro.io import DataCatalog # noqa: TCH001
from kedro.io import CatalogProtocol, DataCatalog # noqa: TCH001
from kedro.pipeline.transcoding import _transcode_split

if TYPE_CHECKING:
@@ -123,7 +123,7 @@ def _convert_paths_to_absolute_posix(
    return conf_dictionary


def _validate_transcoded_datasets(catalog: DataCatalog) -> None:
def _validate_transcoded_datasets(catalog: CatalogProtocol) -> None:
"""Validates transcoded datasets are correctly named
Args:
@@ -178,13 +178,13 @@ class KedroContext:
)

    @property
    def catalog(self) -> DataCatalog:
        """Read-only property referring to Kedro's ``DataCatalog`` for this context.
    def catalog(self) -> CatalogProtocol:
        """Read-only property referring to Kedro's catalog for this context.

        Returns:
            DataCatalog defined in `catalog.yml`.
            catalog defined in `catalog.yml`.

        Raises:
            KedroContextError: Incorrect ``DataCatalog`` registered for the project.
            KedroContextError: Incorrect catalog registered for the project.
        """
        return self._get_catalog()
@@ -213,13 +213,13 @@ def _get_catalog(
        self,
        save_version: str | None = None,
        load_versions: dict[str, str] | None = None,
    ) -> DataCatalog:
        """A hook for changing the creation of a DataCatalog instance.
    ) -> CatalogProtocol:
        """A hook for changing the creation of a catalog instance.

        Returns:
            DataCatalog defined in `catalog.yml`.
            catalog defined in `catalog.yml`.

        Raises:
            KedroContextError: Incorrect ``DataCatalog`` registered for the project.
            KedroContextError: Incorrect catalog registered for the project.
        """

        # '**/catalog*' reads modular pipeline configs
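
Since the context now exposes the catalog as `CatalogProtocol`, downstream code can accept any conforming implementation rather than the concrete `DataCatalog`. A minimal sketch (illustrative only; `describe_catalog` is a hypothetical helper):

```python
from kedro.io import CatalogProtocol, DataCatalog

def describe_catalog(catalog: CatalogProtocol) -> None:
    # Works for DataCatalog or any custom catalog satisfying the protocol.
    print(sorted(catalog.list()))

describe_catalog(DataCatalog())  # prints: []
```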
