Merge branch 'main' into feat/datasets/delay-connection
deepyaman authored Oct 13, 2023
2 parents 54f12b3 + b946eec commit 86c92af
Showing 60 changed files with 1,951 additions and 218 deletions.
9 changes: 9 additions & 0 deletions .pre-commit-config.yaml
@@ -15,6 +15,15 @@ repos:
- id: check-merge-conflict # Check for files that contain merge conflict strings.
- id: debug-statements # Check for debugger imports and py37+ `breakpoint()` calls in python source.

- repo: https://github.com/adamchainz/blacken-docs
rev: 1.16.0
hooks:
- id: blacken-docs
args:
- "--rst-literal-blocks"
additional_dependencies:
- black==22.12.0

- repo: local
hooks:
- id: ruff-kedro-datasets
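The new `blacken-docs` hook runs Black over Python examples embedded in docstrings and documentation, and the `--rst-literal-blocks` flag extends that to examples introduced with a bare `::`. Much of the docstring reformatting further down in this commit (double quotes, trailing commas, reflowed call arguments) matches the style this hook enforces. A minimal, hypothetical docstring showing the formatting it settles on (the class and file path below are made up for illustration):

```python
class ExampleDataset:
    """Hypothetical dataset used only to illustrate blacken-docs output.

    Example:

    .. code-block:: pycon

        >>> dataset = ExampleDataset(
        ...     filepath="data/01_raw/example.csv",
        ...     load_args={"sep": ","},
        ...     save_args={"index": False},
        ... )
        >>> data = dataset.load()
    """
```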
1 change: 1 addition & 0 deletions kedro-airflow/RELEASE.md
@@ -2,6 +2,7 @@
* Added support for Python 3.11
* Added the `--all` CLI argument to `kedro-airflow` to convert all registered pipelines at once.
* Simplified the output of the `kedro airflow create` command.
* Fixed compatibility of `kedro-airflow` with older versions of the config loaders (`kedro<=0.18.2`).

## Community contributions
Many thanks to the following Kedroids for contributing PRs to this release:
24 changes: 12 additions & 12 deletions kedro-airflow/features/steps/cli_steps.py
@@ -20,27 +20,27 @@ def init_airflow(context, home_dir):
def prepare_old_catalog(context):
config = {
"example_train_x": {
"type": "PickleLocalDataSet",
"type": "PickleLocalDataset",
"filepath": "data/02_intermediate/example_train_x.pkl",
},
"example_train_y": {
"type": "PickleLocalDataSet",
"type": "PickleLocalDataset",
"filepath": "data/02_intermediate/example_train_y.pkl",
},
"example_test_x": {
"type": "PickleLocalDataSet",
"type": "PickleLocalDataset",
"filepath": "data/02_intermediate/example_test_x.pkl",
},
"example_test_y": {
"type": "PickleLocalDataSet",
"type": "PickleLocalDataset",
"filepath": "data/02_intermediate/example_test_y.pkl",
},
"example_model": {
"type": "PickleLocalDataSet",
"type": "PickleLocalDataset",
"filepath": "data/02_intermediate/example_model.pkl",
},
"example_predictions": {
"type": "PickleLocalDataSet",
"type": "PickleLocalDataset",
"filepath": "data/02_intermediate/example_predictions.pkl",
},
}
@@ -53,27 +53,27 @@ def prepare_old_catalog(context):
def prepare_catalog(context):
config = {
"example_train_x": {
"type": "pickle.PickleDataSet",
"type": "pickle.PickleDataset",
"filepath": "data/02_intermediate/example_train_x.pkl",
},
"example_train_y": {
"type": "pickle.PickleDataSet",
"type": "pickle.PickleDataset",
"filepath": "data/02_intermediate/example_train_y.pkl",
},
"example_test_x": {
"type": "pickle.PickleDataSet",
"type": "pickle.PickleDataset",
"filepath": "data/02_intermediate/example_test_x.pkl",
},
"example_test_y": {
"type": "pickle.PickleDataSet",
"type": "pickle.PickleDataset",
"filepath": "data/02_intermediate/example_test_y.pkl",
},
"example_model": {
"type": "pickle.PickleDataSet",
"type": "pickle.PickleDataset",
"filepath": "data/02_intermediate/example_model.pkl",
},
"example_predictions": {
"type": "pickle.PickleDataSet",
"type": "pickle.PickleDataset",
"filepath": "data/02_intermediate/example_predictions.pkl",
},
}
13 changes: 9 additions & 4 deletions kedro-airflow/kedro_airflow/plugin.py
@@ -36,17 +36,22 @@ def airflow_commands():


def _load_config(context: KedroContext) -> dict[str, Any]:
# Backwards compatibility for ConfigLoader that does not support `config_patterns`
config_loader = context.config_loader
if not hasattr(config_loader, "config_patterns"):
return config_loader.get("airflow*", "airflow/**")

# Set the default pattern for `airflow` if not provided in `settings.py`
if "airflow" not in context.config_loader.config_patterns.keys():
context.config_loader.config_patterns.update( # pragma: no cover
if "airflow" not in config_loader.config_patterns.keys():
config_loader.config_patterns.update( # pragma: no cover
{"airflow": ["airflow*", "airflow/**"]}
)

assert "airflow" in context.config_loader.config_patterns.keys()
assert "airflow" in config_loader.config_patterns.keys()

# Load the config
try:
return context.config_loader["airflow"]
return config_loader["airflow"]
except MissingConfigException:
# File does not exist
return {}
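The fallback above covers `ConfigLoader` instances from `kedro<=0.18.2`, which predate `config_patterns`. On newer Kedro versions a project can declare the `airflow` pattern itself in `settings.py`, so the plugin does not have to patch `config_patterns` at runtime. A sketch, assuming a recent Kedro 0.18.x release with `OmegaConfigLoader` available (loader class and module path are illustrative, not part of this commit):

```python
# src/<package_name>/settings.py -- illustrative sketch only
from kedro.config import OmegaConfigLoader

CONFIG_LOADER_CLASS = OmegaConfigLoader
CONFIG_LOADER_ARGS = {
    # Lets `context.config_loader["airflow"]` pick up conf/*/airflow*.yml files
    # without kedro-airflow having to inject the pattern at runtime.
    "config_patterns": {"airflow": ["airflow*", "airflow/**"]},
}
```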
24 changes: 23 additions & 1 deletion kedro-airflow/tests/test_plugin.py
@@ -5,8 +5,11 @@

import pytest
import yaml
from kedro.config import ConfigLoader
from kedro.framework.context import KedroContext
from pluggy import PluginManager

from kedro_airflow.plugin import commands
from kedro_airflow.plugin import _load_config, commands


@pytest.mark.parametrize(
@@ -264,3 +267,22 @@ def test_create_airflow_all_and_pipeline(cli_runner, metadata):
"Error: Invalid value: The `--all` and `--pipeline` option are mutually exclusive."
in result.stdout
)


def test_config_loader_backwards_compatibility(cli_runner, metadata):
# Emulate ConfigLoader in kedro <= 0.18.2
conf_source = Path.cwd() / "conf"
config_loader = ConfigLoader(conf_source=conf_source)
del config_loader.config_patterns
context = KedroContext(
config_loader=config_loader,
hook_manager=PluginManager(project_name=metadata.project_name),
package_name=metadata.package_name,
project_path=metadata.project_path,
)

config = _load_config(context)
assert config == {
"default": {"owner": "again someone else"},
"ds": {"owner": "finally someone else"},
}
12 changes: 10 additions & 2 deletions kedro-datasets/RELEASE.md
@@ -1,7 +1,17 @@
# Upcoming Release
## Major features and improvements
* Moved `PartitionedDataSet` and `IncrementalDataSet` from the core Kedro repo to `kedro-datasets` and renamed them to `PartitionedDataset` and `IncrementalDataset`.

## Bug fixes and other changes
* Fixed an erroneous warning when using a cloud protocol file path with `SparkDataSet` on Databricks.
* Updated `PickleDataset` to explicitly mention `cloudpickle` support.

## Upcoming deprecations for Kedro-Datasets 2.0.0
## Community contributions
Many thanks to the following Kedroids for contributing PRs to this release:
* [PtrBld](https://github.com/PtrBld)
* [Alistair McKelvie](https://github.com/alamastor)
* [Felix Wittmann](https://github.com/hfwittmann)

# Release 1.7.1
## Bug fixes and other changes
@@ -10,8 +20,6 @@
## Upcoming deprecations for Kedro-Datasets 2.0.0
* Renamed dataset and error classes, in accordance with the [Kedro lexicon](https://github.com/kedro-org/kedro/wiki/Kedro-documentation-style-guide#kedro-lexicon). Dataset classes ending with "DataSet" are deprecated and will be removed in 2.0.0.

## Community contributions

# Release 1.7.0:
## Major features and improvements
* Added `polars.GenericDataSet`, a `GenericDataSet` backed by [polars](https://www.pola.rs/), a lightning fast dataframe package built entirely using Rust.
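The upcoming-release notes above call out `cloudpickle` support in `PickleDataset`. A minimal sketch of selecting that backend through the dataset's `backend` argument (the file path is illustrative, and `cloudpickle` must be installed separately):

```python
from kedro_datasets.pickle import PickleDataset

# cloudpickle can serialise objects the stdlib pickle rejects,
# e.g. lambdas and locally defined functions.
dataset = PickleDataset(
    filepath="data/06_models/model.pkl",
    backend="cloudpickle",
)

dataset.save(lambda x: x + 1)
loaded = dataset.load()
assert loaded(1) == 2
```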
2 changes: 2 additions & 0 deletions kedro-datasets/docs/source/kedro_datasets.rst
@@ -59,6 +59,8 @@ kedro_datasets
kedro_datasets.pandas.SQLTableDataset
kedro_datasets.pandas.XMLDataSet
kedro_datasets.pandas.XMLDataset
kedro_datasets.partitions.IncrementalDataset
kedro_datasets.partitions.PartitionedDataset
kedro_datasets.pickle.PickleDataSet
kedro_datasets.pickle.PickleDataset
kedro_datasets.pillow.ImageDataSet
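With `PartitionedDataset` and `IncrementalDataset` now shipped as `kedro_datasets.partitions` (see the release notes above), Python-API usage looks roughly like the following sketch; the folder layout, filename suffix, and underlying `CSVDataset` are assumptions for illustration:

```python
from kedro_datasets.pandas import CSVDataset
from kedro_datasets.partitions import PartitionedDataset

# One CSV file per partition underneath the given folder (local or fsspec path).
dataset = PartitionedDataset(
    path="data/01_raw/daily_sales",
    dataset=CSVDataset,
    filename_suffix=".csv",
)

partitions = dataset.load()  # {partition_id: callable that loads that partition}
for partition_id, load_partition in sorted(partitions.items()):
    df = load_partition()  # each partition is read only when its callable is invoked
```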
14 changes: 7 additions & 7 deletions kedro-datasets/kedro_datasets/api/api_dataset.py
@@ -37,7 +37,8 @@ class APIDataset(AbstractDataset[None, requests.Response]):
Example usage for the
`Python API <https://kedro.readthedocs.io/en/stable/data/\
advanced_data_catalog_usage.html>`_:
::
.. code-block:: pycon
>>> from kedro_datasets.api import APIDataset
>>>
@@ -51,23 +52,22 @@ class APIDataset(AbstractDataset[None, requests.Response]):
... "commodity_desc": "CORN",
... "statisticcat_des": "YIELD",
... "agg_level_desc": "STATE",
... "year": 2000
... "year": 2000,
... }
... },
... credentials=("username", "password")
... credentials=("username", "password"),
... )
>>> data = dataset.load()
``APIDataset`` can also be used to save output on a remote server using HTTP(S)
methods.
::
.. code-block:: pycon
>>> example_table = '{"col1":["val1", "val2"], "col2":["val3", "val4"]}'
>>>
>>> dataset = APIDataset(
... method = "POST",
... url = "url_of_remote_server",
... save_args = {"chunk_size":1}
... method="POST", url="url_of_remote_server", save_args={"chunk_size": 1}
... )
>>> dataset.save(example_table)
12 changes: 8 additions & 4 deletions kedro-datasets/kedro_datasets/biosequence/biosequence_dataset.py
@@ -18,7 +18,8 @@ class BioSequenceDataset(AbstractDataset[List, List]):
r"""``BioSequenceDataset`` loads and saves data to a sequence file.
Example:
::
.. code-block:: pycon
>>> from kedro_datasets.biosequence import BioSequenceDataset
>>> from io import StringIO
@@ -28,10 +29,13 @@ class BioSequenceDataset(AbstractDataset[List, List]):
>>> raw_data = []
>>> for record in SeqIO.parse(StringIO(data), "fasta"):
... raw_data.append(record)
...
>>>
>>> dataset = BioSequenceDataset(filepath="ls_orchid.fasta",
... load_args={"format": "fasta"},
... save_args={"format": "fasta"})
>>> dataset = BioSequenceDataset(
... filepath="ls_orchid.fasta",
... load_args={"format": "fasta"},
... save_args={"format": "fasta"},
... )
>>> dataset.save(raw_data)
>>> sequence_list = dataset.load()
>>>
Expand Down
14 changes: 7 additions & 7 deletions kedro-datasets/kedro_datasets/dask/parquet_dataset.py
@@ -37,25 +37,25 @@ class ParquetDataset(AbstractDataset[dd.DataFrame, dd.DataFrame]):
Example usage for the
`Python API <https://kedro.readthedocs.io/en/stable/data/\
advanced_data_catalog_usage.html>`_:
::
.. code-block:: pycon
>>> from kedro.extras.datasets.dask import ParquetDataset
>>> import pandas as pd
>>> import dask.dataframe as dd
>>>
>>> data = pd.DataFrame({'col1': [1, 2], 'col2': [4, 5],
... 'col3': [[5, 6], [7, 8]]})
>>> data = pd.DataFrame({"col1": [1, 2], "col2": [4, 5], "col3": [[5, 6], [7, 8]]})
>>> ddf = dd.from_pandas(data, npartitions=2)
>>>
>>> dataset = ParquetDataset(
... filepath="s3://bucket_name/path/to/folder",
... credentials={
... 'client_kwargs':{
... 'aws_access_key_id': 'YOUR_KEY',
... 'aws_secret_access_key': 'YOUR SECRET',
... "client_kwargs": {
... "aws_access_key_id": "YOUR_KEY",
... "aws_secret_access_key": "YOUR SECRET",
... }
... },
... save_args={"compression": "GZIP"}
... save_args={"compression": "GZIP"},
... )
>>> dataset.save(ddf)
>>> reloaded = dataset.load()
kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py
@@ -176,12 +176,13 @@ class ManagedTableDataset(AbstractVersionedDataset):
.. code-block:: python
from pyspark.sql import SparkSession
from pyspark.sql.types import (StructField, StringType,
IntegerType, StructType)
from pyspark.sql.types import StructField, StringType, IntegerType, StructType
from kedro_datasets.databricks import ManagedTableDataset
schema = StructType([StructField("name", StringType(), True),
StructField("age", IntegerType(), True)])
data = [('Alex', 31), ('Bob', 12), ('Clarke', 65), ('Dave', 29)]
schema = StructType(
[StructField("name", StringType(), True), StructField("age", IntegerType(), True)]
)
data = [("Alex", 31), ("Bob", 12), ("Clarke", 65), ("Dave", 29)]
spark_df = SparkSession.builder.getOrCreate().createDataFrame(data, schema)
dataset = ManagedTableDataset(table="names_and_ages")
dataset.save(spark_df)
3 changes: 2 additions & 1 deletion kedro-datasets/kedro_datasets/email/message_dataset.py
@@ -26,7 +26,8 @@ class EmailMessageDataset(AbstractVersionedDataset[Message, Message]):
Note that ``EmailMessageDataset`` doesn't handle sending email messages.
Example:
::
.. code-block:: pycon
>>> from email.message import EmailMessage
>>>
9 changes: 6 additions & 3 deletions kedro-datasets/kedro_datasets/geopandas/geojson_dataset.py
@@ -26,14 +26,17 @@ class GeoJSONDataset(
allowed geopandas (pandas) options for loading and saving GeoJSON files.
Example:
::
.. code-block:: pycon
>>> import geopandas as gpd
>>> from shapely.geometry import Point
>>> from kedro_datasets.geopandas import GeoJSONDataset
>>>
>>> data = gpd.GeoDataFrame({'col1': [1, 2], 'col2': [4, 5],
... 'col3': [5, 6]}, geometry=[Point(1,1), Point(2,4)])
>>> data = gpd.GeoDataFrame(
... {"col1": [1, 2], "col2": [4, 5], "col3": [5, 6]},
... geometry=[Point(1, 1), Point(2, 4)],
... )
>>> dataset = GeoJSONDataset(filepath="test.geojson", save_args=None)
>>> dataset.save(data)
>>> reloaded = dataset.load()
3 changes: 2 additions & 1 deletion kedro-datasets/kedro_datasets/holoviews/holoviews_writer.py
@@ -21,7 +21,8 @@ class HoloviewsWriter(AbstractVersionedDataset[HoloViews, NoReturn]):
filesystem (e.g. local, S3, GCS).
Example:
::
.. code-block:: pycon
>>> import holoviews as hv
>>> from kedro_datasets.holoviews import HoloviewsWriter
5 changes: 3 additions & 2 deletions kedro-datasets/kedro_datasets/json/json_dataset.py
@@ -34,11 +34,12 @@ class JSONDataset(AbstractVersionedDataset[Any, Any]):
Example usage for the
`Python API <https://kedro.readthedocs.io/en/stable/data/\
advanced_data_catalog_usage.html>`_:
::
.. code-block:: pycon
>>> from kedro_datasets.json import JSONDataset
>>>
>>> data = {'col1': [1, 2], 'col2': [4, 5], 'col3': [5, 6]}
>>> data = {"col1": [1, 2], "col2": [4, 5], "col3": [5, 6]}
>>>
>>> dataset = JSONDataset(filepath="test.json")
>>> dataset.save(data)