From 0192170563565922899395297ddbe9eb62992139 Mon Sep 17 00:00:00 2001 From: Merel Theisen <49397448+merelcht@users.noreply.github.com> Date: Mon, 2 Oct 2023 13:40:18 +0100 Subject: [PATCH 01/15] style: Introduce `ruff` for linting in all plugins. (#354) Signed-off-by: Merel Theisen --- .github/workflows/kedro-airflow.yml | 2 +- .github/workflows/kedro-docker.yml | 2 +- .github/workflows/kedro-telemetry.yml | 2 +- .pre-commit-config.yaml | 172 +------ Makefile | 2 +- kedro-airflow/.pylintrc | 425 ------------------ kedro-airflow/CONTRIBUTING.md | 8 +- kedro-airflow/features/environment.py | 2 - kedro-airflow/features/steps/cli_steps.py | 1 + kedro-airflow/features/steps/sh_run.py | 5 +- kedro-airflow/kedro_airflow/plugin.py | 6 +- kedro-airflow/pyproject.toml | 22 +- kedro-airflow/tests/conftest.py | 4 +- kedro-airflow/tests/test_plugin.py | 5 +- kedro-datasets/docs/source/conf.py | 6 +- .../kedro_datasets/api/api_dataset.py | 3 +- .../biosequence/biosequence_dataset.py | 3 +- .../kedro_datasets/dask/parquet_dataset.py | 3 +- .../databricks/managed_table_dataset.py | 9 +- .../kedro_datasets/email/message_dataset.py | 3 +- .../geopandas/geojson_dataset.py | 3 +- .../holoviews/holoviews_writer.py | 3 +- .../kedro_datasets/json/json_dataset.py | 3 +- .../matplotlib/matplotlib_writer.py | 3 +- .../kedro_datasets/networkx/gml_dataset.py | 3 +- .../networkx/graphml_dataset.py | 3 +- .../kedro_datasets/networkx/json_dataset.py | 3 +- .../kedro_datasets/pandas/csv_dataset.py | 5 +- .../pandas/deltatable_dataset.py | 4 +- .../kedro_datasets/pandas/excel_dataset.py | 6 +- .../kedro_datasets/pandas/feather_dataset.py | 3 +- .../kedro_datasets/pandas/gbq_dataset.py | 8 +- .../kedro_datasets/pandas/generic_dataset.py | 3 +- .../kedro_datasets/pandas/hdf_dataset.py | 4 +- .../kedro_datasets/pandas/json_dataset.py | 3 +- .../kedro_datasets/pandas/parquet_dataset.py | 3 +- .../kedro_datasets/pandas/sql_dataset.py | 7 +- .../kedro_datasets/pandas/xml_dataset.py | 3 +- .../kedro_datasets/pickle/pickle_dataset.py | 3 +- .../kedro_datasets/pillow/image_dataset.py | 3 +- .../kedro_datasets/plotly/json_dataset.py | 3 +- .../kedro_datasets/plotly/plotly_dataset.py | 3 +- .../kedro_datasets/polars/csv_dataset.py | 3 +- .../kedro_datasets/polars/generic_dataset.py | 6 +- .../kedro_datasets/redis/redis_dataset.py | 3 +- .../snowflake/snowpark_dataset.py | 2 +- .../kedro_datasets/spark/spark_dataset.py | 10 +- .../spark/spark_hive_dataset.py | 5 +- .../spark/spark_jdbc_dataset.py | 3 +- .../svmlight/svmlight_dataset.py | 3 +- .../tensorflow/tensorflow_model_dataset.py | 7 +- .../kedro_datasets/text/text_dataset.py | 3 +- .../kedro_datasets/video/video_dataset.py | 3 +- .../kedro_datasets/yaml/yaml_dataset.py | 3 +- kedro-datasets/pyproject.toml | 58 +-- kedro-datasets/setup.py | 4 +- kedro-datasets/tests/api/test_api_dataset.py | 9 +- kedro-datasets/tests/databricks/conftest.py | 2 +- .../databricks/test_managed_table_dataset.py | 1 - .../matplotlib/test_matplotlib_writer.py | 6 +- .../tests/pandas/test_gbq_dataset.py | 10 +- .../tests/pandas/test_generic_dataset.py | 1 - .../tests/pandas/test_hdf_dataset.py | 2 +- .../tests/pandas/test_sql_dataset.py | 1 - .../tests/spark/test_spark_dataset.py | 46 +- .../tests/spark/test_spark_hive_dataset.py | 6 +- .../test_tensorflow_model_dataset.py | 4 +- kedro-docker/.pylintrc | 425 ------------------ kedro-docker/CONTRIBUTING.md | 8 +- kedro-docker/features/environment.py | 1 - kedro-docker/features/steps/sh_run.py | 7 +- kedro-docker/features/steps/util.py | 2 +- 
kedro-docker/kedro_docker/helpers.py | 3 +- kedro-docker/kedro_docker/plugin.py | 24 +- kedro-docker/pyproject.toml | 30 +- kedro-telemetry/kedro_telemetry/masking.py | 9 +- kedro-telemetry/kedro_telemetry/plugin.py | 12 +- kedro-telemetry/pyproject.toml | 29 +- kedro-telemetry/tests/test_masking.py | 2 - kedro-telemetry/tests/test_plugin.py | 4 +- 80 files changed, 233 insertions(+), 1291 deletions(-) delete mode 100644 kedro-airflow/.pylintrc delete mode 100644 kedro-docker/.pylintrc diff --git a/.github/workflows/kedro-airflow.yml b/.github/workflows/kedro-airflow.yml index 20d1c14bb..6926215ee 100644 --- a/.github/workflows/kedro-airflow.yml +++ b/.github/workflows/kedro-airflow.yml @@ -34,7 +34,7 @@ jobs: with: plugin: kedro-airflow os: ubuntu-latest - python-version: "3.8" + python-version: "3.11" e2e-tests: strategy: diff --git a/.github/workflows/kedro-docker.yml b/.github/workflows/kedro-docker.yml index 3ffec91a3..3fcae7b36 100644 --- a/.github/workflows/kedro-docker.yml +++ b/.github/workflows/kedro-docker.yml @@ -34,7 +34,7 @@ jobs: with: plugin: kedro-docker os: ubuntu-latest - python-version: "3.8" + python-version: "3.11" e2e-tests: strategy: diff --git a/.github/workflows/kedro-telemetry.yml b/.github/workflows/kedro-telemetry.yml index 034965230..00e9b69ee 100644 --- a/.github/workflows/kedro-telemetry.yml +++ b/.github/workflows/kedro-telemetry.yml @@ -34,7 +34,7 @@ jobs: with: plugin: kedro-telemetry os: ubuntu-latest - python-version: "3.8" + python-version: "3.11" e2e-tests: strategy: diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index f45c6c8e4..e8804f2cb 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -5,7 +5,7 @@ default_stages: [commit, manual] repos: - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v2.4.0 + rev: v3.4.0 hooks: - id: trailing-whitespace - id: end-of-file-fixer @@ -14,178 +14,44 @@ repos: - id: check-case-conflict # Check for files that would conflict in case-insensitive filesystems - id: check-merge-conflict # Check for files that contain merge conflict strings. - id: debug-statements # Check for debugger imports and py37+ `breakpoint()` calls in python source. 
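
For orientation (a usage sketch, not taken from the patch itself): the hunks below swap the shared `flake8` hook and the per-plugin `pylint`/`isort` hooks for per-plugin Ruff hooks, and the `[tool.ruff]` tables added later in this patch select roughly equivalent rule families (`F` ≈ Pyflakes/flake8, `I` ≈ isort, `E`/`W` ≈ pycodestyle, `PL` ≈ Pylint — e.g. `PLR0913` stands in for `too-many-arguments` and is suppressed inline with `# noqa: PLR0913`). A minimal way to exercise the new setup locally, assuming `pre-commit` and a Ruff release compatible with `~=0.0.290` are installed; the commands mirror the hook entries and the Makefile target introduced in this patch:

```bash
# Run one plugin's Ruff hook through pre-commit (manual stage, as the new Makefile lint target does):
pre-commit run -a --hook-stage manual ruff-kedro-datasets

# Or call Ruff directly, mirroring the hook's `entry`:
ruff kedro-datasets --fix --exit-non-zero-on-fix

# Full lint pass for a single plugin, as documented in each plugin's CONTRIBUTING.md:
make plugin=kedro-datasets lint
```

The per-plugin hook split keeps the existing `make plugin=<name> lint` workflow intact; only the tools it invokes change.
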
- - id: flake8 - files: ^(kedro-datasets/kedro_datasets/|kedro-airflow/kedro_airflow/|kedro-docker/kedro_docker/|kedro-telemetry/kedro_telemetry/) - args: - - "--max-line-length=88" - - "--max-complexity=18" - - "--select=B,C,E,F,W,T4,B9" - - "--ignore=E203,E266,E501,W503" - exclude: "^kedro_airflow/dag_template.py|^template.py" - repo: local hooks: - # pylint quick checks - - id: pylint-quick-kedro-datasets - name: "Quick PyLint on kedro_datasets/*" + - id: ruff-kedro-datasets + name: "Ruff on kedro_datasets/*" language: system - types: [file, python] files: ^kedro-datasets/kedro_datasets/ exclude: ^(?!kedro-datasets/kedro_datasets/).*\.py$ - entry: pylint --rcfile kedro-datasets/pyproject.toml --disable=unnecessary-pass - stages: [commit] - - - id: pylint-quick-kedro-airflow - name: "Quick PyLint on kedro_airflow/*" - language: system - types: [file, python] - files: ^kedro-airflow/kedro_airflow/ - exclude: ^(?!kedro-airflow/kedro_airflow/).*\.py$ - entry: pylint --disable=unnecessary-pass - stages: [commit] - - - id: pylint-quick-kedro-docker - name: "Quick PyLint on kedro_docker/*" - language: system - types: [file, python] - files: ^kedro-docker/kedro_docker/ - exclude: ^(?!kedro-docker/kedro_docker/).*\.py$ - entry: pylint --disable=unnecessary-pass - stages: [commit] - - - id: pylint-quick-kedro-telemetry - name: "Quick PyLint on kedro_telemetry/*" - language: system - types: [file, python] - files: ^kedro-telemetry/kedro_telemetry/ - exclude: ^(?!kedro-telemetry/kedro_telemetry/).*\.py$ - entry: pylint --disable=unnecessary-pass - stages: [commit] - - # pylint full checks - - id: pylint-kedro-datasets - name: "PyLint on kedro_datasets/*" - language: system - files: ^kedro-datasets/kedro_datasets/.*\.py$ - exclude: ^(?!kedro-datasets/kedro_datasets/).*\.py$ pass_filenames: false - stages: [manual] - entry: pylint --rcfile kedro-datasets/pyproject.toml --disable=unnecessary-pass,E0401 kedro-datasets/kedro_datasets - - - id: pylint-kedro-datasets-features - name: "PyLint on kedro-datasets features/*" - language: system - files: ^kedro-datasets/features/.*\.py$ - exclude: ^(?!kedro-datasets/features/).*\.py$ - pass_filenames: false - stages: [manual] - entry: pylint --rcfile kedro-datasets/pyproject.toml --disable=missing-docstring,no-name-in-module,E0401 kedro-datasets/features - - - id: pylint-kedro-datasets-tests - name: "PyLint on kedro-datasets tests/*" - language: system - files: ^kedro-datasets/tests/.*\.py$ - exclude: ^(?!kedro-datasets/tests/).*\.py$ - pass_filenames: false - stages: [manual] - entry: pylint --rcfile kedro-datasets/pyproject.toml --disable=missing-docstring,redefined-outer-name,no-self-use,invalid-name,protected-access,too-many-arguments,E0401 kedro-datasets/tests + stages: [ manual ] + entry: ruff kedro-datasets --fix --exit-non-zero-on-fix - - id: pylint-kedro-airflow - name: "PyLint on kedro_airflow/*" + - id: ruff-kedro-airflow + name: "Ruff on kedro_airflow/*" language: system - files: ^kedro-airflow/kedro_airflow/.*\.py$ + files: ^kedro-airflow/kedro_airflow/ exclude: ^(?!kedro-airflow/kedro_airflow/).*\.py$ pass_filenames: false - stages: [manual] - entry: pylint --disable=unnecessary-pass,E0401 kedro-airflow/kedro_airflow - - - id: pylint-kedro-airflow-features - name: "PyLint on kedro-airflow features/*" - language: system - pass_filenames: false - stages: [manual] - entry: pylint --disable=missing-docstring,no-name-in-module kedro-airflow/features - - - id: pylint-kedro-airflow-tests - name: "PyLint on kedro-airflow tests/*" - language: system - 
pass_filenames: false - stages: [manual] - entry: pylint --disable=missing-docstring,redefined-outer-name,no-self-use,invalid-name,protected-access,too-many-arguments kedro-airflow/tests + stages: [ manual ] + entry: ruff kedro-airflow --fix --exit-non-zero-on-fix - - id: pylint-kedro-docker - name: "PyLint on kedro_docker/*" + - id: ruff-kedro-docker + name: "Ruff on kedro_docker/*" language: system - files: ^kedro-docker/kedro_docker/.*\.py$ + files: ^kedro-docker/kedro_docker/ exclude: ^(?!kedro-docker/kedro_docker/).*\.py$ pass_filenames: false - stages: [manual] - entry: pylint --disable=unnecessary-pass,E0401 kedro-docker/kedro_docker - - - id: pylint-kedro-docker-features - name: "PyLint on kedro-docker features/*" - language: system - pass_filenames: false - stages: [manual] - entry: pylint --disable=missing-docstring,no-name-in-module kedro-docker/features - - - id: pylint-kedro-docker-tests - name: "PyLint on kedro-docker tests/*" - language: system - pass_filenames: false - stages: [manual] - entry: pylint --disable=missing-docstring,redefined-outer-name,invalid-name,protected-access,too-many-arguments kedro-docker/tests - - - id: pylint-kedro-telemetry - name: "PyLint on kedro_telemetry/*" - language: system - files: ^kedro-telemetry/kedro_telemetry/.*\.py$ - exclude: ^(?!kedro-telemetry/kedro_telemetry/).*\.py$ - pass_filenames: false - stages: [manual] - entry: pylint --disable=unnecessary-pass,E0401 kedro-telemetry/kedro_telemetry - - - id: pylint-kedro-telemetry-features - name: "PyLint on kedro-docker features/*" - language: system stages: [ manual ] - entry: echo 'Not needed to run for this directory' - files: .* + entry: ruff kedro-docker --fix --exit-non-zero-on-fix - - id: pylint-kedro-telemetry-tests - name: "PyLint on kedro-telemetry tests/*" + - id: ruff-kedro-telemetry + name: "Ruff on kedro_telemetry/*" language: system + files: ^kedro-telemetry/kedro_telemetry/ + exclude: ^(?!kedro-telemetry/kedro_telemetry/).*\.py$ pass_filenames: false stages: [manual] - entry: pylint --disable=missing-docstring,redefined-outer-name,no-self-use,invalid-name,protected-access,too-many-arguments kedro-telemetry/tests - - - id: isort-kedro-datasets - name: "Sort imports" - language: system - types: [ file, python ] - files: ^kedro-datasets/ - entry: isort - - - id: isort-kedro-docker - name: "Sort imports" - language: system - types: [ file, python ] - files: ^kedro-docker/ - entry: isort - - - id: isort-kedro-airflow - name: "Sort imports" - language: system - types: [ file, python ] - files: ^kedro-airflow/ - entry: isort - - - id: isort-kedro-telemetry - name: "Sort imports" - language: system - types: [ file, python ] - files: ^kedro-telemetry/ - entry: isort + entry: ruff kedro-telemetry --fix --exit-non-zero-on-fix - id: black-kedro-datasets name: "Black" diff --git a/Makefile b/Makefile index 1c6c7e478..03e74bec0 100644 --- a/Makefile +++ b/Makefile @@ -14,7 +14,7 @@ install-pip-setuptools: python -m pip install -U pip setuptools wheel lint: - pre-commit run trailing-whitespace --all-files && pre-commit run end-of-file-fixer --all-files && pre-commit run check-yaml --all-files && pre-commit run check-added-large-files --all-files && pre-commit run check-case-conflict --all-files && pre-commit run check-merge-conflict --all-files && pre-commit run debug-statements --all-files && pre-commit run flake8 --all-files && pre-commit run isort-$(plugin) --all-files --hook-stage manual && pre-commit run black-$(plugin) --all-files --hook-stage manual && pre-commit run secret_scan 
--all-files --hook-stage manual && pre-commit run bandit --all-files --hook-stage manual && pre-commit run pylint-$(plugin) --all-files --hook-stage manual && pre-commit run pylint-$(plugin)-features --all-files --hook-stage manual && pre-commit run pylint-$(plugin)-tests --all-files --hook-stage manual + pre-commit run -a --hook-stage manual ruff-$(plugin) && pre-commit run trailing-whitespace --all-files && pre-commit run end-of-file-fixer --all-files && pre-commit run check-yaml --all-files && pre-commit run check-added-large-files --all-files && pre-commit run check-case-conflict --all-files && pre-commit run check-merge-conflict --all-files && pre-commit run debug-statements --all-files && pre-commit run black-$(plugin) --all-files --hook-stage manual && pre-commit run secret_scan --all-files --hook-stage manual && pre-commit run bandit --all-files --hook-stage manual test: cd $(plugin) && pytest tests --cov-config pyproject.toml --numprocesses 4 --dist loadfile diff --git a/kedro-airflow/.pylintrc b/kedro-airflow/.pylintrc deleted file mode 100644 index 6a2acae02..000000000 --- a/kedro-airflow/.pylintrc +++ /dev/null @@ -1,425 +0,0 @@ -[MASTER] - -# A comma-separated list of package or module names from where C extensions may -# be loaded. Extensions are loading into the active Python interpreter and may -# run arbitrary code -extension-pkg-whitelist= - -# Add files or directories to the blacklist. They should be base names, not -# paths. -ignore=CVS - -# Add files or directories matching the regex patterns to the blacklist. The -# regex matches against base names, not paths. -ignore-patterns=.*template\.py - -# Python code to execute, usually for sys.path manipulation such as -# pygtk.require(). -#init-hook= - -# Use multiple processes to speed up Pylint. -jobs=1 - -# List of plugins (as comma separated values of python modules names) to load, -# usually to register additional checkers. -load-plugins=pylint.extensions.docparams - -# Pickle collected data for later comparisons. -persistent=yes - -# Specify a configuration file. -#rcfile= - -# Allow loading of arbitrary C extensions. Extensions are imported into the -# active Python interpreter and may run arbitrary code. -unsafe-load-any-extension=no - - -[MESSAGES CONTROL] - -# Only show warnings with the listed confidence levels. Leave empty to show -# all. Valid levels: HIGH, INFERENCE, INFERENCE_FAILURE, UNDEFINED -confidence= - -# Disable the message, report, category or checker with the given id(s). You -# can either give multiple identifiers separated by comma (,) or put this -# option multiple times (only on the command line, not in the configuration -# file where it should appear only once).You can also use "--disable=all" to -# disable everything first and then reenable specific checks. For example, if -# you want to run only the similarities checker, you can use "--disable=all -# --enable=similarities". If you want to run only the classes checker, but have -# no Warning level messages displayed, use"--disable=all --enable=classes -# --disable=W" -disable=ungrouped-imports,bad-continuation - -# Enable the message, report, category or checker with the given id(s). You can -# either give multiple identifier separated by comma (,) or put this option -# multiple time (only on the command line, not in the configuration file where -# it should appear only once). See also the "--disable" option for examples. 
-enable=useless-suppression - - -[REPORTS] - -# Python expression which should return a note less than 10 (10 is the highest -# note). You have access to the variables errors warning, statement which -# respectively contain the number of errors / warnings messages and the total -# number of statements analyzed. This is used by the global evaluation report -# (RP0004). -evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10) - -# Template used to display messages. This is a python new-style format string -# used to format the message information. See doc for all details -#msg-template= - -# Set the output format. Available formats are text, parseable, colorized, json -# and msvs (visual studio).You can also give a reporter class, eg -# mypackage.mymodule.MyReporterClass. -output-format=text - -# Tells whether to display a full report or only the messages -reports=no - -# Activate the evaluation score. -score=yes - - -[REFACTORING] - -# Maximum number of nested blocks for function / method body -max-nested-blocks=5 - - -[BASIC] - -# Naming hint for argument names -argument-name-hint=(([a-z][a-z0-9_]{2,30})|(_[a-z0-9_]*))$ - -# Regular expression matching correct argument names -argument-rgx=(([a-z][a-z0-9_]{2,30})|(_[a-z0-9_]*))$ - -# Naming hint for attribute names -attr-name-hint=(([a-z][a-z0-9_]{2,30})|(_[a-z0-9_]*))$ - -# Regular expression matching correct attribute names -attr-rgx=(([a-z][a-z0-9_]{2,30})|(_[a-z0-9_]*))$ - -# Bad variable names which should always be refused, separated by a comma -bad-names=foo,bar,baz,toto,tutu,tata - -# Naming hint for class attribute names -class-attribute-name-hint=([A-Za-z_][A-Za-z0-9_]{2,30}|(__.*__))$ - -# Regular expression matching correct class attribute names -class-attribute-rgx=([A-Za-z_][A-Za-z0-9_]{2,30}|(__.*__))$ - -# Naming hint for class names -class-name-hint=[A-Z_][a-zA-Z0-9]+$ - -# Regular expression matching correct class names -class-rgx=[A-Z_][a-zA-Z0-9]+$ - -# Naming hint for constant names -const-name-hint=(([A-Z_][A-Z0-9_]*)|(__.*__))$ - -# Regular expression matching correct constant names -const-rgx=(([A-Z_][A-Z0-9_]*)|(__.*__))$ - -# Minimum line length for functions/classes that require docstrings, shorter -# ones are exempt. -docstring-min-length=-1 - -# Naming hint for function names -function-name-hint=(([a-z][a-z0-9_]{2,30})|(_[a-z0-9_]*))$ - -# Regular expression matching correct function names -function-rgx=(([a-z][a-z0-9_]{2,30})|(_[a-z0-9_]*))$ - -# Good variable names which should always be accepted, separated by a comma -good-names=i,j,k,ex,Run,_ - -# Include a hint for the correct naming format with invalid-name -include-naming-hint=no - -# Naming hint for inline iteration names -inlinevar-name-hint=[A-Za-z_][A-Za-z0-9_]*$ - -# Regular expression matching correct inline iteration names -inlinevar-rgx=[A-Za-z_][A-Za-z0-9_]*$ - -# Naming hint for method names -method-name-hint=(([a-z][a-z0-9_]{2,30})|(_[a-z0-9_]*))$ - -# Regular expression matching correct method names -method-rgx=(([a-z][a-z0-9_]{2,30})|(_[a-z0-9_]*))$ - -# Naming hint for module names -module-name-hint=(([a-z_][a-z0-9_]*)|([A-Z][a-zA-Z0-9]+))$ - -# Regular expression matching correct module names -module-rgx=(([a-z_][a-z0-9_]*)|([A-Z][a-zA-Z0-9]+))$ - -# Colon-delimited sets of names that determine each other's naming style when -# the name regexes allow several styles. -name-group= - -# Regular expression which should only match function or class names that do -# not require a docstring. 
-no-docstring-rgx=^_ - -# List of decorators that produce properties, such as abc.abstractproperty. Add -# to this list to register other decorators that produce valid properties. -property-classes=abc.abstractproperty - -# Naming hint for variable names -variable-name-hint=(([a-z][a-z0-9_]{2,30})|(_[a-z0-9_]*))$ - -# Regular expression matching correct variable names -variable-rgx=(([a-z][a-z0-9_]{2,30})|(_[a-z0-9_]*))$ - - -[FORMAT] - -# Expected format of line ending, e.g. empty (any line ending), LF or CRLF. -expected-line-ending-format= - -# Regexp for a line that is allowed to be longer than the limit. -ignore-long-lines=^\s*(# )??$ - -# Number of spaces of indent required inside a hanging or continued line. -indent-after-paren=4 - -# String used as indentation unit. This is usually " " (4 spaces) or "\t" (1 -# tab). -indent-string=' ' - -# Maximum number of characters on a single line. -max-line-length=100 - -# Maximum number of lines in a module -max-module-lines=1000 - -# List of optional constructs for which whitespace checking is disabled. `dict- -# separator` is used to allow tabulation in dicts, etc.: {1 : 1,\n222: 2}. -# `trailing-comma` allows a space between comma and closing bracket: (a, ). -# `empty-line` allows space-only lines. -no-space-check=trailing-comma,dict-separator - -# Allow the body of a class to be on the same line as the declaration if body -# contains single statement. -single-line-class-stmt=no - -# Allow the body of an if to be on the same line as the test if there is no -# else. -single-line-if-stmt=no - - -[LOGGING] - -# Logging modules to check that the string format arguments are in logging -# function parameter format -logging-modules=logging - - -[MISCELLANEOUS] - -# List of note tags to take in consideration, separated by a comma. -notes=FIXME,XXX,TODO - - -[SIMILARITIES] - -# Ignore comments when computing similarities. -ignore-comments=yes - -# Ignore docstrings when computing similarities. -ignore-docstrings=yes - -# Ignore imports when computing similarities. -ignore-imports=no - -# Minimum lines number of a similarity. -min-similarity-lines=20 - - -[SPELLING] - -# Spelling dictionary name. Available dictionaries: none. To make it working -# install python-enchant package. -spelling-dict= - -# List of comma separated words that should not be checked. -spelling-ignore-words= - -# A path to a file that contains private dictionary; one word per line. -spelling-private-dict-file= - -# Tells whether to store unknown words to indicated private dictionary in -# --spelling-private-dict-file option instead of raising a message. -spelling-store-unknown-words=no - - -[TYPECHECK] - -# List of decorators that produce context managers, such as -# contextlib.contextmanager. Add to this list to register other decorators that -# produce valid context managers. -contextmanager-decorators=contextlib.contextmanager - -# List of members which are set dynamically and missed by pylint inference -# system, and so shouldn't trigger E1101 when accessed. Python regular -# expressions are accepted. -generated-members= - -# Tells whether missing members accessed in mixin class should be ignored. A -# mixin class is detected if its name ends with "mixin" (case insensitive). -ignore-mixin-members=yes - -# This flag controls whether pylint should warn about no-member and similar -# checks whenever an opaque object is returned when inferring. 
The inference -# can return multiple potential results while evaluating a Python object, but -# some branches might not be evaluated, which results in partial inference. In -# that case, it might be useful to still emit no-member and other checks for -# the rest of the inferred objects. -ignore-on-opaque-inference=yes - -# List of class names for which member attributes should not be checked (useful -# for classes with dynamically set attributes). This supports the use of -# qualified names. -ignored-classes=optparse.Values,thread._local,_thread._local - -# List of module names for which member attributes should not be checked -# (useful for modules/projects where namespaces are manipulated during runtime -# and thus existing member attributes cannot be deduced by static analysis. It -# supports qualified module names, as well as Unix pattern matching. -ignored-modules= - -# Show a hint with possible names when a member name was not found. The aspect -# of finding the hint is based on edit distance. -missing-member-hint=yes - -# The minimum edit distance a name should have in order to be considered a -# similar match for a missing member name. -missing-member-hint-distance=1 - -# The total number of similar names that should be taken in consideration when -# showing a hint for a missing member. -missing-member-max-choices=1 - - -[VARIABLES] - -# List of additional names supposed to be defined in builtins. Remember that -# you should avoid to define new builtins when possible. -additional-builtins= - -# Tells whether unused global variables should be treated as a violation. -allow-global-unused-variables=yes - -# List of strings which can identify a callback function by name. A callback -# name must start or end with one of those strings. -callbacks=cb_,_cb - -# A regular expression matching the name of dummy variables (i.e. expectedly -# not used). -dummy-variables-rgx=_+$|(_[a-zA-Z0-9_]*[a-zA-Z0-9]+?$)|dummy|^ignored_|^unused_ - -# Argument names that match this expression will be ignored. Default to name -# with leading underscore -ignored-argument-names=_.*|^ignored_|^unused_ - -# Tells whether we should check for unused import in __init__ files. -init-import=no - -# List of qualified module names which can have objects that can redefine -# builtins. -redefining-builtins-modules=six.moves,future.builtins - - -[CLASSES] - -# List of method names used to declare (i.e. assign) instance attributes. -defining-attr-methods=__init__,__new__,setUp - -# List of member names, which should be excluded from the protected access -# warning. -exclude-protected=_asdict,_fields,_replace,_source,_make - -# List of valid names for the first argument in a class method. -valid-classmethod-first-arg=cls - -# List of valid names for the first argument in a metaclass class method. -valid-metaclass-classmethod-first-arg=mcs - - -[DESIGN] - -# Maximum number of arguments for function / method -max-args=5 - -# Maximum number of attributes for a class (see R0902). -max-attributes=7 - -# Maximum number of boolean expressions in a if statement -max-bool-expr=5 - -# Maximum number of branch for function / method body -max-branches=12 - -# Maximum number of locals for function / method body -max-locals=15 - -# Maximum number of parents for a class (see R0901). -max-parents=7 - -# Maximum number of public methods for a class (see R0904). 
-max-public-methods=20 - -# Maximum number of return / yield for function / method body -max-returns=6 - -# Maximum number of statements in function / method body -max-statements=50 - -# Minimum number of public methods for a class (see R0903). -min-public-methods=1 - - -[IMPORTS] - -# Allow wildcard imports from modules that define __all__. -allow-wildcard-with-all=no - -# Analyse import fallback blocks. This can be used to support both Python 2 and -# 3 compatible code, which means that the block might have code that exists -# only in one or another interpreter, leading to false positives when analysed. -analyse-fallback-blocks=no - -# Deprecated modules which should not be used, separated by a comma -deprecated-modules=optparse,tkinter.tix - -# Create a graph of external dependencies in the given file (report RP0402 must -# not be disabled) -ext-import-graph= - -# Create a graph of every (i.e. internal and external) dependencies in the -# given file (report RP0402 must not be disabled) -import-graph= - -# Create a graph of internal dependencies in the given file (report RP0402 must -# not be disabled) -int-import-graph= - -# Force import order to recognize a module as part of the standard -# compatibility libraries. -known-standard-library= - -# Force import order to recognize a module as part of a third party library. -known-third-party=enchant - - -[EXCEPTIONS] - -# Exceptions that will emit a warning when being caught. Defaults to -# "Exception" -overgeneral-exceptions=Exception diff --git a/kedro-airflow/CONTRIBUTING.md b/kedro-airflow/CONTRIBUTING.md index 2d3e4c020..0d081ed7f 100644 --- a/kedro-airflow/CONTRIBUTING.md +++ b/kedro-airflow/CONTRIBUTING.md @@ -84,20 +84,20 @@ pip install ".[test]" All checks run by our CI / CD pipeline can be run locally on your computer. -#### PEP-8 Standards (`isort`, `pylint` and `flake8`) +#### Linting (`ruff` and `black`) ```bash -make lint +make plugin=kedro-airflow lint ``` #### Unit tests, 100% coverage (`pytest`, `pytest-cov`) ```bash -make test +make plugin=kedro-airflow test ``` #### End-to-end tests (`behave`) ```bash -make e2e-tests +make plugin=kedro-airflow e2e-tests ``` diff --git a/kedro-airflow/features/environment.py b/kedro-airflow/features/environment.py index 0da6ac934..8f87afd7f 100644 --- a/kedro-airflow/features/environment.py +++ b/kedro-airflow/features/environment.py @@ -11,7 +11,6 @@ def before_scenario(context, scenario): - # pylint: disable=unused-argument """Environment preparation before other cli tests are run. Installs kedro by running pip in the top level directory. 
""" @@ -56,7 +55,6 @@ def call(cmd, print_output=False): def after_scenario(context, scenario): - # pylint: disable=unused-argument rmtree(str(context.temp_dir)) rmtree(str(context.venv_dir)) diff --git a/kedro-airflow/features/steps/cli_steps.py b/kedro-airflow/features/steps/cli_steps.py index 79dde5622..23eb58727 100644 --- a/kedro-airflow/features/steps/cli_steps.py +++ b/kedro-airflow/features/steps/cli_steps.py @@ -2,6 +2,7 @@ import yaml from behave import given, then, when + from features.steps.sh_run import run OK_EXIT_CODE = 0 diff --git a/kedro-airflow/features/steps/sh_run.py b/kedro-airflow/features/steps/sh_run.py index 634eab66e..cc8afc413 100644 --- a/kedro-airflow/features/steps/sh_run.py +++ b/kedro-airflow/features/steps/sh_run.py @@ -34,10 +34,7 @@ def run( """ if isinstance(cmd, str) and split: cmd = shlex.split(cmd) - # pylint: disable=subprocess-run-check - result = subprocess.run( - cmd, input="", stdout=subprocess.PIPE, stderr=subprocess.PIPE, **kwargs - ) + result = subprocess.run(cmd, input="", capture_output=True, **kwargs) result.stdout = result.stdout.decode("utf-8") result.stderr = result.stderr.decode("utf-8") if print_output: diff --git a/kedro-airflow/kedro_airflow/plugin.py b/kedro-airflow/kedro_airflow/plugin.py index 569e91be2..921643c8e 100644 --- a/kedro-airflow/kedro_airflow/plugin.py +++ b/kedro-airflow/kedro_airflow/plugin.py @@ -22,7 +22,7 @@ @click.group(name="Kedro-Airflow") -def commands(): # pylint: disable=missing-function-docstring +def commands(): pass @@ -88,14 +88,14 @@ def _load_config(context: KedroContext, pipeline_name: str) -> dict[str, Any]: callback=_split_params, ) @click.pass_obj -def create( +def create( # noqa: PLR0913 metadata: ProjectMetadata, pipeline_name, env, target_path, jinja_file, params, -): # pylint: disable=too-many-locals,too-many-arguments +): """Create an Airflow DAG for a project""" project_path = Path.cwd().resolve() bootstrap_project(project_path) diff --git a/kedro-airflow/pyproject.toml b/kedro-airflow/pyproject.toml index ca177dfbd..50f5eabee 100644 --- a/kedro-airflow/pyproject.toml +++ b/kedro-airflow/pyproject.toml @@ -28,15 +28,14 @@ test = [ "bandit", "behave", "black~=22.0", - "flake8", "kedro-datasets", - "pre-commit>=1.17.0, <2.0", - "pylint>=2.5.2, <3.0", + "pre-commit>=2.9.2", "pytest", "pytest-cov", "pytest-mock", "pytest-xdist", "trufflehog>=2.1.0, <3.0", + "ruff~=0.0.290", "wheel" ] @@ -72,3 +71,20 @@ fail_under = 100 show_missing = true omit = ["tests/*"] exclude_lines = ["pragma: no cover", "raise NotImplementedError"] + +[tool.ruff] +line-length = 88 +show-fixes = true +select = [ + "F", # Pyflakes + "W", # pycodestyle + "E", # pycodestyle + "I", # isort + "UP", # pyupgrade + "PL", # Pylint + "T201", # Print Statement +] +ignore = ["E501"] # Black takes care of line-too-long + +[tool.ruff.per-file-ignores] +"{tests,features}/*" = ["T201", "PLR2004", "PLR0915", "PLW1510"] diff --git a/kedro-airflow/tests/conftest.py b/kedro-airflow/tests/conftest.py index ea285bb2c..4fc790668 100644 --- a/kedro-airflow/tests/conftest.py +++ b/kedro-airflow/tests/conftest.py @@ -42,7 +42,7 @@ def _create_kedro_settings_py(file_name: Path, patterns: list[str]): @fixture(scope="session") -def kedro_project(cli_runner): # pylint: disable=unused-argument +def kedro_project(cli_runner): tmp_path = Path().cwd() # From `kedro-mlflow.tests.conftest.py` config = { @@ -98,7 +98,7 @@ def register_pipelines(): @fixture(scope="session") -def metadata(kedro_project): # pylint: disable=unused-argument +def 
metadata(kedro_project): # cwd() depends on ^ the isolated filesystem, created by CliRunner() project_path = kedro_project return ProjectMetadata( diff --git a/kedro-airflow/tests/test_plugin.py b/kedro-airflow/tests/test_plugin.py index 4b67ff840..2bcdde472 100644 --- a/kedro-airflow/tests/test_plugin.py +++ b/kedro-airflow/tests/test_plugin.py @@ -5,6 +5,7 @@ import pytest import yaml + from kedro_airflow.plugin import commands @@ -46,9 +47,7 @@ def _create_kedro_airflow_yml(file_name: Path, content: dict[str, Any]): yaml.dump(content, fp) -def test_airflow_config_params( - cli_runner, metadata -): # pylint: disable=too-many-statements +def test_airflow_config_params(cli_runner, metadata): """Check if config variables are picked up""" dag_name = "hello_world" template_name = "airflow_params.j2" diff --git a/kedro-datasets/docs/source/conf.py b/kedro-datasets/docs/source/conf.py index 4b231efe9..c5e84732c 100644 --- a/kedro-datasets/docs/source/conf.py +++ b/kedro-datasets/docs/source/conf.py @@ -370,8 +370,8 @@ def autodoc_process_docstring(app, what, name, obj, options, lines): print( style( "Failed to check for class name mentions that can be " - "converted to reStructuredText links in docstring of {}. " - "Error is: \n{}".format(name, str(e)), + f"converted to reStructuredText links in docstring of {name}. " + f"Error is: \n{str(e)}", fg="red", ) ) @@ -430,7 +430,7 @@ def setup(app): style( "Failed to create list of (regex, reStructuredText link " "replacement) for class names and method names in docstrings. " - "Error is: \n{}".format(str(e)), + f"Error is: \n{str(e)}", fg="red", ) ) diff --git a/kedro-datasets/kedro_datasets/api/api_dataset.py b/kedro-datasets/kedro_datasets/api/api_dataset.py index 7081eaed7..8a696f456 100644 --- a/kedro-datasets/kedro_datasets/api/api_dataset.py +++ b/kedro-datasets/kedro_datasets/api/api_dataset.py @@ -91,9 +91,8 @@ class APIDataset(AbstractDataset[None, requests.Response]): "timeout": 60, "chunk_size": 100, } - # pylint: disable=too-many-arguments - def __init__( + def __init__( # noqa: PLR0913 self, url: str, method: str = "GET", diff --git a/kedro-datasets/kedro_datasets/biosequence/biosequence_dataset.py b/kedro-datasets/kedro_datasets/biosequence/biosequence_dataset.py index a85ff6bd9..d24d38ba0 100644 --- a/kedro-datasets/kedro_datasets/biosequence/biosequence_dataset.py +++ b/kedro-datasets/kedro_datasets/biosequence/biosequence_dataset.py @@ -42,8 +42,7 @@ class BioSequenceDataset(AbstractDataset[List, List]): DEFAULT_LOAD_ARGS: Dict[str, Any] = {} DEFAULT_SAVE_ARGS: Dict[str, Any] = {} - # pylint: disable=too-many-arguments - def __init__( + def __init__( # noqa: PLR0913 self, filepath: str, load_args: Dict[str, Any] = None, diff --git a/kedro-datasets/kedro_datasets/dask/parquet_dataset.py b/kedro-datasets/kedro_datasets/dask/parquet_dataset.py index 713d08651..9900e1a19 100644 --- a/kedro-datasets/kedro_datasets/dask/parquet_dataset.py +++ b/kedro-datasets/kedro_datasets/dask/parquet_dataset.py @@ -88,8 +88,7 @@ class ParquetDataset(AbstractDataset[dd.DataFrame, dd.DataFrame]): DEFAULT_LOAD_ARGS: Dict[str, Any] = {} DEFAULT_SAVE_ARGS: Dict[str, Any] = {"write_index": False} - # pylint: disable=too-many-arguments - def __init__( + def __init__( # noqa: PLR0913 self, filepath: str, load_args: Dict[str, Any] = None, diff --git a/kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py b/kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py index b46511ff0..e2e847484 100644 --- 
a/kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py +++ b/kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py @@ -43,7 +43,7 @@ def __post_init__(self): The validation is performed by calling a function named: `validate_(self, value) -> raises DatasetError` """ - for name in self.__dataclass_fields__.keys(): # pylint: disable=no-member + for name in self.__dataclass_fields__.keys(): method = getattr(self, f"_validate_{name}", None) if method: method() @@ -194,7 +194,7 @@ class ManagedTableDataset(AbstractVersionedDataset): # using ``ThreadRunner`` instead _SINGLE_PROCESS = True - def __init__( # pylint: disable=R0913 + def __init__( # noqa: PLR0913 self, table: str, catalog: str = None, @@ -383,9 +383,8 @@ def _save(self, data: Union[DataFrame, pd.DataFrame]) -> None: ) else: data = data.select(*cols) - else: - if self._table.dataframe_type == "pandas": - data = self._get_spark().createDataFrame(data) + elif self._table.dataframe_type == "pandas": + data = self._get_spark().createDataFrame(data) if self._table.write_mode == "overwrite": self._save_overwrite(data) elif self._table.write_mode == "upsert": diff --git a/kedro-datasets/kedro_datasets/email/message_dataset.py b/kedro-datasets/kedro_datasets/email/message_dataset.py index 573ea55dd..b81dc7804 100644 --- a/kedro-datasets/kedro_datasets/email/message_dataset.py +++ b/kedro-datasets/kedro_datasets/email/message_dataset.py @@ -50,8 +50,7 @@ class EmailMessageDataset(AbstractVersionedDataset[Message, Message]): DEFAULT_LOAD_ARGS: Dict[str, Any] = {} DEFAULT_SAVE_ARGS: Dict[str, Any] = {} - # pylint: disable=too-many-arguments - def __init__( + def __init__( # noqa: PLR0913 self, filepath: str, load_args: Dict[str, Any] = None, diff --git a/kedro-datasets/kedro_datasets/geopandas/geojson_dataset.py b/kedro-datasets/kedro_datasets/geopandas/geojson_dataset.py index 334b83ac5..3c5807b9a 100644 --- a/kedro-datasets/kedro_datasets/geopandas/geojson_dataset.py +++ b/kedro-datasets/kedro_datasets/geopandas/geojson_dataset.py @@ -44,8 +44,7 @@ class GeoJSONDataset( DEFAULT_LOAD_ARGS: Dict[str, Any] = {} DEFAULT_SAVE_ARGS = {"driver": "GeoJSON"} - # pylint: disable=too-many-arguments - def __init__( + def __init__( # noqa: PLR0913 self, filepath: str, load_args: Dict[str, Any] = None, diff --git a/kedro-datasets/kedro_datasets/holoviews/holoviews_writer.py b/kedro-datasets/kedro_datasets/holoviews/holoviews_writer.py index 5cb1bf138..7d64b8bf6 100644 --- a/kedro-datasets/kedro_datasets/holoviews/holoviews_writer.py +++ b/kedro-datasets/kedro_datasets/holoviews/holoviews_writer.py @@ -35,8 +35,7 @@ class HoloviewsWriter(AbstractVersionedDataset[HoloViews, NoReturn]): DEFAULT_SAVE_ARGS: Dict[str, Any] = {"fmt": "png"} - # pylint: disable=too-many-arguments - def __init__( + def __init__( # noqa: PLR0913 self, filepath: str, fs_args: Dict[str, Any] = None, diff --git a/kedro-datasets/kedro_datasets/json/json_dataset.py b/kedro-datasets/kedro_datasets/json/json_dataset.py index fcb489466..6cae55cce 100644 --- a/kedro-datasets/kedro_datasets/json/json_dataset.py +++ b/kedro-datasets/kedro_datasets/json/json_dataset.py @@ -48,8 +48,7 @@ class JSONDataset(AbstractVersionedDataset[Any, Any]): DEFAULT_SAVE_ARGS: Dict[str, Any] = {"indent": 2} - # pylint: disable=too-many-arguments - def __init__( + def __init__( # noqa: PLR0913 self, filepath: str, save_args: Dict[str, Any] = None, diff --git a/kedro-datasets/kedro_datasets/matplotlib/matplotlib_writer.py b/kedro-datasets/kedro_datasets/matplotlib/matplotlib_writer.py 
index f17174c96..568928caf 100644 --- a/kedro-datasets/kedro_datasets/matplotlib/matplotlib_writer.py +++ b/kedro-datasets/kedro_datasets/matplotlib/matplotlib_writer.py @@ -103,8 +103,7 @@ class MatplotlibWriter( DEFAULT_SAVE_ARGS: Dict[str, Any] = {} - # pylint: disable=too-many-arguments - def __init__( + def __init__( # noqa: PLR0913 self, filepath: str, fs_args: Dict[str, Any] = None, diff --git a/kedro-datasets/kedro_datasets/networkx/gml_dataset.py b/kedro-datasets/kedro_datasets/networkx/gml_dataset.py index c27978885..cc7d21bf0 100644 --- a/kedro-datasets/kedro_datasets/networkx/gml_dataset.py +++ b/kedro-datasets/kedro_datasets/networkx/gml_dataset.py @@ -36,8 +36,7 @@ class GMLDataset(AbstractVersionedDataset[networkx.Graph, networkx.Graph]): DEFAULT_LOAD_ARGS: Dict[str, Any] = {} DEFAULT_SAVE_ARGS: Dict[str, Any] = {} - # pylint: disable=too-many-arguments - def __init__( + def __init__( # noqa: PLR0913 self, filepath: str, load_args: Dict[str, Any] = None, diff --git a/kedro-datasets/kedro_datasets/networkx/graphml_dataset.py b/kedro-datasets/kedro_datasets/networkx/graphml_dataset.py index 1704c4a78..902b29114 100644 --- a/kedro-datasets/kedro_datasets/networkx/graphml_dataset.py +++ b/kedro-datasets/kedro_datasets/networkx/graphml_dataset.py @@ -35,8 +35,7 @@ class GraphMLDataset(AbstractVersionedDataset[networkx.Graph, networkx.Graph]): DEFAULT_LOAD_ARGS: Dict[str, Any] = {} DEFAULT_SAVE_ARGS: Dict[str, Any] = {} - # pylint: disable=too-many-arguments - def __init__( + def __init__( # noqa: PLR0913 self, filepath: str, load_args: Dict[str, Any] = None, diff --git a/kedro-datasets/kedro_datasets/networkx/json_dataset.py b/kedro-datasets/kedro_datasets/networkx/json_dataset.py index 91b2fbc53..3d565003d 100644 --- a/kedro-datasets/kedro_datasets/networkx/json_dataset.py +++ b/kedro-datasets/kedro_datasets/networkx/json_dataset.py @@ -36,8 +36,7 @@ class JSONDataset(AbstractVersionedDataset[networkx.Graph, networkx.Graph]): DEFAULT_LOAD_ARGS: Dict[str, Any] = {} DEFAULT_SAVE_ARGS: Dict[str, Any] = {} - # pylint: disable=too-many-arguments - def __init__( + def __init__( # noqa: PLR0913 self, filepath: str, load_args: Dict[str, Any] = None, diff --git a/kedro-datasets/kedro_datasets/pandas/csv_dataset.py b/kedro-datasets/kedro_datasets/pandas/csv_dataset.py index 94bf9384e..4887968cd 100644 --- a/kedro-datasets/kedro_datasets/pandas/csv_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/csv_dataset.py @@ -69,8 +69,7 @@ class CSVDataset(AbstractVersionedDataset[pd.DataFrame, pd.DataFrame]): DEFAULT_LOAD_ARGS: Dict[str, Any] = {} DEFAULT_SAVE_ARGS: Dict[str, Any] = {"index": False} - # pylint: disable=too-many-arguments - def __init__( + def __init__( # noqa: PLR0913 self, filepath: str, load_args: Dict[str, Any] = None, @@ -198,7 +197,7 @@ def _invalidate_cache(self) -> None: def _preview(self, nrows: int = 40) -> Dict: # Create a copy so it doesn't contaminate the original dataset dataset_copy = self._copy() - dataset_copy._load_args["nrows"] = nrows # pylint: disable=protected-access + dataset_copy._load_args["nrows"] = nrows data = dataset_copy.load() return data.to_dict(orient="split") diff --git a/kedro-datasets/kedro_datasets/pandas/deltatable_dataset.py b/kedro-datasets/kedro_datasets/pandas/deltatable_dataset.py index cbf1413dc..9df340c6d 100644 --- a/kedro-datasets/kedro_datasets/pandas/deltatable_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/deltatable_dataset.py @@ -14,7 +14,7 @@ from kedro_datasets._io import AbstractDataset, DatasetError -class 
DeltaTableDataset(AbstractDataset): # pylint:disable=too-many-instance-attributes +class DeltaTableDataset(AbstractDataset): """``DeltaTableDataset`` loads/saves delta tables from/to a filesystem (e.g.: local, S3, GCS), Databricks unity catalog and AWS Glue catalog respectively. It handles load and save using a pandas dataframe. When saving data, you can specify one of two @@ -84,7 +84,7 @@ class DeltaTableDataset(AbstractDataset): # pylint:disable=too-many-instance-at DEFAULT_LOAD_ARGS: Dict[str, Any] = {} DEFAULT_SAVE_ARGS: Dict[str, Any] = {"mode": DEFAULT_WRITE_MODE} - def __init__( # pylint: disable=too-many-arguments + def __init__( # noqa: PLR0913 self, filepath: Optional[str] = None, catalog_type: Optional[DataCatalog] = None, diff --git a/kedro-datasets/kedro_datasets/pandas/excel_dataset.py b/kedro-datasets/kedro_datasets/pandas/excel_dataset.py index 8ffc814bd..181e6cd71 100644 --- a/kedro-datasets/kedro_datasets/pandas/excel_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/excel_dataset.py @@ -109,8 +109,7 @@ class ExcelDataset( DEFAULT_LOAD_ARGS = {"engine": "openpyxl"} DEFAULT_SAVE_ARGS = {"index": False} - # pylint: disable=too-many-arguments - def __init__( + def __init__( # noqa: PLR0913 self, filepath: str, engine: str = "openpyxl", @@ -232,7 +231,6 @@ def _save(self, data: Union[pd.DataFrame, Dict[str, pd.DataFrame]]) -> None: output = BytesIO() save_path = get_filepath_str(self._get_save_path(), self._protocol) - # pylint: disable=abstract-class-instantiated with pd.ExcelWriter(output, **self._writer_args) as writer: if isinstance(data, dict): for sheet_name, sheet_data in data.items(): @@ -267,7 +265,7 @@ def _invalidate_cache(self) -> None: def _preview(self, nrows: int = 40) -> Dict: # Create a copy so it doesn't contaminate the original dataset dataset_copy = self._copy() - dataset_copy._load_args["nrows"] = nrows # pylint: disable=protected-access + dataset_copy._load_args["nrows"] = nrows data = dataset_copy.load() return data.to_dict(orient="split") diff --git a/kedro-datasets/kedro_datasets/pandas/feather_dataset.py b/kedro-datasets/kedro_datasets/pandas/feather_dataset.py index c409493d9..45a454dcf 100644 --- a/kedro-datasets/kedro_datasets/pandas/feather_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/feather_dataset.py @@ -70,8 +70,7 @@ class FeatherDataset(AbstractVersionedDataset[pd.DataFrame, pd.DataFrame]): DEFAULT_LOAD_ARGS: Dict[str, Any] = {} DEFAULT_SAVE_ARGS: Dict[str, Any] = {} - # pylint: disable=too-many-arguments - def __init__( + def __init__( # noqa: PLR0913 self, filepath: str, load_args: Dict[str, Any] = None, diff --git a/kedro-datasets/kedro_datasets/pandas/gbq_dataset.py b/kedro-datasets/kedro_datasets/pandas/gbq_dataset.py index c39a37ed0..8dba87dd8 100644 --- a/kedro-datasets/kedro_datasets/pandas/gbq_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/gbq_dataset.py @@ -65,8 +65,7 @@ class GBQTableDataset(AbstractDataset[None, pd.DataFrame]): DEFAULT_LOAD_ARGS: Dict[str, Any] = {} DEFAULT_SAVE_ARGS: Dict[str, Any] = {"progress_bar": False} - # pylint: disable=too-many-arguments - def __init__( + def __init__( # noqa: PLR0913 self, dataset: str, table_name: str, @@ -210,8 +209,7 @@ class GBQQueryDataset(AbstractDataset[None, pd.DataFrame]): DEFAULT_LOAD_ARGS: Dict[str, Any] = {} - # pylint: disable=too-many-arguments - def __init__( + def __init__( # noqa: PLR0913 self, sql: str = None, project: str = None, @@ -316,7 +314,7 @@ def _load(self) -> pd.DataFrame: **load_args, ) - def _save(self, data: None) -> NoReturn: # 
pylint: disable=no-self-use + def _save(self, data: None) -> NoReturn: raise DatasetError("'save' is not supported on GBQQueryDataset") diff --git a/kedro-datasets/kedro_datasets/pandas/generic_dataset.py b/kedro-datasets/kedro_datasets/pandas/generic_dataset.py index eae3f9b3a..d9395b8e8 100644 --- a/kedro-datasets/kedro_datasets/pandas/generic_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/generic_dataset.py @@ -81,8 +81,7 @@ class GenericDataset(AbstractVersionedDataset[pd.DataFrame, pd.DataFrame]): DEFAULT_LOAD_ARGS: Dict[str, Any] = {} DEFAULT_SAVE_ARGS: Dict[str, Any] = {} - # pylint: disable=too-many-arguments - def __init__( + def __init__( # noqa: PLR0913 self, filepath: str, file_format: str, diff --git a/kedro-datasets/kedro_datasets/pandas/hdf_dataset.py b/kedro-datasets/kedro_datasets/pandas/hdf_dataset.py index 6fb94ba23..50d33e460 100644 --- a/kedro-datasets/kedro_datasets/pandas/hdf_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/hdf_dataset.py @@ -56,8 +56,7 @@ class HDFDataset(AbstractVersionedDataset[pd.DataFrame, pd.DataFrame]): DEFAULT_LOAD_ARGS: Dict[str, Any] = {} DEFAULT_SAVE_ARGS: Dict[str, Any] = {} - # pylint: disable=too-many-arguments - def __init__( + def __init__( # noqa: PLR0913 self, filepath: str, key: str, @@ -177,7 +176,6 @@ def _save(self, data: pd.DataFrame) -> None: **self._save_args, ) as store: store.put(self._key, data, format="table") - # pylint: disable=protected-access binary_data = store._handle.get_file_image() with self._fs.open(save_path, **self._fs_open_args_save) as fs_file: diff --git a/kedro-datasets/kedro_datasets/pandas/json_dataset.py b/kedro-datasets/kedro_datasets/pandas/json_dataset.py index c6c87e17f..91dd2930d 100644 --- a/kedro-datasets/kedro_datasets/pandas/json_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/json_dataset.py @@ -64,8 +64,7 @@ class JSONDataset(AbstractVersionedDataset[pd.DataFrame, pd.DataFrame]): DEFAULT_LOAD_ARGS: Dict[str, Any] = {} DEFAULT_SAVE_ARGS: Dict[str, Any] = {} - # pylint: disable=too-many-arguments - def __init__( + def __init__( # noqa: PLR0913 self, filepath: str, load_args: Dict[str, Any] = None, diff --git a/kedro-datasets/kedro_datasets/pandas/parquet_dataset.py b/kedro-datasets/kedro_datasets/pandas/parquet_dataset.py index 96f35ff66..dc4c05618 100644 --- a/kedro-datasets/kedro_datasets/pandas/parquet_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/parquet_dataset.py @@ -75,8 +75,7 @@ class ParquetDataset(AbstractVersionedDataset[pd.DataFrame, pd.DataFrame]): DEFAULT_LOAD_ARGS: Dict[str, Any] = {} DEFAULT_SAVE_ARGS: Dict[str, Any] = {} - # pylint: disable=too-many-arguments - def __init__( + def __init__( # noqa: PLR0913 self, filepath: str, load_args: Dict[str, Any] = None, diff --git a/kedro-datasets/kedro_datasets/pandas/sql_dataset.py b/kedro-datasets/kedro_datasets/pandas/sql_dataset.py index 59c1c20b2..59feb51b4 100644 --- a/kedro-datasets/kedro_datasets/pandas/sql_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/sql_dataset.py @@ -153,8 +153,7 @@ class SQLTableDataset(AbstractDataset[pd.DataFrame, pd.DataFrame]): # sqlalchemy.engine.Engine or sqlalchemy.engine.base.Engine engines: Dict[str, Any] = {} - # pylint: disable=too-many-arguments - def __init__( + def __init__( # noqa: PLR0913 self, table_name: str, credentials: Dict[str, Any], @@ -376,7 +375,7 @@ class SQLQueryDataset(AbstractDataset[None, pd.DataFrame]): # sqlalchemy.engine.Engine or sqlalchemy.engine.base.Engine engines: Dict[str, Any] = {} - def __init__( # pylint: disable=too-many-arguments 
+ def __init__( # noqa: PLR0913 self, sql: str = None, credentials: Dict[str, Any] = None, @@ -509,7 +508,7 @@ def _load(self) -> pd.DataFrame: return pd.read_sql_query(con=engine, **load_args) - def _save(self, data: None) -> NoReturn: # pylint: disable=no-self-use + def _save(self, data: None) -> NoReturn: raise DatasetError("'save' is not supported on SQLQueryDataset") # For mssql only diff --git a/kedro-datasets/kedro_datasets/pandas/xml_dataset.py b/kedro-datasets/kedro_datasets/pandas/xml_dataset.py index 43dd40084..129d5e3fb 100644 --- a/kedro-datasets/kedro_datasets/pandas/xml_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/xml_dataset.py @@ -47,8 +47,7 @@ class XMLDataset(AbstractVersionedDataset[pd.DataFrame, pd.DataFrame]): DEFAULT_LOAD_ARGS: Dict[str, Any] = {} DEFAULT_SAVE_ARGS: Dict[str, Any] = {"index": False} - # pylint: disable=too-many-arguments - def __init__( + def __init__( # noqa: PLR0913 self, filepath: str, load_args: Dict[str, Any] = None, diff --git a/kedro-datasets/kedro_datasets/pickle/pickle_dataset.py b/kedro-datasets/kedro_datasets/pickle/pickle_dataset.py index 52004f4e8..19f8072a0 100644 --- a/kedro-datasets/kedro_datasets/pickle/pickle_dataset.py +++ b/kedro-datasets/kedro_datasets/pickle/pickle_dataset.py @@ -68,8 +68,7 @@ class PickleDataset(AbstractVersionedDataset[Any, Any]): DEFAULT_LOAD_ARGS: Dict[str, Any] = {} DEFAULT_SAVE_ARGS: Dict[str, Any] = {} - # pylint: disable=too-many-arguments,too-many-locals - def __init__( + def __init__( # noqa: PLR0913 self, filepath: str, backend: str = "pickle", diff --git a/kedro-datasets/kedro_datasets/pillow/image_dataset.py b/kedro-datasets/kedro_datasets/pillow/image_dataset.py index 99a16d572..91bae8842 100644 --- a/kedro-datasets/kedro_datasets/pillow/image_dataset.py +++ b/kedro-datasets/kedro_datasets/pillow/image_dataset.py @@ -32,8 +32,7 @@ class ImageDataset(AbstractVersionedDataset[Image.Image, Image.Image]): DEFAULT_SAVE_ARGS: Dict[str, Any] = {} - # pylint: disable=too-many-arguments - def __init__( + def __init__( # noqa: PLR0913 self, filepath: str, save_args: Dict[str, Any] = None, diff --git a/kedro-datasets/kedro_datasets/plotly/json_dataset.py b/kedro-datasets/kedro_datasets/plotly/json_dataset.py index 97ad31e27..b21f4f9bc 100644 --- a/kedro-datasets/kedro_datasets/plotly/json_dataset.py +++ b/kedro-datasets/kedro_datasets/plotly/json_dataset.py @@ -50,8 +50,7 @@ class JSONDataset( DEFAULT_LOAD_ARGS: Dict[str, Any] = {} DEFAULT_SAVE_ARGS: Dict[str, Any] = {} - # pylint: disable=too-many-arguments - def __init__( + def __init__( # noqa: PLR0913 self, filepath: str, load_args: Dict[str, Any] = None, diff --git a/kedro-datasets/kedro_datasets/plotly/plotly_dataset.py b/kedro-datasets/kedro_datasets/plotly/plotly_dataset.py index 9a5e53b20..985588e0a 100644 --- a/kedro-datasets/kedro_datasets/plotly/plotly_dataset.py +++ b/kedro-datasets/kedro_datasets/plotly/plotly_dataset.py @@ -66,8 +66,7 @@ class PlotlyDataset(JSONDataset): """ - # pylint: disable=too-many-arguments - def __init__( + def __init__( # noqa: PLR0913 self, filepath: str, plotly_args: Dict[str, Any], diff --git a/kedro-datasets/kedro_datasets/polars/csv_dataset.py b/kedro-datasets/kedro_datasets/polars/csv_dataset.py index 1ed8ce2d5..0e87c2bb2 100644 --- a/kedro-datasets/kedro_datasets/polars/csv_dataset.py +++ b/kedro-datasets/kedro_datasets/polars/csv_dataset.py @@ -67,8 +67,7 @@ class CSVDataset(AbstractVersionedDataset[pl.DataFrame, pl.DataFrame]): DEFAULT_LOAD_ARGS: Dict[str, Any] = {"rechunk": True} DEFAULT_SAVE_ARGS: 
Dict[str, Any] = {} - # pylint: disable=too-many-arguments - def __init__( + def __init__( # noqa: PLR0913 self, filepath: str, load_args: Dict[str, Any] = None, diff --git a/kedro-datasets/kedro_datasets/polars/generic_dataset.py b/kedro-datasets/kedro_datasets/polars/generic_dataset.py index a7e030378..8b790e456 100644 --- a/kedro-datasets/kedro_datasets/polars/generic_dataset.py +++ b/kedro-datasets/kedro_datasets/polars/generic_dataset.py @@ -15,7 +15,6 @@ from kedro_datasets._io import AbstractVersionedDataset, DatasetError -# pylint: disable=too-many-instance-attributes class GenericDataset(AbstractVersionedDataset[pl.DataFrame, pl.DataFrame]): """``polars.GenericDataset`` loads/saves data from/to a data file using an underlying filesystem (e.g.: local, S3, GCS). It uses polars to handle the dynamically select the @@ -54,8 +53,7 @@ class GenericDataset(AbstractVersionedDataset[pl.DataFrame, pl.DataFrame]): DEFAULT_LOAD_ARGS = {} # type: Dict[str, Any] DEFAULT_SAVE_ARGS = {} # type: Dict[str, Any] - # pylint: disable=too-many-arguments - def __init__( + def __init__( # noqa: PLR0913 self, filepath: str, file_format: str, @@ -139,7 +137,7 @@ def __init__( self._fs_open_args_load = _fs_open_args_load self._fs_open_args_save = _fs_open_args_save - def _load(self) -> pl.DataFrame: # pylint: disable= inconsistent-return-statements + def _load(self) -> pl.DataFrame: load_path = get_filepath_str(self._get_load_path(), self._protocol) load_method = getattr(pl, f"read_{self._file_format}", None) diff --git a/kedro-datasets/kedro_datasets/redis/redis_dataset.py b/kedro-datasets/kedro_datasets/redis/redis_dataset.py index 8c2809e7a..dc04de00e 100644 --- a/kedro-datasets/kedro_datasets/redis/redis_dataset.py +++ b/kedro-datasets/kedro_datasets/redis/redis_dataset.py @@ -60,8 +60,7 @@ class PickleDataset(AbstractDataset[Any, Any]): DEFAULT_LOAD_ARGS: Dict[str, Any] = {} DEFAULT_SAVE_ARGS: Dict[str, Any] = {} - # pylint: disable=too-many-arguments - def __init__( + def __init__( # noqa: PLR0913 self, key: str, backend: str = "pickle", diff --git a/kedro-datasets/kedro_datasets/snowflake/snowpark_dataset.py b/kedro-datasets/kedro_datasets/snowflake/snowpark_dataset.py index 85cdc1450..d98ef2dd6 100644 --- a/kedro-datasets/kedro_datasets/snowflake/snowpark_dataset.py +++ b/kedro-datasets/kedro_datasets/snowflake/snowpark_dataset.py @@ -102,7 +102,7 @@ class SnowparkTableDataset(AbstractDataset): DEFAULT_LOAD_ARGS: Dict[str, Any] = {} DEFAULT_SAVE_ARGS: Dict[str, Any] = {} - def __init__( # pylint: disable=too-many-arguments + def __init__( # noqa: PLR0913 self, table_name: str, schema: str = None, diff --git a/kedro-datasets/kedro_datasets/spark/spark_dataset.py b/kedro-datasets/kedro_datasets/spark/spark_dataset.py index 0bf24643d..5971ba495 100644 --- a/kedro-datasets/kedro_datasets/spark/spark_dataset.py +++ b/kedro-datasets/kedro_datasets/spark/spark_dataset.py @@ -37,7 +37,7 @@ def _parse_glob_pattern(pattern: str) -> str: def _split_filepath(filepath: str) -> Tuple[str, str]: split_ = filepath.split("://", 1) - if len(split_) == 2: + if len(split_) == 2: # noqa: PLR2004 return split_[0] + "://", split_[1] return "", split_[0] @@ -80,12 +80,12 @@ def _get_dbutils(spark: SparkSession) -> Optional[Any]: return dbutils try: - from pyspark.dbutils import DBUtils # pylint: disable=import-outside-toplevel + from pyspark.dbutils import DBUtils dbutils = DBUtils(spark) except ImportError: try: - import IPython # pylint: disable=import-outside-toplevel + import IPython except ImportError: pass else: @@ 
-111,7 +111,7 @@ def _dbfs_exists(pattern: str, dbutils: Any) -> bool: try: dbutils.fs.ls(file) return True - except Exception: # pylint: disable=broad-except + except Exception: return False @@ -233,7 +233,7 @@ class SparkDataset(AbstractVersionedDataset[DataFrame, DataFrame]): DEFAULT_LOAD_ARGS: Dict[str, Any] = {} DEFAULT_SAVE_ARGS: Dict[str, Any] = {} - def __init__( # pylint: disable=too-many-arguments disable=too-many-locals + def __init__( # noqa: PLR0913 self, filepath: str, file_format: str = "parquet", diff --git a/kedro-datasets/kedro_datasets/spark/spark_hive_dataset.py b/kedro-datasets/kedro_datasets/spark/spark_hive_dataset.py index 5343791ee..860855719 100644 --- a/kedro-datasets/kedro_datasets/spark/spark_hive_dataset.py +++ b/kedro-datasets/kedro_datasets/spark/spark_hive_dataset.py @@ -12,7 +12,6 @@ from kedro_datasets._io import AbstractDataset, DatasetError -# pylint:disable=too-many-instance-attributes class SparkHiveDataset(AbstractDataset[DataFrame, DataFrame]): """``SparkHiveDataset`` loads and saves Spark dataframes stored on Hive. This data set also handles some incompatible file types such as using partitioned parquet on @@ -67,8 +66,7 @@ class SparkHiveDataset(AbstractDataset[DataFrame, DataFrame]): DEFAULT_SAVE_ARGS: Dict[str, Any] = {} - # pylint:disable=too-many-arguments - def __init__( + def __init__( # noqa: PLR0913 self, database: str, table: str, @@ -211,7 +209,6 @@ def _validate_save(self, data: DataFrame): ) def _exists(self) -> bool: - # noqa # pylint:disable=protected-access return ( self._get_spark() ._jsparkSession.catalog() diff --git a/kedro-datasets/kedro_datasets/spark/spark_jdbc_dataset.py b/kedro-datasets/kedro_datasets/spark/spark_jdbc_dataset.py index 301067bb0..c062a6a70 100644 --- a/kedro-datasets/kedro_datasets/spark/spark_jdbc_dataset.py +++ b/kedro-datasets/kedro_datasets/spark/spark_jdbc_dataset.py @@ -65,8 +65,7 @@ class SparkJDBCDataset(AbstractDataset[DataFrame, DataFrame]): DEFAULT_LOAD_ARGS: Dict[str, Any] = {} DEFAULT_SAVE_ARGS: Dict[str, Any] = {} - # pylint: disable=too-many-arguments - def __init__( + def __init__( # noqa: PLR0913 self, url: str, table: str, diff --git a/kedro-datasets/kedro_datasets/svmlight/svmlight_dataset.py b/kedro-datasets/kedro_datasets/svmlight/svmlight_dataset.py index 7318cb3b0..2ea1b3be7 100644 --- a/kedro-datasets/kedro_datasets/svmlight/svmlight_dataset.py +++ b/kedro-datasets/kedro_datasets/svmlight/svmlight_dataset.py @@ -86,8 +86,7 @@ class SVMLightDataset(AbstractVersionedDataset[_DI, _DO]): DEFAULT_LOAD_ARGS: Dict[str, Any] = {} DEFAULT_SAVE_ARGS: Dict[str, Any] = {} - # pylint: disable=too-many-arguments - def __init__( + def __init__( # noqa: PLR0913 self, filepath: str, load_args: Dict[str, Any] = None, diff --git a/kedro-datasets/kedro_datasets/tensorflow/tensorflow_model_dataset.py b/kedro-datasets/kedro_datasets/tensorflow/tensorflow_model_dataset.py index 1a283a331..18b4274c7 100644 --- a/kedro-datasets/kedro_datasets/tensorflow/tensorflow_model_dataset.py +++ b/kedro-datasets/kedro_datasets/tensorflow/tensorflow_model_dataset.py @@ -60,8 +60,7 @@ class TensorFlowModelDataset(AbstractVersionedDataset[tf.keras.Model, tf.keras.M DEFAULT_LOAD_ARGS: Dict[str, Any] = {} DEFAULT_SAVE_ARGS: Dict[str, Any] = {"save_format": "tf"} - # pylint: disable=too-many-arguments - def __init__( + def __init__( # noqa: PLR0913 self, filepath: str, load_args: Dict[str, Any] = None, @@ -132,7 +131,7 @@ def _load(self) -> tf.keras.Model: with tempfile.TemporaryDirectory(prefix=self._tmp_prefix) as path: if 
self._is_h5: - path = str(PurePath(path) / TEMPORARY_H5_FILE) + path = str(PurePath(path) / TEMPORARY_H5_FILE) # noqa: PLW2901 self._fs.copy(load_path, path) else: self._fs.get(load_path, path, recursive=True) @@ -151,7 +150,7 @@ def _save(self, data: tf.keras.Model) -> None: with tempfile.TemporaryDirectory(prefix=self._tmp_prefix) as path: if self._is_h5: - path = str(PurePath(path) / TEMPORARY_H5_FILE) + path = str(PurePath(path) / TEMPORARY_H5_FILE) # noqa: PLW2901 tf.keras.models.save_model(data, path, **self._save_args) diff --git a/kedro-datasets/kedro_datasets/text/text_dataset.py b/kedro-datasets/kedro_datasets/text/text_dataset.py index 58c2e2a19..2c1ecff6f 100644 --- a/kedro-datasets/kedro_datasets/text/text_dataset.py +++ b/kedro-datasets/kedro_datasets/text/text_dataset.py @@ -42,8 +42,7 @@ class TextDataset(AbstractVersionedDataset[str, str]): """ - # pylint: disable=too-many-arguments - def __init__( + def __init__( # noqa: PLR0913 self, filepath: str, version: Version = None, diff --git a/kedro-datasets/kedro_datasets/video/video_dataset.py b/kedro-datasets/kedro_datasets/video/video_dataset.py index cf101de1c..b85fc1231 100644 --- a/kedro-datasets/kedro_datasets/video/video_dataset.py +++ b/kedro-datasets/kedro_datasets/video/video_dataset.py @@ -264,8 +264,7 @@ class VideoDataset(AbstractDataset[AbstractVideo, AbstractVideo]): """ - # pylint: disable=too-many-arguments - def __init__( + def __init__( # noqa: PLR0913 self, filepath: str, fourcc: Optional[str] = "mp4v", diff --git a/kedro-datasets/kedro_datasets/yaml/yaml_dataset.py b/kedro-datasets/kedro_datasets/yaml/yaml_dataset.py index 76dd94473..45350b338 100644 --- a/kedro-datasets/kedro_datasets/yaml/yaml_dataset.py +++ b/kedro-datasets/kedro_datasets/yaml/yaml_dataset.py @@ -45,8 +45,7 @@ class YAMLDataset(AbstractVersionedDataset[Dict, Dict]): DEFAULT_SAVE_ARGS: Dict[str, Any] = {"default_flow_style": False} - # pylint: disable=too-many-arguments - def __init__( + def __init__( # noqa: PLR0913 self, filepath: str, save_args: Dict[str, Any] = None, diff --git a/kedro-datasets/pyproject.toml b/kedro-datasets/pyproject.toml index 96828d508..d5be97bbc 100644 --- a/kedro-datasets/pyproject.toml +++ b/kedro-datasets/pyproject.toml @@ -28,46 +28,6 @@ include = ["kedro_datasets*"] readme = {file = "README.md", content-type = "text/markdown"} version = {attr = "kedro_datasets.__version__"} -[tool.isort] -profile = "black" - -[tool.pylint] -[tool.pylint.master] -ignore = "CVS" -load-plugins = [ - "pylint.extensions.docparams", - "pylint.extensions.no_self_use" -] -extension-pkg-whitelist = "cv2" -unsafe-load-any-extension = false - -[tool.pylint.messages_control] -disable = [ - "ungrouped-imports", - "duplicate-code", - "too-many-instance-attributes", - "too-few-public-methods", # https://github.com/pylint-dev/pylint/issues/8865 -] -enable = ["useless-suppression"] - -[tool.pylint.refactoring] -max-nested-blocks = 5 - -[tool.pylint.format] -# Regexp for a line that is allowed to be longer than the limit. 
-ignore-long-lines='^\s*(# )??$' -indent-after-paren = 4 -indent-string = " " - -[tool.pylint.miscellaneous] -notes = [ - "FIXME", - "XXX" -] - -[tool.pylint.design] -min-public-methods = 1 - [tool.coverage.report] fail_under = 100 show_missing = true @@ -84,3 +44,21 @@ addopts = """ --no-cov-on-fail \ -ra \ -W ignore""" + +[tool.ruff] +line-length = 88 +show-fixes = true +select = [ + "F", # Pyflakes + "W", # pycodestyle + "E", # pycodestyle + "I", # isort + "UP", # pyupgrade + "PL", # Pylint + "T201", # Print Statement +] +ignore = ["E501"] # Black takes care of line-too-long + +[tool.ruff.per-file-ignores] +"{tests,docs}/*" = ["PLR2004", "PLR0913", "T201"] +"*/{__init__.py}" = ["F821"] # temporarily ignore ruff undefined name errors for dataset aliases diff --git a/kedro-datasets/setup.py b/kedro-datasets/setup.py index e79d58954..1535d28dd 100644 --- a/kedro-datasets/setup.py +++ b/kedro-datasets/setup.py @@ -186,10 +186,9 @@ def _collect_requirements(requires): "Pillow~=9.0", "plotly>=4.8.0, <6.0", "polars[xlsx2csv, deltalake]~=0.18.0", - "pre-commit>=2.9.2, <3.0", # The hook `mypy` requires pre-commit version 2.9.2. + "pre-commit>=2.9.2", "pyarrow>=1.0; python_version < '3.11'", "pyarrow>=7.0; python_version >= '3.11'", # Adding to avoid numpy build errors - "pylint>=2.5.2, <3.0", "pyodbc~=4.0.35", "pyproj~=3.0", "pyspark>=2.2, <3.4; python_version < '3.11'", @@ -201,6 +200,7 @@ def _collect_requirements(requires): "redis~=4.1", "requests-mock~=1.6", "requests~=2.20", + "ruff~=0.0.290", "s3fs>=0.3.0, <0.5", # Needs to be at least 0.3.0 to make use of `cachable` attribute on S3FileSystem. "snowflake-snowpark-python~=1.0.0; python_version == '3.8'", "scikit-learn>=1.0.2,<2", diff --git a/kedro-datasets/tests/api/test_api_dataset.py b/kedro-datasets/tests/api/test_api_dataset.py index e5a0e6827..10a0baf6d 100644 --- a/kedro-datasets/tests/api/test_api_dataset.py +++ b/kedro-datasets/tests/api/test_api_dataset.py @@ -1,4 +1,3 @@ -# pylint: disable=no-member import base64 import importlib import json @@ -296,9 +295,7 @@ def test_successful_save(self, requests_mock, method, data): Then check that the response is OK and the sent data is in the correct form. 
""" - def json_callback( - request: requests.Request, context: Any # pylint: disable=unused-argument - ) -> dict: + def json_callback(request: requests.Request, context: Any) -> dict: """Callback that sends back the json.""" return request.json() @@ -342,9 +339,7 @@ def test_successful_save_with_json(self, requests_mock, save_methods): Then check we get a response """ - def json_callback( - request: requests.Request, context: Any # pylint: disable=unused-argument - ) -> dict: + def json_callback(request: requests.Request, context: Any) -> dict: """Callback that sends back the json.""" return request.json() diff --git a/kedro-datasets/tests/databricks/conftest.py b/kedro-datasets/tests/databricks/conftest.py index 958ee6a83..afe164adc 100644 --- a/kedro-datasets/tests/databricks/conftest.py +++ b/kedro-datasets/tests/databricks/conftest.py @@ -5,7 +5,7 @@ https://docs.pytest.org/en/latest/fixture.html """ # importlib_metadata needs backport for python 3.8 and older -import importlib_metadata as importlib_metadata # pylint: disable=useless-import-alias +import importlib_metadata import pytest from pyspark.sql import SparkSession diff --git a/kedro-datasets/tests/databricks/test_managed_table_dataset.py b/kedro-datasets/tests/databricks/test_managed_table_dataset.py index 0ae7964ec..dc2595740 100644 --- a/kedro-datasets/tests/databricks/test_managed_table_dataset.py +++ b/kedro-datasets/tests/databricks/test_managed_table_dataset.py @@ -183,7 +183,6 @@ def test_deprecation(module_name, class_name): getattr(importlib.import_module(module_name), class_name) -# pylint: disable=too-many-public-methods class TestManagedTableDataset: def test_full_table(self): unity_ds = ManagedTableDataset(catalog="test", database="test", table="test") diff --git a/kedro-datasets/tests/matplotlib/test_matplotlib_writer.py b/kedro-datasets/tests/matplotlib/test_matplotlib_writer.py index a8e83b2da..5270e13a5 100644 --- a/kedro-datasets/tests/matplotlib/test_matplotlib_writer.py +++ b/kedro-datasets/tests/matplotlib/test_matplotlib_writer.py @@ -32,7 +32,7 @@ def mock_single_plot(): def mock_list_plot(): plots_list = [] colour = "red" - for index in range(5): # pylint: disable=unused-variable + for index in range(5): plots_list.append(plt.figure()) plt.plot([1, 2, 3], [4, 5, 6], color=colour) plt.close("all") @@ -104,9 +104,7 @@ def overwrite(request): @pytest.fixture -def plot_writer( - mocked_s3_bucket, fs_args, save_args, overwrite -): # pylint: disable=unused-argument +def plot_writer(mocked_s3_bucket, fs_args, save_args, overwrite): return MatplotlibWriter( filepath=FULL_PATH, credentials=AWS_CREDENTIALS, diff --git a/kedro-datasets/tests/pandas/test_gbq_dataset.py b/kedro-datasets/tests/pandas/test_gbq_dataset.py index f392f6ae8..be4d65942 100644 --- a/kedro-datasets/tests/pandas/test_gbq_dataset.py +++ b/kedro-datasets/tests/pandas/test_gbq_dataset.py @@ -28,9 +28,7 @@ def mock_bigquery_client(mocker): @pytest.fixture -def gbq_dataset( - load_args, save_args, mock_bigquery_client -): # pylint: disable=unused-argument +def gbq_dataset(load_args, save_args, mock_bigquery_client): return GBQTableDataset( dataset=DATASET, table_name=TABLE_NAME, @@ -42,7 +40,7 @@ def gbq_dataset( @pytest.fixture(params=[{}]) -def gbq_sql_dataset(load_args, mock_bigquery_client): # pylint: disable=unused-argument +def gbq_sql_dataset(load_args, mock_bigquery_client): return GBQQueryDataset( sql=SQL_QUERY, project=PROJECT, @@ -59,9 +57,7 @@ def sql_file(tmp_path: PosixPath): @pytest.fixture(params=[{}]) -def gbq_sql_file_dataset( - 
load_args, sql_file, mock_bigquery_client -): # pylint: disable=unused-argument +def gbq_sql_file_dataset(load_args, sql_file, mock_bigquery_client): return GBQQueryDataset( filepath=sql_file, project=PROJECT, diff --git a/kedro-datasets/tests/pandas/test_generic_dataset.py b/kedro-datasets/tests/pandas/test_generic_dataset.py index b48e099d1..8cacaa5bc 100644 --- a/kedro-datasets/tests/pandas/test_generic_dataset.py +++ b/kedro-datasets/tests/pandas/test_generic_dataset.py @@ -33,7 +33,6 @@ def filepath_html(tmp_path): return tmp_path / "test.html" -# pylint: disable=line-too-long @pytest.fixture() def sas_binary(): return b'\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xc2\xea\x81`\xb3\x14\x11\xcf\xbd\x92\x08\x00\t\xc71\x8c\x18\x1f\x10\x11""\x002"\x01\x022\x04\x00\x00\x00\x00\x00\x00\x00\x00\x00\x03\x01\x18\x1f\x10\x11""\x002"\x01\x022\x042\x01""\x00\x00\x00\x00\x10\x03\x01\x00\x00\x00\x00\x00\x00\x00\x00SAS FILEAIRLINE DATA \x00\x00\xc0\x95j\xbe\xd6A\x00\x00\xc0\x95j\xbe\xd6A\x00\x00\x00\x00\x00 \xbc@\x00\x00\x00\x00\x00 \xbc@\x00\x04\x00\x00\x00\x10\x00\x00\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x009.0000M0WIN\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00WIN\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xc0\x95LN\xaf\xf0LN\xaf\xf0LN\xaf\xf0\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00jIW-\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x0
0\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00kIW-\x00\x00\x00\x00\x00\x00\x00\x00<\x04\x00\x00\x00\x02-\x00\r\x00\x00\x00 \x0e\x00\x00\xe0\x01\x00\x00\x00\x00\x00\x00\x14\x0e\x00\x00\x0c\x00\x00\x00\x00\x00\x00\x00\xe4\x0c\x00\x000\x01\x00\x00\x00\x00\x00\x00H\x0c\x00\x00\x9c\x00\x00\x00\x00\x01\x00\x00\x04\x0c\x00\x00D\x00\x00\x00\x00\x01\x00\x00\xa8\x0b\x00\x00\\\x00\x00\x00\x00\x01\x00\x00t\x0b\x00\x004\x00\x00\x00\x00\x00\x00\x00@\x0b\x00\x004\x00\x00\x00\x00\x00\x00\x00\x0c\x0b\x00\x004\x00\x00\x00\x00\x00\x00\x00\xd8\n\x00\x004\x00\x00\x00\x00\x00\x00\x00\xa4\n\x00\x004\x00\x00\x00\x00\x00\x00\x00p\n\x00\x004\x00\x00\x00\x00\x00\x00\x00p\n\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\x00\x00\x00\x00\x00p\x9e@\x00\x00\x00@\x8bl\xf3?\x00\x00\x00\xc0\x9f\x1a\xcf?\x00\x00\x00\xa0w\x9c\xc2?\x00\x00\x00\x00\xd7\xa3\xf6?\x00\x00\x00\x00\x81\x95\xe3?\x00t\x9e@\x00\x00\x00\xe0\xfb\xa9\xf5?\x00\x00\x00\x00\xd7\xa3\xd0?\x00\x00\x00`\xb3\xea\xcb?\x00\x00\x00 \xdd$\xf6?\x00\x00\x00\x00T\xe3\xe1?\x00x\x9e@\x00\x00\x00\xc0\x9f\x1a\xf9?\x00\x00\x00\x80\xc0\xca\xd1?\x00\x00\x00\xc0m4\xd4?\x00\x00\x00\x80?5\xf6?\x00\x00\x00 \x04V\xe2?\x00|\x9e@\x00\x00\x00\x00\x02+\xff?\x00\x00\x00@\x0c\x02\xd3?\x00\x00\x00\xc0K7\xd9?\x00\x00\x00\xc0\xcc\xcc\xf8?\x00\x00\x00\xc0I\x0c\xe2?\x00\x80\x9e@\x00\x00\x00`\xb8\x1e\x02@\x00\x00\x00@\n\xd7\xd3?\x00\x00\x00\xc0\x10\xc7\xd6?\x00\x00\x00\x00\xfe\xd4\xfc?\x00\x00\x00@5^\xe2?\x00\x84\x9e@\x00\x00\x00\x80\x16\xd9\x05@\x00\x00\x00\xe0\xa5\x9b\xd4?\x00\x00\x00`\xc5\xfe\xd6?\x00\x00\x00`\xe5\xd0\xfe?\x00\x00\x00 \x83\xc0\xe6?\x00\x88\x9e@\x00\x00\x00@33\x08@\x00\x00\x00\xe0\xa3p\xd5?\x00\x00\x00`\x8f\xc2\xd9?\x00\x00\x00@\x8bl\xff?\x00\x00\x00\x00\xfe\xd4\xe8?\x00\x8c\x9e@\x00\x00\x00\xe0\xf9~\x0c@\x00\x00\x00`ff\xd6?\x00\x00\x00\xe0\xb3Y\xd9?\x00\x00\x00`\x91\xed\x00@\x00\x00\x00\xc0\xc8v\xea?\x00\x90\x9e@\x00\x00\x00\x00\xfe\xd4\x0f@\x00\x00\x00\xc0\x9f\x1a\xd7?\x00\x00\x00\x00\xf7u\xd8?\x00\x00\x00@\xe1z\x03@\x00\x00\x00\xa0\x99\x99\xe9?\x00\x94\x9e@\x00\x00\x00\x80\x14\xae\x11@\x00\x00\x00@\x89A\xd8?\x00\x00\x00\xa0\xed|\xd3?\x00\x00\x00\xa0\xef\xa7\x05@\x00\x00\x00\x00\xd5x\xed?\x00\x98\x9e@\x00\x00\x00 \x83@\x12@\x00\x00\x00\xe0$\x06\xd9?\x00\x00\x00`\x81\x04\xd5?\x00\x00\x00`\xe3\xa5\x05@\x00\x00\x00\xa0n\x12\xf1?\x00\x9c\x9e@\x00\x00\x00\x80=\x8a\x15@\x00\x00\x00\x80\x95C\xdb?\x00\x00\x00\xa0\xab\xad\xd8?\x00\x00\x00\xa0\x9b\xc4\x06@\x00\x00\x00\xc0\xf7S\xf1?\x00\xa0\x9e@\x00\x00\x00\xc0K7\x16@\x00\x00\x00 X9\xdc?\x00\x00\x00@io\xd4?\x00\x00\x00\xa0E\xb6\x08@\x00\x00\x00\x00-\xb2\xf7?\x00\xa4\x9e@\x00\x00\x00\x00)\xdc\x15@\x00\x00\x00\xe0\xa3p\xdd?\x00\x00\x00@\xa2\xb4\xd3?\x00\x00\x00 \xdb\xf9\x08@\x00\x00\x00\xe0\xa7\xc6\xfb?\x00\xa8\x9e@\x00\x00\x00\xc0\xccL\x17@\x00\x00\x00\x80=\n\xdf?\x00\x00\x00@\x116\xd8?\x00\x00\x00\x00\xd5x\t@\x00\x00\x00`\xe5\xd0\xfe?\x00\xac\x9e@\x00\x00\x00 
\x06\x81\x1b@\x00\x00\x00\xe0&1\xe0?\x00\x00\x00 \x83\xc0\xda?\x00\x00\x00\xc0\x9f\x1a\n@\x00\x00\x00\xc0\xf7S\x00@\x00\xb0\x9e@\x00\x00\x00\x80\xc0J\x1f@\x00\x00\x00\xc0K7\xe1?\x00\x00\x00\xa0\x87\x85\xe0?\x00\x00\x00\xa0\xc6K\x0b@\x00\x00\x00@\xb6\xf3\xff?\x00\xb4\x9e@\x00\x00\x00\xa0p="@\x00\x00\x00\xc0I\x0c\xe2?\x00\x00\x00\xa0\x13\xd0\xe2?\x00\x00\x00`\xe7\xfb\x0c@\x00\x00\x00\x00V\x0e\x02@\x00\xb8\x9e@\x00\x00\x00\xe0$\x06%@\x00\x00\x00 \x83\xc0\xe2?\x00\x00\x00\xe0H.\xe1?\x00\x00\x00\xa0\xc6K\x10@\x00\x00\x00\xc0\x9d\xef\x05@\x00\xbc\x9e@\x00\x00\x00\x80=\n*@\x00\x00\x00\x80l\xe7\xe3?\x00\x00\x00@io\xdc?\x00\x00\x00@\n\xd7\x12@\x00\x00\x00`\x12\x83\x0c@\x00\xc0\x9e@\x00\x00\x00\xc0\xa1\x85.@\x00\x00\x00@\xdfO\xe5?\x00\x00\x00\xa0e\x88\xd3?\x00\x00\x00@5\xde\x14@\x00\x00\x00\x80h\x11\x13@\x00\xc4\x9e@\x00\x00\x00\xc0 P0@\x00\x00\x00 Zd\xe7?\x00\x00\x00`\x7f\xd9\xcd?\x00\x00\x00\xe0\xa7F\x16@\x00\x00\x00\xa0C\x0b\x1a@\x00\xc8\x9e@\x00\x00\x00 \x83\x000@\x00\x00\x00@\x8d\x97\xea?\x00\x00\x00\xe06\x1a\xc8?\x00\x00\x00@\xe1\xfa\x15@\x00\x00\x00@\x0c\x82\x1e@\x00\xcc\x9e@\x00\x00\x00 \x83\xc0/@\x00\x00\x00\xc0\xf3\xfd\xec?\x00\x00\x00`\xf7\xe4\xc9?\x00\x00\x00 \x04V\x15@\x00\x00\x00\x80\x93X!@\x00\xd0\x9e@\x00\x00\x00\xe0x\xa90@\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\xa0\xd4\t\xd0?\x00\x00\x00\xa0Ga\x15@\x00\x00\x00\xe0x\xa9 @\x00\xd4\x9e@\x00\x00\x00\x80\x95\x031@\x00\x00\x00@`\xe5\xf0?\x00\x00\x00@@\x13\xd1?\x00\x00\x00`\xe3\xa5\x16@\x00\x00\x00 /\x1d!@\x00\xd8\x9e@\x00\x00\x00\x80\x14N3@\x00\x00\x00\x80\x93\x18\xf2?\x00\x00\x00\xa0\xb2\x0c\xd1?\x00\x00\x00\x00\x7f\xea\x16@\x00\x00\x00\xa0\x18\x04#@\x00\xdc\x9e@\x00\x00\x00\x80\x93\xb82@\x00\x00\x00@\xb6\xf3\xf3?\x00\x00\x00\xc0\xeas\xcd?\x00\x00\x00\x00T\xe3\x16@\x00\x00\x00\x80\xbe\x1f"@\x00\xe0\x9e@\x00\x00\x00\x00\x00@3@\x00\x00\x00\x00\x00\x00\xf6?\x00\x00\x00\xc0\xc1\x17\xd6?\x00\x00\x00\xc0I\x0c\x17@\x00\x00\x00\xe0$\x86 @\x00\xe4\x9e@\x00\x00\x00\xc0\xa1\xa54@\x00\x00\x00`9\xb4\xf8?\x00\x00\x00@\xe8\xd9\xdc?\x00\x00\x00@\x0c\x82\x17@\x00\x00\x00@`\xe5\x1d@\x00\xe8\x9e@\x00\x00\x00 
\xdb\xb96@\x00\x00\x00\xe0|?\xfb?\x00\x00\x00@p\xce\xe2?\x00\x00\x00\x80\x97n\x18@\x00\x00\x00\x00\x7fj\x1c@\x00\xec\x9e@\x00\x00\x00\xc0v\x9e7@\x00\x00\x00\xc0\xc8v\xfc?\x00\x00\x00\x80q\x1b\xe1?\x00\x00\x00\xc0rh\x1b@\x00\x00\x00\xe0\xf9~\x1b@\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x
00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xfe\xfb\xff\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00p\x00\r\x00\x00\x00\x00\x00\x00\x00\xfe\xfb\xff\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00`\x00\x0b\x00\x00\x00\x00\x00\x00\x00\xfe\xfb\xff\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00L\x00\r\x00\x00\x00\x00\x00\x00\x00\xfe\xfb\xff\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00<\x00\t\x00\x00\x00\x00\x00\x00\x00\xfe\xfb\xff\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00(\x00\x0f\x00\x00\x00\x00\x00\x00\x00\xfe\xfb\xff\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00 \x00\x04\x00\x00\x00\x00\x00\x00\x00\xfc\xff\xff\xffP\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x04\x00\x00\x00\x00\x04\x01\x00\x04\x00\x00\x00\x08\x00\x00\x00\x00\x04\x01\x00\x0c\x00\x00\x00\x08\x00\x00\x00\x00\x04\x01\x00\x14\x00\x00\x00\x08\x00\x00\x00\x00\x04\x01\x00\x1c\x00\x00\x00\x08\x00\x00\x00\x00\x04\x01\x00$\x00\x00\x00\x08\x00\x00\x00\x00\x04\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\xff\xff\xff\xff8\x00\x00\x00\x00\x00\x00\x00\x00\x00\x1c\x00\x04\x00\x00\x00\x00\x00$\x00\x01\x00\x00\x00\x00\x008\x00\x01\x00\x00\x00\x00\x00H\x00\x01\x00\x00\x00\x00\x00\\\x00\x01\x00\x00\x00\x00\x00l\x00\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xfd\xff\xff\xff\x90\x00\x10\x00\x80\x00\x00\x00\x00\x00\x00\x00Written by SAS\x00\x00YEARyearY\x00\x00\x00level of output\x00W\x00\x00\x00wage rate\x00\x00\x00R\x00\x00\x00interest rate\x00\x00\x00L\x00\x00\x00labor input\x00K\x00\x00\x00capital 
input\x00\x00\x00\x01\x00\x00\x00\x10\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xfc\xff\xff0\x00\x00\x00\x04\x00\x00\x00\x07\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x0c\x07\x00\x00\x00\x00\x00\x00\xfc\xff\xff\xff\x01\x00\x00\x00\x06\x00\x00\x00\x01\x00\x00\x00\x06\x00\x00\x00\xfd\xff\xff\xff\x01\x00\x00\x00\x04\x00\x00\x00\x01\x00\x00\x00\x04\x00\x00\x00\xff\xff\xff\xff\x01\x00\x00\x00\x05\x00\x00\x00\x01\x00\x00\x00\x05\x00\x00\x00\xfe\xff\xff\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xfb\xff\xff\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xfa\xff\xff\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xf9\xff\xff\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xf6\xf6\xf6\xf6\x06\x00\x00\x00\x00\x00\x00\x00\xf7\xf7\xf7\xf7\xcd\x00\x00\x00\x0e\x00\x00\x00\x00\x00\x00\x00\x110\x02\x00,\x00\x00\x00 \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x06\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00.\x00\x00\x00\x00\x10\x00\x00\x00\x00\x00\x00 \x00\x00\x00\xff\xff\xff\xff\xff\xff\xff\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00kIW-\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\x02\x00\x00\x00\x01\x00\x00\x00\x0c\x00\x00\x00\x01\x00\x00\x00\x0e\x00\x00\x00\x01\x00\x00\x00-\x00\x00\x00\x01\x00\x00\x00\x07\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x04\x00\x00\x00\x0c\x00\x10\x00\x00\x00\x14\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x0c\x00\x08\x00\x00\x00\x1c\x00\x04\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x04\x00\x01\x00\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\\\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x05\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00' diff --git a/kedro-datasets/tests/pandas/test_hdf_dataset.py b/kedro-datasets/tests/pandas/test_hdf_dataset.py index 07860d745..74b3fee86 100644 --- a/kedro-datasets/tests/pandas/test_hdf_dataset.py +++ b/kedro-datasets/tests/pandas/test_hdf_dataset.py @@ -144,7 +144,7 @@ def 
test_thread_lock_usage(self, hdf_dataset, dummy_dataframe, mocker): hdf_dataset.save(dummy_dataframe) calls = [ - mocker.call.__enter__(), # pylint: disable=unnecessary-dunder-call + mocker.call.__enter__(), mocker.call.__exit__(None, None, None), ] mocked_lock.assert_has_calls(calls) diff --git a/kedro-datasets/tests/pandas/test_sql_dataset.py b/kedro-datasets/tests/pandas/test_sql_dataset.py index 10b9cb093..26f7e0bd4 100644 --- a/kedro-datasets/tests/pandas/test_sql_dataset.py +++ b/kedro-datasets/tests/pandas/test_sql_dataset.py @@ -1,4 +1,3 @@ -# pylint: disable=no-member import importlib from pathlib import PosixPath from unittest.mock import ANY diff --git a/kedro-datasets/tests/spark/test_spark_dataset.py b/kedro-datasets/tests/spark/test_spark_dataset.py index 010f65895..393b401f5 100644 --- a/kedro-datasets/tests/spark/test_spark_dataset.py +++ b/kedro-datasets/tests/spark/test_spark_dataset.py @@ -1,4 +1,3 @@ -# pylint: disable=too-many-lines import importlib import re import sys @@ -182,7 +181,6 @@ def test_deprecation(module_name, class_name): getattr(importlib.import_module(module_name), class_name) -# pylint: disable=too-many-public-methods class TestSparkDataset: def test_load_parquet(self, tmp_path, sample_pandas_df): temp_path = (tmp_path / "data").as_posix() @@ -537,8 +535,8 @@ def test_save_version_warning(self, tmp_path, sample_spark_df): ) pattern = ( - r"Save version '{ev.save}' did not match load version " - r"'{ev.load}' for SparkDataset\(.+\)".format(ev=exact_version) + rf"Save version '{exact_version.save}' did not match load version " + rf"'{exact_version.load}' for SparkDataset\(.+\)" ) with pytest.warns(UserWarning, match=pattern): ds_local.save(sample_spark_df) @@ -578,7 +576,7 @@ def test_versioning_existing_dataset( sys.platform.startswith("win"), reason="DBFS doesn't work on Windows" ) class TestSparkDatasetVersionedDBFS: - def test_load_latest( # pylint: disable=too-many-arguments + def test_load_latest( self, mocker, versioned_dataset_dbfs, version, tmp_path, sample_spark_df ): mocked_glob = mocker.patch.object(versioned_dataset_dbfs, "_glob_function") @@ -605,7 +603,7 @@ def test_load_exact(self, tmp_path, sample_spark_df): assert reloaded.exceptAll(sample_spark_df).count() == 0 - def test_save( # pylint: disable=too-many-arguments + def test_save( self, mocker, versioned_dataset_dbfs, version, tmp_path, sample_spark_df ): mocked_glob = mocker.patch.object(versioned_dataset_dbfs, "_glob_function") @@ -618,7 +616,7 @@ def test_save( # pylint: disable=too-many-arguments ) assert (tmp_path / FILENAME / version.save / FILENAME).exists() - def test_exists( # pylint: disable=too-many-arguments + def test_exists( self, mocker, versioned_dataset_dbfs, version, tmp_path, sample_spark_df ): mocked_glob = mocker.patch.object(versioned_dataset_dbfs, "_glob_function") @@ -750,9 +748,7 @@ def test_load_latest(self, mocker, versioned_dataset_s3): versioned_dataset_s3.load() - mocked_glob.assert_called_once_with( - "{b}/{f}/*/{f}".format(b=BUCKET_NAME, f=FILENAME) - ) + mocked_glob.assert_called_once_with(f"{BUCKET_NAME}/{FILENAME}/*/{FILENAME}") get_spark.return_value.read.load.assert_called_once_with( "s3a://{b}/{f}/{v}/{f}".format( b=BUCKET_NAME, f=FILENAME, v="mocked_version" @@ -771,7 +767,7 @@ def test_load_exact(self, mocker): ds_s3.load() get_spark.return_value.read.load.assert_called_once_with( - "s3a://{b}/{f}/{v}/{f}".format(b=BUCKET_NAME, f=FILENAME, v=ts), "parquet" + f"s3a://{BUCKET_NAME}/{FILENAME}/{ts}/{FILENAME}", "parquet" ) def test_save(self, 
versioned_dataset_s3, version, mocker): @@ -785,7 +781,7 @@ def test_save(self, versioned_dataset_s3, version, mocker): versioned_dataset_s3.save(mocked_spark_df) mocked_spark_df.write.save.assert_called_once_with( - "s3a://{b}/{f}/{v}/{f}".format(b=BUCKET_NAME, f=FILENAME, v=version.save), + f"s3a://{BUCKET_NAME}/{FILENAME}/{version.save}/{FILENAME}", "parquet", ) @@ -799,15 +795,13 @@ def test_save_version_warning(self, mocker): mocked_spark_df = mocker.Mock() pattern = ( - r"Save version '{ev.save}' did not match load version " - r"'{ev.load}' for SparkDataset\(.+\)".format(ev=exact_version) + rf"Save version '{exact_version.save}' did not match load version " + rf"'{exact_version.load}' for SparkDataset\(.+\)" ) with pytest.warns(UserWarning, match=pattern): ds_s3.save(mocked_spark_df) mocked_spark_df.write.save.assert_called_once_with( - "s3a://{b}/{f}/{v}/{f}".format( - b=BUCKET_NAME, f=FILENAME, v=exact_version.save - ), + f"s3a://{BUCKET_NAME}/{FILENAME}/{exact_version.save}/{FILENAME}", "parquet", ) @@ -883,7 +877,7 @@ def test_load_exact(self, mocker): versioned_hdfs.load() get_spark.return_value.read.load.assert_called_once_with( - "hdfs://{fn}/{f}/{v}/{f}".format(fn=FOLDER_NAME, f=FILENAME, v=ts), + f"hdfs://{FOLDER_NAME}/{FILENAME}/{ts}/{FILENAME}", "parquet", ) @@ -905,13 +899,11 @@ def test_save(self, mocker, version): versioned_hdfs.save(mocked_spark_df) hdfs_status.assert_called_once_with( - "{fn}/{f}/{v}/{f}".format(fn=FOLDER_NAME, v=version.save, f=FILENAME), + f"{FOLDER_NAME}/{FILENAME}/{version.save}/{FILENAME}", strict=False, ) mocked_spark_df.write.save.assert_called_once_with( - "hdfs://{fn}/{f}/{v}/{f}".format( - fn=FOLDER_NAME, v=version.save, f=FILENAME - ), + f"hdfs://{FOLDER_NAME}/{FILENAME}/{version.save}/{FILENAME}", "parquet", ) @@ -924,16 +916,14 @@ def test_save_version_warning(self, mocker): mocked_spark_df = mocker.Mock() pattern = ( - r"Save version '{ev.save}' did not match load version " - r"'{ev.load}' for SparkDataset\(.+\)".format(ev=exact_version) + rf"Save version '{exact_version.save}' did not match load version " + rf"'{exact_version.load}' for SparkDataset\(.+\)" ) with pytest.warns(UserWarning, match=pattern): versioned_hdfs.save(mocked_spark_df) mocked_spark_df.write.save.assert_called_once_with( - "hdfs://{fn}/{f}/{sv}/{f}".format( - fn=FOLDER_NAME, f=FILENAME, sv=exact_version.save - ), + f"hdfs://{FOLDER_NAME}/{FILENAME}/{exact_version.save}/{FILENAME}", "parquet", ) @@ -955,7 +945,7 @@ def test_prevent_overwrite(self, mocker, version): versioned_hdfs.save(mocked_spark_df) hdfs_status.assert_called_once_with( - "{fn}/{f}/{v}/{f}".format(fn=FOLDER_NAME, v=version.save, f=FILENAME), + f"{FOLDER_NAME}/{FILENAME}/{version.save}/{FILENAME}", strict=False, ) mocked_spark_df.write.save.assert_not_called() diff --git a/kedro-datasets/tests/spark/test_spark_hive_dataset.py b/kedro-datasets/tests/spark/test_spark_hive_dataset.py index 4a7f4c97e..e33ca5cce 100644 --- a/kedro-datasets/tests/spark/test_spark_hive_dataset.py +++ b/kedro-datasets/tests/spark/test_spark_hive_dataset.py @@ -52,8 +52,8 @@ def spark_session(): pass # remove the cached JVM vars - SparkContext._jvm = None # pylint: disable=protected-access - SparkContext._gateway = None # pylint: disable=protected-access + SparkContext._jvm = None + SparkContext._gateway = None # py4j doesn't shutdown properly so kill the actual JVM process for obj in gc.get_objects(): @@ -145,7 +145,7 @@ def test_deprecation(module_name, class_name): class TestSparkHiveDataset: def test_cant_pickle(self): - 
import pickle # pylint: disable=import-outside-toplevel + import pickle with pytest.raises(pickle.PicklingError): pickle.dumps( diff --git a/kedro-datasets/tests/tensorflow/test_tensorflow_model_dataset.py b/kedro-datasets/tests/tensorflow/test_tensorflow_model_dataset.py index 03d016e4b..ffeafe321 100644 --- a/kedro-datasets/tests/tensorflow/test_tensorflow_model_dataset.py +++ b/kedro-datasets/tests/tensorflow/test_tensorflow_model_dataset.py @@ -1,4 +1,3 @@ -# pylint: disable=import-outside-toplevel import importlib from pathlib import PurePosixPath @@ -125,7 +124,6 @@ def __init__(self): self.dense1 = tf.keras.layers.Dense(4, activation=tf.nn.relu) self.dense2 = tf.keras.layers.Dense(5, activation=tf.nn.softmax) - # pylint: disable=unused-argument def call(self, inputs, training=None, mask=None): # pragma: no cover x = self.dense1(inputs) return self.dense2(x) @@ -313,7 +311,7 @@ def test_save_and_load( dummy_x_test, load_version, save_version, - ): # pylint: disable=unused-argument + ): """Test saving and reloading the versioned data set.""" predictions = dummy_tf_base_model.predict(dummy_x_test) diff --git a/kedro-docker/.pylintrc b/kedro-docker/.pylintrc deleted file mode 100644 index e1f257e03..000000000 --- a/kedro-docker/.pylintrc +++ /dev/null @@ -1,425 +0,0 @@ -[MASTER] - -# A comma-separated list of package or module names from where C extensions may -# be loaded. Extensions are loading into the active Python interpreter and may -# run arbitrary code -extension-pkg-whitelist= - -# Add files or directories to the blacklist. They should be base names, not -# paths. -ignore=CVS - -# Add files or directories matching the regex patterns to the blacklist. The -# regex matches against base names, not paths. -ignore-patterns= - -# Python code to execute, usually for sys.path manipulation such as -# pygtk.require(). -#init-hook= - -# Use multiple processes to speed up Pylint. -jobs=1 - -# List of plugins (as comma separated values of python modules names) to load, -# usually to register additional checkers. -load-plugins=pylint.extensions.docparams - -# Pickle collected data for later comparisons. -persistent=yes - -# Specify a configuration file. -#rcfile= - -# Allow loading of arbitrary C extensions. Extensions are imported into the -# active Python interpreter and may run arbitrary code. -unsafe-load-any-extension=no - - -[MESSAGES CONTROL] - -# Only show warnings with the listed confidence levels. Leave empty to show -# all. Valid levels: HIGH, INFERENCE, INFERENCE_FAILURE, UNDEFINED -confidence= - -# Disable the message, report, category or checker with the given id(s). You -# can either give multiple identifiers separated by comma (,) or put this -# option multiple times (only on the command line, not in the configuration -# file where it should appear only once).You can also use "--disable=all" to -# disable everything first and then reenable specific checks. For example, if -# you want to run only the similarities checker, you can use "--disable=all -# --enable=similarities". If you want to run only the classes checker, but have -# no Warning level messages displayed, use"--disable=all --enable=classes -# --disable=W" -disable=ungrouped-imports,bad-continuation - -# Enable the message, report, category or checker with the given id(s). You can -# either give multiple identifier separated by comma (,) or put this option -# multiple time (only on the command line, not in the configuration file where -# it should appear only once). See also the "--disable" option for examples. 
-enable=useless-suppression - - -[REPORTS] - -# Python expression which should return a note less than 10 (10 is the highest -# note). You have access to the variables errors warning, statement which -# respectively contain the number of errors / warnings messages and the total -# number of statements analyzed. This is used by the global evaluation report -# (RP0004). -evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10) - -# Template used to display messages. This is a python new-style format string -# used to format the message information. See doc for all details -#msg-template= - -# Set the output format. Available formats are text, parseable, colorized, json -# and msvs (visual studio).You can also give a reporter class, eg -# mypackage.mymodule.MyReporterClass. -output-format=text - -# Tells whether to display a full report or only the messages -reports=no - -# Activate the evaluation score. -score=yes - - -[REFACTORING] - -# Maximum number of nested blocks for function / method body -max-nested-blocks=5 - - -[BASIC] - -# Naming hint for argument names -argument-name-hint=(([a-z][a-z0-9_]{2,30})|(_[a-z0-9_]*))$ - -# Regular expression matching correct argument names -argument-rgx=(([a-z][a-z0-9_]{2,30})|(_[a-z0-9_]*))$ - -# Naming hint for attribute names -attr-name-hint=(([a-z][a-z0-9_]{2,30})|(_[a-z0-9_]*))$ - -# Regular expression matching correct attribute names -attr-rgx=(([a-z][a-z0-9_]{2,30})|(_[a-z0-9_]*))$ - -# Bad variable names which should always be refused, separated by a comma -bad-names=foo,bar,baz,toto,tutu,tata - -# Naming hint for class attribute names -class-attribute-name-hint=([A-Za-z_][A-Za-z0-9_]{2,30}|(__.*__))$ - -# Regular expression matching correct class attribute names -class-attribute-rgx=([A-Za-z_][A-Za-z0-9_]{2,30}|(__.*__))$ - -# Naming hint for class names -class-name-hint=[A-Z_][a-zA-Z0-9]+$ - -# Regular expression matching correct class names -class-rgx=[A-Z_][a-zA-Z0-9]+$ - -# Naming hint for constant names -const-name-hint=(([A-Z_][A-Z0-9_]*)|(__.*__))$ - -# Regular expression matching correct constant names -const-rgx=(([A-Z_][A-Z0-9_]*)|(__.*__))$ - -# Minimum line length for functions/classes that require docstrings, shorter -# ones are exempt. -docstring-min-length=-1 - -# Naming hint for function names -function-name-hint=(([a-z][a-z0-9_]{2,30})|(_[a-z0-9_]*))$ - -# Regular expression matching correct function names -function-rgx=(([a-z][a-z0-9_]{2,30})|(_[a-z0-9_]*))$ - -# Good variable names which should always be accepted, separated by a comma -good-names=i,j,k,ex,Run,_ - -# Include a hint for the correct naming format with invalid-name -include-naming-hint=no - -# Naming hint for inline iteration names -inlinevar-name-hint=[A-Za-z_][A-Za-z0-9_]*$ - -# Regular expression matching correct inline iteration names -inlinevar-rgx=[A-Za-z_][A-Za-z0-9_]*$ - -# Naming hint for method names -method-name-hint=(([a-z][a-z0-9_]{2,30})|(_[a-z0-9_]*))$ - -# Regular expression matching correct method names -method-rgx=(([a-z][a-z0-9_]{2,30})|(_[a-z0-9_]*))$ - -# Naming hint for module names -module-name-hint=(([a-z_][a-z0-9_]*)|([A-Z][a-zA-Z0-9]+))$ - -# Regular expression matching correct module names -module-rgx=(([a-z_][a-z0-9_]*)|([A-Z][a-zA-Z0-9]+))$ - -# Colon-delimited sets of names that determine each other's naming style when -# the name regexes allow several styles. -name-group= - -# Regular expression which should only match function or class names that do -# not require a docstring. 
-no-docstring-rgx=^_ - -# List of decorators that produce properties, such as abc.abstractproperty. Add -# to this list to register other decorators that produce valid properties. -property-classes=abc.abstractproperty - -# Naming hint for variable names -variable-name-hint=(([a-z][a-z0-9_]{2,30})|(_[a-z0-9_]*))$ - -# Regular expression matching correct variable names -variable-rgx=(([a-z][a-z0-9_]{2,30})|(_[a-z0-9_]*))$ - - -[FORMAT] - -# Expected format of line ending, e.g. empty (any line ending), LF or CRLF. -expected-line-ending-format= - -# Regexp for a line that is allowed to be longer than the limit. -ignore-long-lines=^\s*(# )??$ - -# Number of spaces of indent required inside a hanging or continued line. -indent-after-paren=4 - -# String used as indentation unit. This is usually " " (4 spaces) or "\t" (1 -# tab). -indent-string=' ' - -# Maximum number of characters on a single line. -max-line-length=100 - -# Maximum number of lines in a module -max-module-lines=1000 - -# List of optional constructs for which whitespace checking is disabled. `dict- -# separator` is used to allow tabulation in dicts, etc.: {1 : 1,\n222: 2}. -# `trailing-comma` allows a space between comma and closing bracket: (a, ). -# `empty-line` allows space-only lines. -no-space-check=trailing-comma,dict-separator - -# Allow the body of a class to be on the same line as the declaration if body -# contains single statement. -single-line-class-stmt=no - -# Allow the body of an if to be on the same line as the test if there is no -# else. -single-line-if-stmt=no - - -[LOGGING] - -# Logging modules to check that the string format arguments are in logging -# function parameter format -logging-modules=logging - - -[MISCELLANEOUS] - -# List of note tags to take in consideration, separated by a comma. -notes=FIXME,XXX,TODO - - -[SIMILARITIES] - -# Ignore comments when computing similarities. -ignore-comments=yes - -# Ignore docstrings when computing similarities. -ignore-docstrings=yes - -# Ignore imports when computing similarities. -ignore-imports=no - -# Minimum lines number of a similarity. -min-similarity-lines=20 - - -[SPELLING] - -# Spelling dictionary name. Available dictionaries: none. To make it working -# install python-enchant package. -spelling-dict= - -# List of comma separated words that should not be checked. -spelling-ignore-words= - -# A path to a file that contains private dictionary; one word per line. -spelling-private-dict-file= - -# Tells whether to store unknown words to indicated private dictionary in -# --spelling-private-dict-file option instead of raising a message. -spelling-store-unknown-words=no - - -[TYPECHECK] - -# List of decorators that produce context managers, such as -# contextlib.contextmanager. Add to this list to register other decorators that -# produce valid context managers. -contextmanager-decorators=contextlib.contextmanager - -# List of members which are set dynamically and missed by pylint inference -# system, and so shouldn't trigger E1101 when accessed. Python regular -# expressions are accepted. -generated-members= - -# Tells whether missing members accessed in mixin class should be ignored. A -# mixin class is detected if its name ends with "mixin" (case insensitive). -ignore-mixin-members=yes - -# This flag controls whether pylint should warn about no-member and similar -# checks whenever an opaque object is returned when inferring. 
The inference -# can return multiple potential results while evaluating a Python object, but -# some branches might not be evaluated, which results in partial inference. In -# that case, it might be useful to still emit no-member and other checks for -# the rest of the inferred objects. -ignore-on-opaque-inference=yes - -# List of class names for which member attributes should not be checked (useful -# for classes with dynamically set attributes). This supports the use of -# qualified names. -ignored-classes=optparse.Values,thread._local,_thread._local - -# List of module names for which member attributes should not be checked -# (useful for modules/projects where namespaces are manipulated during runtime -# and thus existing member attributes cannot be deduced by static analysis. It -# supports qualified module names, as well as Unix pattern matching. -ignored-modules= - -# Show a hint with possible names when a member name was not found. The aspect -# of finding the hint is based on edit distance. -missing-member-hint=yes - -# The minimum edit distance a name should have in order to be considered a -# similar match for a missing member name. -missing-member-hint-distance=1 - -# The total number of similar names that should be taken in consideration when -# showing a hint for a missing member. -missing-member-max-choices=1 - - -[VARIABLES] - -# List of additional names supposed to be defined in builtins. Remember that -# you should avoid to define new builtins when possible. -additional-builtins= - -# Tells whether unused global variables should be treated as a violation. -allow-global-unused-variables=yes - -# List of strings which can identify a callback function by name. A callback -# name must start or end with one of those strings. -callbacks=cb_,_cb - -# A regular expression matching the name of dummy variables (i.e. expectedly -# not used). -dummy-variables-rgx=_+$|(_[a-zA-Z0-9_]*[a-zA-Z0-9]+?$)|dummy|^ignored_|^unused_ - -# Argument names that match this expression will be ignored. Default to name -# with leading underscore -ignored-argument-names=_.*|^ignored_|^unused_ - -# Tells whether we should check for unused import in __init__ files. -init-import=no - -# List of qualified module names which can have objects that can redefine -# builtins. -redefining-builtins-modules=six.moves,future.builtins - - -[CLASSES] - -# List of method names used to declare (i.e. assign) instance attributes. -defining-attr-methods=__init__,__new__,setUp - -# List of member names, which should be excluded from the protected access -# warning. -exclude-protected=_asdict,_fields,_replace,_source,_make - -# List of valid names for the first argument in a class method. -valid-classmethod-first-arg=cls - -# List of valid names for the first argument in a metaclass class method. -valid-metaclass-classmethod-first-arg=mcs - - -[DESIGN] - -# Maximum number of arguments for function / method -max-args=5 - -# Maximum number of attributes for a class (see R0902). -max-attributes=7 - -# Maximum number of boolean expressions in a if statement -max-bool-expr=5 - -# Maximum number of branch for function / method body -max-branches=12 - -# Maximum number of locals for function / method body -max-locals=15 - -# Maximum number of parents for a class (see R0901). -max-parents=7 - -# Maximum number of public methods for a class (see R0904). 
-max-public-methods=20 - -# Maximum number of return / yield for function / method body -max-returns=6 - -# Maximum number of statements in function / method body -max-statements=50 - -# Minimum number of public methods for a class (see R0903). -min-public-methods=1 - - -[IMPORTS] - -# Allow wildcard imports from modules that define __all__. -allow-wildcard-with-all=no - -# Analyse import fallback blocks. This can be used to support both Python 2 and -# 3 compatible code, which means that the block might have code that exists -# only in one or another interpreter, leading to false positives when analysed. -analyse-fallback-blocks=no - -# Deprecated modules which should not be used, separated by a comma -deprecated-modules=optparse,tkinter.tix - -# Create a graph of external dependencies in the given file (report RP0402 must -# not be disabled) -ext-import-graph= - -# Create a graph of every (i.e. internal and external) dependencies in the -# given file (report RP0402 must not be disabled) -import-graph= - -# Create a graph of internal dependencies in the given file (report RP0402 must -# not be disabled) -int-import-graph= - -# Force import order to recognize a module as part of the standard -# compatibility libraries. -known-standard-library= - -# Force import order to recognize a module as part of a third party library. -known-third-party=enchant - - -[EXCEPTIONS] - -# Exceptions that will emit a warning when being caught. Defaults to -# "Exception" -overgeneral-exceptions=Exception diff --git a/kedro-docker/CONTRIBUTING.md b/kedro-docker/CONTRIBUTING.md index 7bbab860a..57e92017a 100644 --- a/kedro-docker/CONTRIBUTING.md +++ b/kedro-docker/CONTRIBUTING.md @@ -84,20 +84,20 @@ pip install ".[test]" All checks run by our CI / CD pipeline can be run locally on your computer. 
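The lint check now runs `ruff` and `black` in place of `isort`, `pylint` and `flake8`. A minimal sketch of what the selected rule families catch — assuming a hypothetical module, not code from kedro-docker — is:

```python
# hypothetical_example.py -- illustrates the rule families selected in [tool.ruff];
# this module is not part of kedro-docker.
from typing import Any, Dict, Optional


# PLR0913 ("too many arguments") fires once a signature exceeds the default
# argument limit; the inline suppression below is the same pattern used
# throughout this patch.
def build_image(  # noqa: PLR0913
    uid: int,
    gid: int,
    spark: bool,
    base_image: str,
    image: str,
    docker_args: Optional[Dict[str, Any]] = None,
) -> None:
    """Pretend to build an image; only here to show what the linter checks."""
    if spark:
        # T201 would flag this bare print in package code; the per-file-ignores
        # added later in this patch allow it under tests/ and features/.
        print(f"Building {image} from {base_image} for {uid}:{gid}")
```

The same `# noqa` pattern replaces the old `# pylint: disable=...` comments across every plugin in this patch.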
-#### PEP-8 Standards (`isort`, `pylint` and `flake8`) +#### Linting (`ruff` and `black`) ```bash -make lint +make plugin=kedro-docker lint ``` #### Unit tests, 100% coverage (`pytest`, `pytest-cov`) ```bash -make test +make plugin=kedro-docker test ``` #### End-to-end tests (`behave`) ```bash -make e2e-tests +make plugin=kedro-docker e2e-tests ``` diff --git a/kedro-docker/features/environment.py b/kedro-docker/features/environment.py index 930f97a7d..e006227ee 100644 --- a/kedro-docker/features/environment.py +++ b/kedro-docker/features/environment.py @@ -91,7 +91,6 @@ def after_all(context): def before_scenario(context, feature): - # pylint: disable=unused-argument context.temp_dir = Path(tempfile.mkdtemp()) diff --git a/kedro-docker/features/steps/sh_run.py b/kedro-docker/features/steps/sh_run.py index 66ef9092e..7d9f6152a 100644 --- a/kedro-docker/features/steps/sh_run.py +++ b/kedro-docker/features/steps/sh_run.py @@ -9,7 +9,7 @@ def run( cmd: Union[str, Sequence], split: bool = True, print_output: bool = False, - **kwargs: str + **kwargs: str, ) -> subprocess.CompletedProcess: """ Args: @@ -39,10 +39,7 @@ def run( """ if isinstance(cmd, str) and split: cmd = shlex.split(cmd) - # pylint: disable=subprocess-run-check - result = subprocess.run( - cmd, input="", stdout=subprocess.PIPE, stderr=subprocess.PIPE, **kwargs - ) + result = subprocess.run(cmd, input="", capture_output=True, **kwargs) result.stdout = result.stdout.decode("utf-8") result.stderr = result.stderr.decode("utf-8") if print_output: diff --git a/kedro-docker/features/steps/util.py b/kedro-docker/features/steps/util.py index 2d259f2ce..dd212f6ca 100644 --- a/kedro-docker/features/steps/util.py +++ b/kedro-docker/features/steps/util.py @@ -43,7 +43,7 @@ def wait_for( while time() <= end: try: retval = func(**kwargs) - except Exception as err: # pylint: disable=broad-except + except Exception as err: if print_error: print(err) else: diff --git a/kedro-docker/kedro_docker/helpers.py b/kedro-docker/kedro_docker/helpers.py index 879ec4fab..981bfcdcb 100644 --- a/kedro-docker/kedro_docker/helpers.py +++ b/kedro-docker/kedro_docker/helpers.py @@ -57,8 +57,7 @@ def _list_docker_volumes(host_root: str, container_root: str, volumes: Sequence[ yield "-v", str(hpath) + ":" + str(cpath) -# pylint: disable=too-many-arguments -def compose_docker_run_args( +def compose_docker_run_args( # noqa: PLR0913 host_root: str = None, container_root: str = None, mount_volumes: Sequence[str] = None, diff --git a/kedro-docker/kedro_docker/plugin.py b/kedro-docker/kedro_docker/plugin.py index 27af7db96..eabd7986e 100644 --- a/kedro-docker/kedro_docker/plugin.py +++ b/kedro-docker/kedro_docker/plugin.py @@ -1,5 +1,4 @@ """ Kedro plugin for packaging a project with Docker """ -# pylint: disable=unused-argument import shlex import subprocess from pathlib import Path @@ -88,7 +87,7 @@ def _make_docker_args_option(**kwargs): @click.group(name="Kedro-Docker") -def commands(): # pylint: disable=missing-function-docstring +def commands(): pass @@ -125,7 +124,7 @@ def docker_init(spark): if KEDRO_VERSION.match(">=0.17.0"): verbose = KedroCliError.VERBOSE_ERROR else: - from kedro.framework.cli.cli import ( # noqa # pylint:disable=import-outside-toplevel, no-name-in-module + from kedro.framework.cli.cli import ( _VERBOSE as verbose, ) @@ -169,9 +168,7 @@ def docker_init(spark): help="Optional arguments to be passed to `docker build` command" ) @click.pass_context -def docker_build( - ctx, uid, gid, spark, base_image, image, docker_args -): # pylint: 
disable=too-many-arguments +def docker_build(ctx, uid, gid, spark, base_image, image, docker_args): # noqa: PLR0913 """Build a Docker image for the project.""" uid, gid = get_uid_gid(uid, gid) project_path = Path.cwd() @@ -210,7 +207,8 @@ def docker_run(image, docker_args, args, **kwargs): Any extra arguments unspecified in this help are passed to `docker run` as is. - **kwargs is needed to make the global `verbose` argument work and pass it through.""" + **kwargs is needed to make the global `verbose` argument work and pass it through. + """ container_name = make_container_name(image, "run") _docker_run_args = compose_docker_run_args( @@ -233,7 +231,8 @@ def docker_ipython(image, docker_args, args, **kwargs): Any extra arguments unspecified in this help are passed to `kedro ipython` command inside the container as is. - **kwargs is needed to make the global `verbose` argument work and pass it through.""" + **kwargs is needed to make the global `verbose` argument work and pass it through. + """ container_name = make_container_name(image, "ipython") _docker_run_args = compose_docker_run_args( @@ -262,7 +261,8 @@ def docker_jupyter_notebook(docker_args, port, image, args, **kwargs): Any extra arguments unspecified in this help are passed to `kedro jupyter notebook` command inside the container as is. - **kwargs is needed to make the global `verbose` argument work and pass it through.""" + **kwargs is needed to make the global `verbose` argument work and pass it through. + """ container_name = make_container_name(image, "jupyter-notebook") _docker_run_args = compose_docker_run_args( @@ -291,7 +291,8 @@ def docker_jupyter_lab(docker_args, port, image, args, **kwargs): Any extra arguments unspecified in this help are passed to `kedro jupyter lab` command inside the container as is. - **kwargs is needed to make the global `verbose` argument work and pass it through.""" + **kwargs is needed to make the global `verbose` argument work and pass it through. + """ container_name = make_container_name(image, "jupyter-lab") _docker_run_args = compose_docker_run_args( @@ -315,7 +316,8 @@ def docker_cmd(args, docker_args, image, **kwargs): """Run arbitrary command from ARGS in the Docker container. If ARGS are not specified, this will invoke `kedro run` inside the container. - **kwargs is needed to make the global `verbose` argument work and pass it through.""" + **kwargs is needed to make the global `verbose` argument work and pass it through. 
+ """ container_name = make_container_name(image, "cmd") _docker_run_args = compose_docker_run_args( diff --git a/kedro-docker/pyproject.toml b/kedro-docker/pyproject.toml index be5c89c67..e49603c6a 100644 --- a/kedro-docker/pyproject.toml +++ b/kedro-docker/pyproject.toml @@ -28,16 +28,15 @@ test = [ "behave", "black~=22.0", "docker", - "flake8>=3.5, <4.0", - "pre-commit>=1.17.0, <2.0", + "pre-commit>=2.9.2", "psutil", - "pylint>=2.4.4, <3.0", "pytest", "pytest-cov", "pytest-mock", "pytest-xdist[psutil]~=2.2.1", "PyYAML>=5.1, <7.0", "trufflehog>=2.0.99, <3.0", + "ruff~=0.0.290", "wheel==0.32.2" ] @@ -72,16 +71,25 @@ addopts = """ --no-cov-on-fail -ra""" -[tool.isort] -multi_line_output = 3 -include_trailing_comma = true -force_grid_wrap = 0 -use_parentheses = true -line_length = 88 -default_section = "THIRDPARTY" - [tool.coverage.report] fail_under = 100 show_missing = true omit = ["tests/*", "*/plugin.py"] exclude_lines = ["pragma: no cover", "raise NotImplementedError"] + +[tool.ruff] +line-length = 88 +show-fixes = true +select = [ + "F", # Pyflakes + "W", # pycodestyle + "E", # pycodestyle + "I", # isort + "UP", # pyupgrade + "PL", # Pylint + "T201", # Print Statement +] +ignore = ["E501"] # Black takes care of line-too-long + +[tool.ruff.per-file-ignores] +"{tests,features}/*" = ["T201", "PLW1510"] diff --git a/kedro-telemetry/kedro_telemetry/masking.py b/kedro-telemetry/kedro_telemetry/masking.py index 53955dffc..fe5f0a3f6 100644 --- a/kedro-telemetry/kedro_telemetry/masking.py +++ b/kedro-telemetry/kedro_telemetry/masking.py @@ -84,11 +84,10 @@ def _mask_kedro_cli(cli_struct: Dict[str, Any], command_args: List[str]) -> List output.append(arg_part) elif arg_part: output.append(MASK) - else: - if arg in vocabulary: - output.append(arg) - elif arg: - output.append(MASK) + elif arg in vocabulary: + output.append(arg) + elif arg: + output.append(MASK) return output diff --git a/kedro-telemetry/kedro_telemetry/plugin.py b/kedro-telemetry/kedro_telemetry/plugin.py index 5eeb4d489..cc27731b6 100644 --- a/kedro-telemetry/kedro_telemetry/plugin.py +++ b/kedro-telemetry/kedro_telemetry/plugin.py @@ -42,7 +42,7 @@ def _get_hashed_username(): try: username = getpass.getuser() return _hash(username) - except Exception as exc: # pylint: disable=broad-except + except Exception as exc: logger.warning( "Something went wrong with getting the username. Exception: %s", exc, @@ -53,8 +53,6 @@ def _get_hashed_username(): class KedroTelemetryCLIHooks: """Hook to send CLI command data to Heap""" - # pylint: disable=too-few-public-methods - @cli_hook_impl def before_command_run( self, project_metadata: ProjectMetadata, command_args: List[str] @@ -101,7 +99,7 @@ def before_command_run( identity=hashed_username, properties=generic_properties, ) - except Exception as exc: # pylint: disable=broad-except + except Exception as exc: logger.warning( "Something went wrong in hook implementation to send command run data to Heap. " "Exception: %s", @@ -109,7 +107,7 @@ def before_command_run( ) -class KedroTelemetryProjectHooks: # pylint: disable=too-few-public-methods +class KedroTelemetryProjectHooks: """Hook to send project statistics data to Heap""" @hook_impl @@ -209,7 +207,7 @@ def _send_heap_event( resp = requests.post( url=HEAP_ENDPOINT, headers=HEAP_HEADERS, data=json.dumps(data), timeout=10 ) - if resp.status_code != 200: + if resp.status_code != 200: # noqa: PLR2004 logger.warning( "Failed to send data to Heap. 
Response code returned: %s, Response reason: %s", resp.status_code, @@ -261,7 +259,7 @@ def _confirm_consent(telemetry_file_path: Path) -> bool: ) yaml.dump({"consent": False}, telemetry_file) return False - except Exception as exc: # pylint: disable=broad-except + except Exception as exc: logger.warning( "Failed to confirm consent. No data was sent to Heap. Exception: %s", exc, diff --git a/kedro-telemetry/pyproject.toml b/kedro-telemetry/pyproject.toml index 21d80ee1a..f5ca2627b 100644 --- a/kedro-telemetry/pyproject.toml +++ b/kedro-telemetry/pyproject.toml @@ -26,15 +26,13 @@ test = [ "bandit>=1.6.2, <2.0", "behave", "black~=22.0", - "flake8", - "isort>=4.3.21, <5.0", - "pre-commit>=1.17.0, <2.0", - "pylint>=2.5.2, <3.0", + "pre-commit>=2.9.2", "pytest", "pytest-cov", "pytest-mock", "pytest-xdist[psutil]~=2.2.1", "trufflehog>=2.1.0, <3.0", + "ruff~=0.0.290", "wheel" ] @@ -53,10 +51,19 @@ zip-safe = false readme = {file = "README.md", content-type = "text/markdown"} version = {attr = "kedro_telemetry.__version__"} -[tool.isort] -multi_line_output = 3 -include_trailing_comma = true -force_grid_wrap = 0 -use_parentheses = true -line_length = 88 -known_first_party = "kedro_telemetry" +[tool.ruff] +line-length = 88 +show-fixes = true +select = [ + "F", # Pyflakes + "W", # pycodestyle + "E", # pycodestyle + "I", # isort + "UP", # pyupgrade + "PL", # Pylint + "T201", # Print Statement +] +ignore = ["E501"] # Black takes care of line-too-long + +[tool.ruff.isort] +known-first-party = ["kedro_telemetry"] diff --git a/kedro-telemetry/tests/test_masking.py b/kedro-telemetry/tests/test_masking.py index e094ee1ae..b5ef954f9 100644 --- a/kedro-telemetry/tests/test_masking.py +++ b/kedro-telemetry/tests/test_masking.py @@ -1,5 +1,3 @@ -# pylint: disable=protected-access - """Testing module for CLI tools""" import shutil from collections import namedtuple diff --git a/kedro-telemetry/tests/test_plugin.py b/kedro-telemetry/tests/test_plugin.py index 222bcc914..ccbaf8afe 100644 --- a/kedro-telemetry/tests/test_plugin.py +++ b/kedro-telemetry/tests/test_plugin.py @@ -22,8 +22,6 @@ REPO_NAME = "dummy_project" PACKAGE_NAME = "dummy_package" -# pylint: disable=too-few-public-methods - @fixture def fake_metadata(tmp_path): @@ -379,7 +377,7 @@ def test_after_context_created_without_kedro_run( # The 1st call is the Project Hook without CLI assert mocked_heap_call.call_args_list[0] == expected_call - def test_after_context_created_with_kedro_run( + def test_after_context_created_with_kedro_run( # noqa: PLR0913 self, mocker, fake_context, From 5f884e6099e88c1909042bf12ce622dabfe107da Mon Sep 17 00:00:00 2001 From: Deepyaman Datta Date: Mon, 2 Oct 2023 17:29:26 -0500 Subject: [PATCH 02/15] feat(datasets): create custom `DeprecationWarning` (#356) * feat(datasets): create custom `DeprecationWarning` Signed-off-by: Deepyaman Datta * feat(datasets): use the custom deprecation warning Signed-off-by: Deepyaman Datta * chore(datasets): show Kedro's deprecation warnings Signed-off-by: Deepyaman Datta * fix(datasets): remove unused imports in test files Signed-off-by: Deepyaman Datta --------- Signed-off-by: Deepyaman Datta --- kedro-datasets/kedro_datasets/__init__.py | 16 ++++++++++++++++ kedro-datasets/kedro_datasets/api/api_dataset.py | 3 ++- .../biosequence/biosequence_dataset.py | 3 ++- .../kedro_datasets/dask/parquet_dataset.py | 3 ++- .../databricks/managed_table_dataset.py | 3 ++- .../kedro_datasets/email/message_dataset.py | 3 ++- .../kedro_datasets/geopandas/geojson_dataset.py | 3 ++- 
.../kedro_datasets/json/json_dataset.py | 3 ++- .../kedro_datasets/networkx/gml_dataset.py | 3 ++- .../kedro_datasets/networkx/graphml_dataset.py | 3 ++- .../kedro_datasets/networkx/json_dataset.py | 3 ++- .../kedro_datasets/pandas/csv_dataset.py | 3 ++- .../kedro_datasets/pandas/deltatable_dataset.py | 3 ++- .../kedro_datasets/pandas/excel_dataset.py | 3 ++- .../kedro_datasets/pandas/feather_dataset.py | 3 ++- .../kedro_datasets/pandas/gbq_dataset.py | 3 ++- .../kedro_datasets/pandas/generic_dataset.py | 3 ++- .../kedro_datasets/pandas/hdf_dataset.py | 3 ++- .../kedro_datasets/pandas/json_dataset.py | 3 ++- .../kedro_datasets/pandas/parquet_dataset.py | 3 ++- .../kedro_datasets/pandas/sql_dataset.py | 3 ++- .../kedro_datasets/pandas/xml_dataset.py | 3 ++- .../kedro_datasets/pickle/pickle_dataset.py | 3 ++- .../kedro_datasets/pillow/image_dataset.py | 3 ++- .../kedro_datasets/plotly/json_dataset.py | 3 ++- .../kedro_datasets/plotly/plotly_dataset.py | 5 +++-- .../kedro_datasets/polars/csv_dataset.py | 3 ++- .../kedro_datasets/polars/generic_dataset.py | 3 ++- .../kedro_datasets/redis/redis_dataset.py | 3 ++- .../kedro_datasets/snowflake/snowpark_dataset.py | 3 ++- .../kedro_datasets/spark/deltatable_dataset.py | 3 ++- .../kedro_datasets/spark/spark_dataset.py | 3 ++- .../kedro_datasets/spark/spark_hive_dataset.py | 3 ++- .../kedro_datasets/spark/spark_jdbc_dataset.py | 3 ++- .../spark/spark_streaming_dataset.py | 3 ++- .../kedro_datasets/svmlight/svmlight_dataset.py | 3 ++- .../tensorflow/tensorflow_model_dataset.py | 3 ++- .../kedro_datasets/text/text_dataset.py | 3 ++- .../kedro_datasets/tracking/json_dataset.py | 3 ++- .../kedro_datasets/tracking/metrics_dataset.py | 3 ++- .../kedro_datasets/video/video_dataset.py | 3 ++- .../kedro_datasets/yaml/yaml_dataset.py | 3 ++- kedro-datasets/tests/api/test_api_dataset.py | 5 ++++- .../biosequence/test_biosequence_dataset.py | 5 ++++- .../tests/dask/test_parquet_dataset.py | 5 ++++- .../databricks/test_managed_table_dataset.py | 5 ++++- .../tests/email/test_message_dataset.py | 5 ++++- .../tests/geopandas/test_geojson_dataset.py | 5 ++++- kedro-datasets/tests/json/test_json_dataset.py | 5 ++++- .../tests/networkx/test_gml_dataset.py | 5 ++++- .../tests/networkx/test_graphml_dataset.py | 5 ++++- .../tests/networkx/test_json_dataset.py | 5 ++++- kedro-datasets/tests/pandas/test_csv_dataset.py | 5 ++++- .../tests/pandas/test_deltatable_dataset.py | 5 ++++- .../tests/pandas/test_excel_dataset.py | 5 ++++- .../tests/pandas/test_feather_dataset.py | 5 ++++- kedro-datasets/tests/pandas/test_gbq_dataset.py | 5 ++++- .../tests/pandas/test_generic_dataset.py | 5 ++++- kedro-datasets/tests/pandas/test_hdf_dataset.py | 5 ++++- kedro-datasets/tests/pandas/test_json_dataset.py | 5 ++++- .../tests/pandas/test_parquet_dataset.py | 5 ++++- kedro-datasets/tests/pandas/test_sql_dataset.py | 5 ++++- kedro-datasets/tests/pandas/test_xml_dataset.py | 5 ++++- .../tests/pickle/test_pickle_dataset.py | 5 ++++- .../tests/pillow/test_image_dataset.py | 5 ++++- kedro-datasets/tests/plotly/test_json_dataset.py | 5 ++++- .../tests/plotly/test_plotly_dataset.py | 5 ++++- kedro-datasets/tests/polars/test_csv_dataset.py | 5 ++++- .../tests/polars/test_generic_dataset.py | 5 ++++- kedro-datasets/tests/redis/test_redis_dataset.py | 5 ++++- .../tests/snowflake/test_snowpark_dataset.py | 5 ++++- .../tests/spark/test_deltatable_dataset.py | 5 ++++- kedro-datasets/tests/spark/test_spark_dataset.py | 5 ++++- .../tests/spark/test_spark_hive_dataset.py | 5 ++++- 
.../tests/spark/test_spark_jdbc_dataset.py | 5 ++++- .../tests/spark/test_spark_streaming_dataset.py | 5 ++++- .../tests/svmlight/test_svmlight_dataset.py | 5 ++++- .../tensorflow/test_tensorflow_model_dataset.py | 5 ++++- kedro-datasets/tests/text/test_text_dataset.py | 5 ++++- .../tests/tracking/test_json_dataset.py | 5 ++++- .../tests/tracking/test_metrics_dataset.py | 5 ++++- kedro-datasets/tests/video/test_video_dataset.py | 5 ++++- kedro-datasets/tests/yaml/test_yaml_dataset.py | 5 ++++- 83 files changed, 263 insertions(+), 83 deletions(-) diff --git a/kedro-datasets/kedro_datasets/__init__.py b/kedro-datasets/kedro_datasets/__init__.py index f06eb30db..13f456ad3 100644 --- a/kedro-datasets/kedro_datasets/__init__.py +++ b/kedro-datasets/kedro_datasets/__init__.py @@ -1,3 +1,19 @@ """``kedro_datasets`` is where you can find all of Kedro's data connectors.""" +__all__ = ["KedroDeprecationWarning"] __version__ = "1.7.0" + +import sys +import warnings + +try: + # Custom `KedroDeprecationWarning` class was added in Kedro 0.18.14. + from kedro import KedroDeprecationWarning +except ImportError: + + class KedroDeprecationWarning(DeprecationWarning): + """Custom class for warnings about deprecated Kedro features.""" + + +if not sys.warnoptions: + warnings.simplefilter("default", KedroDeprecationWarning) diff --git a/kedro-datasets/kedro_datasets/api/api_dataset.py b/kedro-datasets/kedro_datasets/api/api_dataset.py index 8a696f456..b40ab1640 100644 --- a/kedro-datasets/kedro_datasets/api/api_dataset.py +++ b/kedro-datasets/kedro_datasets/api/api_dataset.py @@ -10,6 +10,7 @@ from requests import Session, sessions from requests.auth import AuthBase +from kedro_datasets import KedroDeprecationWarning from kedro_datasets._io import AbstractDataset, DatasetError @@ -248,7 +249,7 @@ def __getattr__(name): warnings.warn( f"{repr(name)} has been renamed to {repr(alias.__name__)}, " f"and the alias will be removed in Kedro-Datasets 2.0.0", - DeprecationWarning, + KedroDeprecationWarning, stacklevel=2, ) return alias diff --git a/kedro-datasets/kedro_datasets/biosequence/biosequence_dataset.py b/kedro-datasets/kedro_datasets/biosequence/biosequence_dataset.py index d24d38ba0..ebd0722f5 100644 --- a/kedro-datasets/kedro_datasets/biosequence/biosequence_dataset.py +++ b/kedro-datasets/kedro_datasets/biosequence/biosequence_dataset.py @@ -10,6 +10,7 @@ from Bio import SeqIO from kedro.io.core import get_filepath_str, get_protocol_and_path +from kedro_datasets import KedroDeprecationWarning from kedro_datasets._io import AbstractDataset @@ -150,7 +151,7 @@ def __getattr__(name): warnings.warn( f"{repr(name)} has been renamed to {repr(alias.__name__)}, " f"and the alias will be removed in Kedro-Datasets 2.0.0", - DeprecationWarning, + KedroDeprecationWarning, stacklevel=2, ) return alias diff --git a/kedro-datasets/kedro_datasets/dask/parquet_dataset.py b/kedro-datasets/kedro_datasets/dask/parquet_dataset.py index 9900e1a19..5ec39fed5 100644 --- a/kedro-datasets/kedro_datasets/dask/parquet_dataset.py +++ b/kedro-datasets/kedro_datasets/dask/parquet_dataset.py @@ -9,6 +9,7 @@ import triad from kedro.io.core import get_protocol_and_path +from kedro_datasets import KedroDeprecationWarning from kedro_datasets._io import AbstractDataset @@ -222,7 +223,7 @@ def __getattr__(name): warnings.warn( f"{repr(name)} has been renamed to {repr(alias.__name__)}, " f"and the alias will be removed in Kedro-Datasets 2.0.0", - DeprecationWarning, + KedroDeprecationWarning, stacklevel=2, ) return alias diff --git 
a/kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py b/kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py index e2e847484..33c7ef1d1 100644 --- a/kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py +++ b/kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py @@ -13,6 +13,7 @@ from pyspark.sql.types import StructType from pyspark.sql.utils import AnalysisException, ParseException +from kedro_datasets import KedroDeprecationWarning from kedro_datasets._io import AbstractVersionedDataset, DatasetError logger = logging.getLogger(__name__) @@ -450,7 +451,7 @@ def __getattr__(name): warnings.warn( f"{repr(name)} has been renamed to {repr(alias.__name__)}, " f"and the alias will be removed in Kedro-Datasets 2.0.0", - DeprecationWarning, + KedroDeprecationWarning, stacklevel=2, ) return alias diff --git a/kedro-datasets/kedro_datasets/email/message_dataset.py b/kedro-datasets/kedro_datasets/email/message_dataset.py index b81dc7804..1f45042fd 100644 --- a/kedro-datasets/kedro_datasets/email/message_dataset.py +++ b/kedro-datasets/kedro_datasets/email/message_dataset.py @@ -14,6 +14,7 @@ import fsspec from kedro.io.core import Version, get_filepath_str, get_protocol_and_path +from kedro_datasets import KedroDeprecationWarning from kedro_datasets._io import AbstractVersionedDataset, DatasetError @@ -193,7 +194,7 @@ def __getattr__(name): warnings.warn( f"{repr(name)} has been renamed to {repr(alias.__name__)}, " f"and the alias will be removed in Kedro-Datasets 2.0.0", - DeprecationWarning, + KedroDeprecationWarning, stacklevel=2, ) return alias diff --git a/kedro-datasets/kedro_datasets/geopandas/geojson_dataset.py b/kedro-datasets/kedro_datasets/geopandas/geojson_dataset.py index 3c5807b9a..56a8890a7 100644 --- a/kedro-datasets/kedro_datasets/geopandas/geojson_dataset.py +++ b/kedro-datasets/kedro_datasets/geopandas/geojson_dataset.py @@ -11,6 +11,7 @@ import geopandas as gpd from kedro.io.core import Version, get_filepath_str, get_protocol_and_path +from kedro_datasets import KedroDeprecationWarning from kedro_datasets._io import AbstractVersionedDataset, DatasetError @@ -164,7 +165,7 @@ def __getattr__(name): warnings.warn( f"{repr(name)} has been renamed to {repr(alias.__name__)}, " f"and the alias will be removed in Kedro-Datasets 2.0.0", - DeprecationWarning, + KedroDeprecationWarning, stacklevel=2, ) return alias diff --git a/kedro-datasets/kedro_datasets/json/json_dataset.py b/kedro-datasets/kedro_datasets/json/json_dataset.py index 6cae55cce..341e13933 100644 --- a/kedro-datasets/kedro_datasets/json/json_dataset.py +++ b/kedro-datasets/kedro_datasets/json/json_dataset.py @@ -10,6 +10,7 @@ import fsspec from kedro.io.core import Version, get_filepath_str, get_protocol_and_path +from kedro_datasets import KedroDeprecationWarning from kedro_datasets._io import AbstractVersionedDataset, DatasetError @@ -167,7 +168,7 @@ def __getattr__(name): warnings.warn( f"{repr(name)} has been renamed to {repr(alias.__name__)}, " f"and the alias will be removed in Kedro-Datasets 2.0.0", - DeprecationWarning, + KedroDeprecationWarning, stacklevel=2, ) return alias diff --git a/kedro-datasets/kedro_datasets/networkx/gml_dataset.py b/kedro-datasets/kedro_datasets/networkx/gml_dataset.py index cc7d21bf0..f4d63e87e 100644 --- a/kedro-datasets/kedro_datasets/networkx/gml_dataset.py +++ b/kedro-datasets/kedro_datasets/networkx/gml_dataset.py @@ -11,6 +11,7 @@ import networkx from kedro.io.core import Version, get_filepath_str, get_protocol_and_path +from 
kedro_datasets import KedroDeprecationWarning from kedro_datasets._io import AbstractVersionedDataset @@ -152,7 +153,7 @@ def __getattr__(name): warnings.warn( f"{repr(name)} has been renamed to {repr(alias.__name__)}, " f"and the alias will be removed in Kedro-Datasets 2.0.0", - DeprecationWarning, + KedroDeprecationWarning, stacklevel=2, ) return alias diff --git a/kedro-datasets/kedro_datasets/networkx/graphml_dataset.py b/kedro-datasets/kedro_datasets/networkx/graphml_dataset.py index 902b29114..0a368f505 100644 --- a/kedro-datasets/kedro_datasets/networkx/graphml_dataset.py +++ b/kedro-datasets/kedro_datasets/networkx/graphml_dataset.py @@ -10,6 +10,7 @@ import networkx from kedro.io.core import Version, get_filepath_str, get_protocol_and_path +from kedro_datasets import KedroDeprecationWarning from kedro_datasets._io import AbstractVersionedDataset @@ -150,7 +151,7 @@ def __getattr__(name): warnings.warn( f"{repr(name)} has been renamed to {repr(alias.__name__)}, " f"and the alias will be removed in Kedro-Datasets 2.0.0", - DeprecationWarning, + KedroDeprecationWarning, stacklevel=2, ) return alias diff --git a/kedro-datasets/kedro_datasets/networkx/json_dataset.py b/kedro-datasets/kedro_datasets/networkx/json_dataset.py index 3d565003d..4a41f9a67 100644 --- a/kedro-datasets/kedro_datasets/networkx/json_dataset.py +++ b/kedro-datasets/kedro_datasets/networkx/json_dataset.py @@ -11,6 +11,7 @@ import networkx from kedro.io.core import Version, get_filepath_str, get_protocol_and_path +from kedro_datasets import KedroDeprecationWarning from kedro_datasets._io import AbstractVersionedDataset @@ -157,7 +158,7 @@ def __getattr__(name): warnings.warn( f"{repr(name)} has been renamed to {repr(alias.__name__)}, " f"and the alias will be removed in Kedro-Datasets 2.0.0", - DeprecationWarning, + KedroDeprecationWarning, stacklevel=2, ) return alias diff --git a/kedro-datasets/kedro_datasets/pandas/csv_dataset.py b/kedro-datasets/kedro_datasets/pandas/csv_dataset.py index 4887968cd..543035238 100644 --- a/kedro-datasets/kedro_datasets/pandas/csv_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/csv_dataset.py @@ -17,6 +17,7 @@ get_protocol_and_path, ) +from kedro_datasets import KedroDeprecationWarning from kedro_datasets._io import AbstractVersionedDataset, DatasetError logger = logging.getLogger(__name__) @@ -214,7 +215,7 @@ def __getattr__(name): warnings.warn( f"{repr(name)} has been renamed to {repr(alias.__name__)}, " f"and the alias will be removed in Kedro-Datasets 2.0.0", - DeprecationWarning, + KedroDeprecationWarning, stacklevel=2, ) return alias diff --git a/kedro-datasets/kedro_datasets/pandas/deltatable_dataset.py b/kedro-datasets/kedro_datasets/pandas/deltatable_dataset.py index 9df340c6d..4581312c5 100644 --- a/kedro-datasets/kedro_datasets/pandas/deltatable_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/deltatable_dataset.py @@ -11,6 +11,7 @@ from deltalake.exceptions import TableNotFoundError from deltalake.writer import write_deltalake +from kedro_datasets import KedroDeprecationWarning from kedro_datasets._io import AbstractDataset, DatasetError @@ -271,7 +272,7 @@ def __getattr__(name): warnings.warn( f"{repr(name)} has been renamed to {repr(alias.__name__)}, " f"and the alias will be removed in Kedro-Datasets 2.0.0", - DeprecationWarning, + KedroDeprecationWarning, stacklevel=2, ) return alias diff --git a/kedro-datasets/kedro_datasets/pandas/excel_dataset.py b/kedro-datasets/kedro_datasets/pandas/excel_dataset.py index 181e6cd71..6f4b0ff27 100644 --- 
a/kedro-datasets/kedro_datasets/pandas/excel_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/excel_dataset.py @@ -17,6 +17,7 @@ get_protocol_and_path, ) +from kedro_datasets import KedroDeprecationWarning from kedro_datasets._io import AbstractVersionedDataset, DatasetError logger = logging.getLogger(__name__) @@ -282,7 +283,7 @@ def __getattr__(name): warnings.warn( f"{repr(name)} has been renamed to {repr(alias.__name__)}, " f"and the alias will be removed in Kedro-Datasets 2.0.0", - DeprecationWarning, + KedroDeprecationWarning, stacklevel=2, ) return alias diff --git a/kedro-datasets/kedro_datasets/pandas/feather_dataset.py b/kedro-datasets/kedro_datasets/pandas/feather_dataset.py index 45a454dcf..41995dda4 100644 --- a/kedro-datasets/kedro_datasets/pandas/feather_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/feather_dataset.py @@ -18,6 +18,7 @@ get_protocol_and_path, ) +from kedro_datasets import KedroDeprecationWarning from kedro_datasets._io import AbstractVersionedDataset logger = logging.getLogger(__name__) @@ -202,7 +203,7 @@ def __getattr__(name): warnings.warn( f"{repr(name)} has been renamed to {repr(alias.__name__)}, " f"and the alias will be removed in Kedro-Datasets 2.0.0", - DeprecationWarning, + KedroDeprecationWarning, stacklevel=2, ) return alias diff --git a/kedro-datasets/kedro_datasets/pandas/gbq_dataset.py b/kedro-datasets/kedro_datasets/pandas/gbq_dataset.py index 8dba87dd8..d672cae0c 100644 --- a/kedro-datasets/kedro_datasets/pandas/gbq_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/gbq_dataset.py @@ -17,6 +17,7 @@ validate_on_forbidden_chars, ) +from kedro_datasets import KedroDeprecationWarning from kedro_datasets._io import AbstractDataset, DatasetError @@ -330,7 +331,7 @@ def __getattr__(name): warnings.warn( f"{repr(name)} has been renamed to {repr(alias.__name__)}, " f"and the alias will be removed in Kedro-Datasets 2.0.0", - DeprecationWarning, + KedroDeprecationWarning, stacklevel=2, ) return alias diff --git a/kedro-datasets/kedro_datasets/pandas/generic_dataset.py b/kedro-datasets/kedro_datasets/pandas/generic_dataset.py index d9395b8e8..987d79be7 100644 --- a/kedro-datasets/kedro_datasets/pandas/generic_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/generic_dataset.py @@ -11,6 +11,7 @@ import pandas as pd from kedro.io.core import Version, get_filepath_str, get_protocol_and_path +from kedro_datasets import KedroDeprecationWarning from kedro_datasets._io import AbstractVersionedDataset, DatasetError NON_FILE_SYSTEM_TARGETS = [ @@ -252,7 +253,7 @@ def __getattr__(name): warnings.warn( f"{repr(name)} has been renamed to {repr(alias.__name__)}, " f"and the alias will be removed in Kedro-Datasets 2.0.0", - DeprecationWarning, + KedroDeprecationWarning, stacklevel=2, ) return alias diff --git a/kedro-datasets/kedro_datasets/pandas/hdf_dataset.py b/kedro-datasets/kedro_datasets/pandas/hdf_dataset.py index 50d33e460..73870e56a 100644 --- a/kedro-datasets/kedro_datasets/pandas/hdf_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/hdf_dataset.py @@ -11,6 +11,7 @@ import pandas as pd from kedro.io.core import Version, get_filepath_str, get_protocol_and_path +from kedro_datasets import KedroDeprecationWarning from kedro_datasets._io import AbstractVersionedDataset, DatasetError HDFSTORE_DRIVER = "H5FD_CORE" @@ -212,7 +213,7 @@ def __getattr__(name): warnings.warn( f"{repr(name)} has been renamed to {repr(alias.__name__)}, " f"and the alias will be removed in Kedro-Datasets 2.0.0", - DeprecationWarning, + KedroDeprecationWarning, 
stacklevel=2, ) return alias diff --git a/kedro-datasets/kedro_datasets/pandas/json_dataset.py b/kedro-datasets/kedro_datasets/pandas/json_dataset.py index 91dd2930d..f480f0754 100644 --- a/kedro-datasets/kedro_datasets/pandas/json_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/json_dataset.py @@ -17,6 +17,7 @@ get_protocol_and_path, ) +from kedro_datasets import KedroDeprecationWarning from kedro_datasets._io import AbstractVersionedDataset, DatasetError logger = logging.getLogger(__name__) @@ -200,7 +201,7 @@ def __getattr__(name): warnings.warn( f"{repr(name)} has been renamed to {repr(alias.__name__)}, " f"and the alias will be removed in Kedro-Datasets 2.0.0", - DeprecationWarning, + KedroDeprecationWarning, stacklevel=2, ) return alias diff --git a/kedro-datasets/kedro_datasets/pandas/parquet_dataset.py b/kedro-datasets/kedro_datasets/pandas/parquet_dataset.py index dc4c05618..b132d69b3 100644 --- a/kedro-datasets/kedro_datasets/pandas/parquet_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/parquet_dataset.py @@ -17,6 +17,7 @@ get_protocol_and_path, ) +from kedro_datasets import KedroDeprecationWarning from kedro_datasets._io import AbstractVersionedDataset, DatasetError logger = logging.getLogger(__name__) @@ -226,7 +227,7 @@ def __getattr__(name): warnings.warn( f"{repr(name)} has been renamed to {repr(alias.__name__)}, " f"and the alias will be removed in Kedro-Datasets 2.0.0", - DeprecationWarning, + KedroDeprecationWarning, stacklevel=2, ) return alias diff --git a/kedro-datasets/kedro_datasets/pandas/sql_dataset.py b/kedro-datasets/kedro_datasets/pandas/sql_dataset.py index 59feb51b4..beb25fb3f 100644 --- a/kedro-datasets/kedro_datasets/pandas/sql_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/sql_dataset.py @@ -12,6 +12,7 @@ from sqlalchemy import create_engine, inspect from sqlalchemy.exc import NoSuchModuleError +from kedro_datasets import KedroDeprecationWarning from kedro_datasets._io import AbstractDataset, DatasetError __all__ = ["SQLTableDataset", "SQLQueryDataset"] @@ -547,7 +548,7 @@ def __getattr__(name): warnings.warn( f"{repr(name)} has been renamed to {repr(alias.__name__)}, " f"and the alias will be removed in Kedro-Datasets 2.0.0", - DeprecationWarning, + KedroDeprecationWarning, stacklevel=2, ) return alias diff --git a/kedro-datasets/kedro_datasets/pandas/xml_dataset.py b/kedro-datasets/kedro_datasets/pandas/xml_dataset.py index 129d5e3fb..fa3fe1de4 100644 --- a/kedro-datasets/kedro_datasets/pandas/xml_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/xml_dataset.py @@ -17,6 +17,7 @@ get_protocol_and_path, ) +from kedro_datasets import KedroDeprecationWarning from kedro_datasets._io import AbstractVersionedDataset, DatasetError logger = logging.getLogger(__name__) @@ -184,7 +185,7 @@ def __getattr__(name): warnings.warn( f"{repr(name)} has been renamed to {repr(alias.__name__)}, " f"and the alias will be removed in Kedro-Datasets 2.0.0", - DeprecationWarning, + KedroDeprecationWarning, stacklevel=2, ) return alias diff --git a/kedro-datasets/kedro_datasets/pickle/pickle_dataset.py b/kedro-datasets/kedro_datasets/pickle/pickle_dataset.py index 19f8072a0..b28103e7e 100644 --- a/kedro-datasets/kedro_datasets/pickle/pickle_dataset.py +++ b/kedro-datasets/kedro_datasets/pickle/pickle_dataset.py @@ -12,6 +12,7 @@ import fsspec from kedro.io.core import Version, get_filepath_str, get_protocol_and_path +from kedro_datasets import KedroDeprecationWarning from kedro_datasets._io import AbstractVersionedDataset, DatasetError @@ -252,7 +253,7 @@ 
def __getattr__(name): warnings.warn( f"{repr(name)} has been renamed to {repr(alias.__name__)}, " f"and the alias will be removed in Kedro-Datasets 2.0.0", - DeprecationWarning, + KedroDeprecationWarning, stacklevel=2, ) return alias diff --git a/kedro-datasets/kedro_datasets/pillow/image_dataset.py b/kedro-datasets/kedro_datasets/pillow/image_dataset.py index 91bae8842..161ff9dc5 100644 --- a/kedro-datasets/kedro_datasets/pillow/image_dataset.py +++ b/kedro-datasets/kedro_datasets/pillow/image_dataset.py @@ -10,6 +10,7 @@ from kedro.io.core import Version, get_filepath_str, get_protocol_and_path from PIL import Image +from kedro_datasets import KedroDeprecationWarning from kedro_datasets._io import AbstractVersionedDataset, DatasetError @@ -160,7 +161,7 @@ def __getattr__(name): warnings.warn( f"{repr(name)} has been renamed to {repr(alias.__name__)}, " f"and the alias will be removed in Kedro-Datasets 2.0.0", - DeprecationWarning, + KedroDeprecationWarning, stacklevel=2, ) return alias diff --git a/kedro-datasets/kedro_datasets/plotly/json_dataset.py b/kedro-datasets/kedro_datasets/plotly/json_dataset.py index b21f4f9bc..68e5ad9a5 100644 --- a/kedro-datasets/kedro_datasets/plotly/json_dataset.py +++ b/kedro-datasets/kedro_datasets/plotly/json_dataset.py @@ -11,6 +11,7 @@ from kedro.io.core import Version, get_filepath_str, get_protocol_and_path from plotly import graph_objects as go +from kedro_datasets import KedroDeprecationWarning from kedro_datasets._io import AbstractVersionedDataset @@ -176,7 +177,7 @@ def __getattr__(name): warnings.warn( f"{repr(name)} has been renamed to {repr(alias.__name__)}, " f"and the alias will be removed in Kedro-Datasets 2.0.0", - DeprecationWarning, + KedroDeprecationWarning, stacklevel=2, ) return alias diff --git a/kedro-datasets/kedro_datasets/plotly/plotly_dataset.py b/kedro-datasets/kedro_datasets/plotly/plotly_dataset.py index 985588e0a..a30e62f0d 100644 --- a/kedro-datasets/kedro_datasets/plotly/plotly_dataset.py +++ b/kedro-datasets/kedro_datasets/plotly/plotly_dataset.py @@ -11,7 +11,8 @@ from kedro.io.core import Version from plotly import graph_objects as go -from .json_dataset import JSONDataset +from kedro_datasets import KedroDeprecationWarning +from kedro_datasets.plotly.json_dataset import JSONDataset class PlotlyDataset(JSONDataset): @@ -153,7 +154,7 @@ def __getattr__(name): warnings.warn( f"{repr(name)} has been renamed to {repr(alias.__name__)}, " f"and the alias will be removed in Kedro-Datasets 2.0.0", - DeprecationWarning, + KedroDeprecationWarning, stacklevel=2, ) return alias diff --git a/kedro-datasets/kedro_datasets/polars/csv_dataset.py b/kedro-datasets/kedro_datasets/polars/csv_dataset.py index 0e87c2bb2..e2638107f 100644 --- a/kedro-datasets/kedro_datasets/polars/csv_dataset.py +++ b/kedro-datasets/kedro_datasets/polars/csv_dataset.py @@ -17,6 +17,7 @@ get_protocol_and_path, ) +from kedro_datasets import KedroDeprecationWarning from kedro_datasets._io import AbstractVersionedDataset, DatasetError logger = logging.getLogger(__name__) @@ -207,7 +208,7 @@ def __getattr__(name): warnings.warn( f"{repr(name)} has been renamed to {repr(alias.__name__)}, " f"and the alias will be removed in Kedro-Datasets 2.0.0", - DeprecationWarning, + KedroDeprecationWarning, stacklevel=2, ) return alias diff --git a/kedro-datasets/kedro_datasets/polars/generic_dataset.py b/kedro-datasets/kedro_datasets/polars/generic_dataset.py index 8b790e456..5deceff44 100644 --- a/kedro-datasets/kedro_datasets/polars/generic_dataset.py +++ 
b/kedro-datasets/kedro_datasets/polars/generic_dataset.py @@ -12,6 +12,7 @@ import polars as pl from kedro.io.core import Version, get_filepath_str, get_protocol_and_path +from kedro_datasets import KedroDeprecationWarning from kedro_datasets._io import AbstractVersionedDataset, DatasetError @@ -209,7 +210,7 @@ def __getattr__(name): warnings.warn( f"{repr(name)} has been renamed to {repr(alias.__name__)}, " f"and the alias will be removed in Kedro-Datasets 2.0.0", - DeprecationWarning, + KedroDeprecationWarning, stacklevel=2, ) return alias diff --git a/kedro-datasets/kedro_datasets/redis/redis_dataset.py b/kedro-datasets/kedro_datasets/redis/redis_dataset.py index dc04de00e..770ee98af 100644 --- a/kedro-datasets/kedro_datasets/redis/redis_dataset.py +++ b/kedro-datasets/kedro_datasets/redis/redis_dataset.py @@ -9,6 +9,7 @@ import redis +from kedro_datasets import KedroDeprecationWarning from kedro_datasets._io import AbstractDataset, DatasetError @@ -202,7 +203,7 @@ def __getattr__(name): warnings.warn( f"{repr(name)} has been renamed to {repr(alias.__name__)}, " f"and the alias will be removed in Kedro-Datasets 2.0.0", - DeprecationWarning, + KedroDeprecationWarning, stacklevel=2, ) return alias diff --git a/kedro-datasets/kedro_datasets/snowflake/snowpark_dataset.py b/kedro-datasets/kedro_datasets/snowflake/snowpark_dataset.py index d98ef2dd6..6fbfa60a0 100644 --- a/kedro-datasets/kedro_datasets/snowflake/snowpark_dataset.py +++ b/kedro-datasets/kedro_datasets/snowflake/snowpark_dataset.py @@ -7,6 +7,7 @@ import snowflake.snowpark as sp +from kedro_datasets import KedroDeprecationWarning from kedro_datasets._io import AbstractDataset, DatasetError logger = logging.getLogger(__name__) @@ -255,7 +256,7 @@ def __getattr__(name): warnings.warn( f"{repr(name)} has been renamed to {repr(alias.__name__)}, " f"and the alias will be removed in Kedro-Datasets 2.0.0", - DeprecationWarning, + KedroDeprecationWarning, stacklevel=2, ) return alias diff --git a/kedro-datasets/kedro_datasets/spark/deltatable_dataset.py b/kedro-datasets/kedro_datasets/spark/deltatable_dataset.py index 7df0c411a..f1b6a74b5 100644 --- a/kedro-datasets/kedro_datasets/spark/deltatable_dataset.py +++ b/kedro-datasets/kedro_datasets/spark/deltatable_dataset.py @@ -9,6 +9,7 @@ from pyspark.sql import SparkSession from pyspark.sql.utils import AnalysisException +from kedro_datasets import KedroDeprecationWarning from kedro_datasets._io import AbstractDataset, DatasetError from kedro_datasets.spark.spark_dataset import _split_filepath, _strip_dbfs_prefix @@ -120,7 +121,7 @@ def __getattr__(name): warnings.warn( f"{repr(name)} has been renamed to {repr(alias.__name__)}, " f"and the alias will be removed in Kedro-Datasets 2.0.0", - DeprecationWarning, + KedroDeprecationWarning, stacklevel=2, ) return alias diff --git a/kedro-datasets/kedro_datasets/spark/spark_dataset.py b/kedro-datasets/kedro_datasets/spark/spark_dataset.py index 5971ba495..221e4e562 100644 --- a/kedro-datasets/kedro_datasets/spark/spark_dataset.py +++ b/kedro-datasets/kedro_datasets/spark/spark_dataset.py @@ -20,6 +20,7 @@ from pyspark.sql.utils import AnalysisException from s3fs import S3FileSystem +from kedro_datasets import KedroDeprecationWarning from kedro_datasets._io import AbstractVersionedDataset, DatasetError logger = logging.getLogger(__name__) @@ -439,7 +440,7 @@ def __getattr__(name): warnings.warn( f"{repr(name)} has been renamed to {repr(alias.__name__)}, " f"and the alias will be removed in Kedro-Datasets 2.0.0", - DeprecationWarning, + 
KedroDeprecationWarning, stacklevel=2, ) return alias diff --git a/kedro-datasets/kedro_datasets/spark/spark_hive_dataset.py b/kedro-datasets/kedro_datasets/spark/spark_hive_dataset.py index 860855719..33cc31f02 100644 --- a/kedro-datasets/kedro_datasets/spark/spark_hive_dataset.py +++ b/kedro-datasets/kedro_datasets/spark/spark_hive_dataset.py @@ -9,6 +9,7 @@ from pyspark.sql import DataFrame, SparkSession, Window from pyspark.sql.functions import col, lit, row_number +from kedro_datasets import KedroDeprecationWarning from kedro_datasets._io import AbstractDataset, DatasetError @@ -233,7 +234,7 @@ def __getattr__(name): warnings.warn( f"{repr(name)} has been renamed to {repr(alias.__name__)}, " f"and the alias will be removed in Kedro-Datasets 2.0.0", - DeprecationWarning, + KedroDeprecationWarning, stacklevel=2, ) return alias diff --git a/kedro-datasets/kedro_datasets/spark/spark_jdbc_dataset.py b/kedro-datasets/kedro_datasets/spark/spark_jdbc_dataset.py index c062a6a70..18af44546 100644 --- a/kedro-datasets/kedro_datasets/spark/spark_jdbc_dataset.py +++ b/kedro-datasets/kedro_datasets/spark/spark_jdbc_dataset.py @@ -5,6 +5,7 @@ from pyspark.sql import DataFrame, SparkSession +from kedro_datasets import KedroDeprecationWarning from kedro_datasets._io import AbstractDataset, DatasetError @@ -187,7 +188,7 @@ def __getattr__(name): warnings.warn( f"{repr(name)} has been renamed to {repr(alias.__name__)}, " f"and the alias will be removed in Kedro-Datasets 2.0.0", - DeprecationWarning, + KedroDeprecationWarning, stacklevel=2, ) return alias diff --git a/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py b/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py index 4e02a4c13..7ebe84ae4 100644 --- a/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py +++ b/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py @@ -7,6 +7,7 @@ from pyspark.sql import DataFrame, SparkSession from pyspark.sql.utils import AnalysisException +from kedro_datasets import KedroDeprecationWarning from kedro_datasets._io import AbstractDataset from kedro_datasets.spark.spark_dataset import ( SparkDataset, @@ -171,7 +172,7 @@ def __getattr__(name): warnings.warn( f"{repr(name)} has been renamed to {repr(alias.__name__)}, " f"and the alias will be removed in Kedro-Datasets 2.0.0", - DeprecationWarning, + KedroDeprecationWarning, stacklevel=2, ) return alias diff --git a/kedro-datasets/kedro_datasets/svmlight/svmlight_dataset.py b/kedro-datasets/kedro_datasets/svmlight/svmlight_dataset.py index 2ea1b3be7..05edae8a6 100644 --- a/kedro-datasets/kedro_datasets/svmlight/svmlight_dataset.py +++ b/kedro-datasets/kedro_datasets/svmlight/svmlight_dataset.py @@ -13,6 +13,7 @@ from scipy.sparse.csr import csr_matrix from sklearn.datasets import dump_svmlight_file, load_svmlight_file +from kedro_datasets import KedroDeprecationWarning from kedro_datasets._io import AbstractVersionedDataset, DatasetError # NOTE: kedro.extras.datasets will be removed in Kedro 0.19.0. 
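The change repeated across every dataset module above is the same one: the module-level `__getattr__` shim that resolves old class names now emits `KedroDeprecationWarning` (defined or re-exported in `kedro_datasets/__init__.py`) instead of the built-in `DeprecationWarning`. A minimal, self-contained sketch of that shim pattern follows; the module layout and the `OldDataset`/`NewDataset` names are hypothetical stand-ins for illustration, not part of this patch.

```python
# Sketch of the rename-shim pattern this patch touches (hypothetical names).
import warnings


class KedroDeprecationWarning(DeprecationWarning):
    """Stand-in for the class added to kedro_datasets/__init__.py."""


class NewDataset:
    """The renamed class that callers should migrate to."""


_DEPRECATED_CLASSES = {"OldDataset": NewDataset}  # old name -> replacement


def __getattr__(name):  # PEP 562: called for names not found in the module
    if name in _DEPRECATED_CLASSES:
        alias = _DEPRECATED_CLASSES[name]
        warnings.warn(
            f"{name!r} has been renamed to {alias.__name__!r}, "
            f"and the alias will be removed in Kedro-Datasets 2.0.0",
            KedroDeprecationWarning,
            stacklevel=2,
        )
        return alias
    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
```

Because `kedro_datasets/__init__.py` also calls `warnings.simplefilter("default", KedroDeprecationWarning)` when no `-W` options are set, accessing a deprecated alias surfaces the warning once per call site instead of being hidden by Python's default `DeprecationWarning` filters.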
@@ -202,7 +203,7 @@ def __getattr__(name): warnings.warn( f"{repr(name)} has been renamed to {repr(alias.__name__)}, " f"and the alias will be removed in Kedro-Datasets 2.0.0", - DeprecationWarning, + KedroDeprecationWarning, stacklevel=2, ) return alias diff --git a/kedro-datasets/kedro_datasets/tensorflow/tensorflow_model_dataset.py b/kedro-datasets/kedro_datasets/tensorflow/tensorflow_model_dataset.py index 18b4274c7..a95b1bfa2 100644 --- a/kedro-datasets/kedro_datasets/tensorflow/tensorflow_model_dataset.py +++ b/kedro-datasets/kedro_datasets/tensorflow/tensorflow_model_dataset.py @@ -11,6 +11,7 @@ import tensorflow as tf from kedro.io.core import Version, get_filepath_str, get_protocol_and_path +from kedro_datasets import KedroDeprecationWarning from kedro_datasets._io import AbstractVersionedDataset, DatasetError TEMPORARY_H5_FILE = "tmp_tensorflow_model.h5" @@ -200,7 +201,7 @@ def __getattr__(name): warnings.warn( f"{repr(name)} has been renamed to {repr(alias.__name__)}, " f"and the alias will be removed in Kedro-Datasets 2.0.0", - DeprecationWarning, + KedroDeprecationWarning, stacklevel=2, ) return alias diff --git a/kedro-datasets/kedro_datasets/text/text_dataset.py b/kedro-datasets/kedro_datasets/text/text_dataset.py index 2c1ecff6f..a6d9be17e 100644 --- a/kedro-datasets/kedro_datasets/text/text_dataset.py +++ b/kedro-datasets/kedro_datasets/text/text_dataset.py @@ -9,6 +9,7 @@ import fsspec from kedro.io.core import Version, get_filepath_str, get_protocol_and_path +from kedro_datasets import KedroDeprecationWarning from kedro_datasets._io import AbstractVersionedDataset, DatasetError @@ -151,7 +152,7 @@ def __getattr__(name): warnings.warn( f"{repr(name)} has been renamed to {repr(alias.__name__)}, " f"and the alias will be removed in Kedro-Datasets 2.0.0", - DeprecationWarning, + KedroDeprecationWarning, stacklevel=2, ) return alias diff --git a/kedro-datasets/kedro_datasets/tracking/json_dataset.py b/kedro-datasets/kedro_datasets/tracking/json_dataset.py index 8dac0fc4d..943e686fd 100644 --- a/kedro-datasets/kedro_datasets/tracking/json_dataset.py +++ b/kedro-datasets/kedro_datasets/tracking/json_dataset.py @@ -7,6 +7,7 @@ from kedro.io.core import DatasetError +from kedro_datasets import KedroDeprecationWarning from kedro_datasets.json import json_dataset @@ -57,7 +58,7 @@ def __getattr__(name): warnings.warn( f"{repr(name)} has been renamed to {repr(alias.__name__)}, " f"and the alias will be removed in Kedro-Datasets 2.0.0", - DeprecationWarning, + KedroDeprecationWarning, stacklevel=2, ) return alias diff --git a/kedro-datasets/kedro_datasets/tracking/metrics_dataset.py b/kedro-datasets/kedro_datasets/tracking/metrics_dataset.py index 9e05855fa..cfd30d1a4 100644 --- a/kedro-datasets/kedro_datasets/tracking/metrics_dataset.py +++ b/kedro-datasets/kedro_datasets/tracking/metrics_dataset.py @@ -9,6 +9,7 @@ from kedro.io.core import DatasetError, get_filepath_str +from kedro_datasets import KedroDeprecationWarning from kedro_datasets.json import json_dataset @@ -78,7 +79,7 @@ def __getattr__(name): warnings.warn( f"{repr(name)} has been renamed to {repr(alias.__name__)}, " f"and the alias will be removed in Kedro-Datasets 2.0.0", - DeprecationWarning, + KedroDeprecationWarning, stacklevel=2, ) return alias diff --git a/kedro-datasets/kedro_datasets/video/video_dataset.py b/kedro-datasets/kedro_datasets/video/video_dataset.py index b85fc1231..de97d7b8e 100644 --- a/kedro-datasets/kedro_datasets/video/video_dataset.py +++ b/kedro-datasets/kedro_datasets/video/video_dataset.py 
@@ -16,6 +16,7 @@ import PIL.Image from kedro.io.core import get_protocol_and_path +from kedro_datasets import KedroDeprecationWarning from kedro_datasets._io import AbstractDataset @@ -376,7 +377,7 @@ def __getattr__(name): warnings.warn( f"{repr(name)} has been renamed to {repr(alias.__name__)}, " f"and the alias will be removed in Kedro-Datasets 2.0.0", - DeprecationWarning, + KedroDeprecationWarning, stacklevel=2, ) return alias diff --git a/kedro-datasets/kedro_datasets/yaml/yaml_dataset.py b/kedro-datasets/kedro_datasets/yaml/yaml_dataset.py index 45350b338..d9aa536fb 100644 --- a/kedro-datasets/kedro_datasets/yaml/yaml_dataset.py +++ b/kedro-datasets/kedro_datasets/yaml/yaml_dataset.py @@ -10,6 +10,7 @@ import yaml from kedro.io.core import Version, get_filepath_str, get_protocol_and_path +from kedro_datasets import KedroDeprecationWarning from kedro_datasets._io import AbstractVersionedDataset, DatasetError @@ -163,7 +164,7 @@ def __getattr__(name): warnings.warn( f"{repr(name)} has been renamed to {repr(alias.__name__)}, " f"and the alias will be removed in Kedro-Datasets 2.0.0", - DeprecationWarning, + KedroDeprecationWarning, stacklevel=2, ) return alias diff --git a/kedro-datasets/tests/api/test_api_dataset.py b/kedro-datasets/tests/api/test_api_dataset.py index 10a0baf6d..e0ac4af93 100644 --- a/kedro-datasets/tests/api/test_api_dataset.py +++ b/kedro-datasets/tests/api/test_api_dataset.py @@ -8,6 +8,7 @@ import requests from requests.auth import HTTPBasicAuth +from kedro_datasets import KedroDeprecationWarning from kedro_datasets._io import DatasetError from kedro_datasets.api import APIDataset from kedro_datasets.api.api_dataset import _DEPRECATED_CLASSES @@ -33,7 +34,9 @@ ) @pytest.mark.parametrize("class_name", _DEPRECATED_CLASSES) def test_deprecation(module_name, class_name): - with pytest.warns(DeprecationWarning, match=f"{repr(class_name)} has been renamed"): + with pytest.warns( + KedroDeprecationWarning, match=f"{repr(class_name)} has been renamed" + ): getattr(importlib.import_module(module_name), class_name) diff --git a/kedro-datasets/tests/biosequence/test_biosequence_dataset.py b/kedro-datasets/tests/biosequence/test_biosequence_dataset.py index d429dd420..3ee151f7d 100644 --- a/kedro-datasets/tests/biosequence/test_biosequence_dataset.py +++ b/kedro-datasets/tests/biosequence/test_biosequence_dataset.py @@ -10,6 +10,7 @@ from kedro.io.core import PROTOCOL_DELIMITER from s3fs.core import S3FileSystem +from kedro_datasets import KedroDeprecationWarning from kedro_datasets._io import DatasetError from kedro_datasets.biosequence import BioSequenceDataset from kedro_datasets.biosequence.biosequence_dataset import _DEPRECATED_CLASSES @@ -45,7 +46,9 @@ def dummy_data(): ) @pytest.mark.parametrize("class_name", _DEPRECATED_CLASSES) def test_deprecation(module_name, class_name): - with pytest.warns(DeprecationWarning, match=f"{repr(class_name)} has been renamed"): + with pytest.warns( + KedroDeprecationWarning, match=f"{repr(class_name)} has been renamed" + ): getattr(importlib.import_module(module_name), class_name) diff --git a/kedro-datasets/tests/dask/test_parquet_dataset.py b/kedro-datasets/tests/dask/test_parquet_dataset.py index 08c753f59..255c2717a 100644 --- a/kedro-datasets/tests/dask/test_parquet_dataset.py +++ b/kedro-datasets/tests/dask/test_parquet_dataset.py @@ -10,6 +10,7 @@ from pandas.testing import assert_frame_equal from s3fs import S3FileSystem +from kedro_datasets import KedroDeprecationWarning from kedro_datasets._io import DatasetError from 
kedro_datasets.dask import ParquetDataset from kedro_datasets.dask.parquet_dataset import _DEPRECATED_CLASSES @@ -79,7 +80,9 @@ def s3fs_cleanup(): ) @pytest.mark.parametrize("class_name", _DEPRECATED_CLASSES) def test_deprecation(module_name, class_name): - with pytest.warns(DeprecationWarning, match=f"{repr(class_name)} has been renamed"): + with pytest.warns( + KedroDeprecationWarning, match=f"{repr(class_name)} has been renamed" + ): getattr(importlib.import_module(module_name), class_name) diff --git a/kedro-datasets/tests/databricks/test_managed_table_dataset.py b/kedro-datasets/tests/databricks/test_managed_table_dataset.py index dc2595740..929021a9e 100644 --- a/kedro-datasets/tests/databricks/test_managed_table_dataset.py +++ b/kedro-datasets/tests/databricks/test_managed_table_dataset.py @@ -6,6 +6,7 @@ from pyspark.sql import DataFrame, SparkSession from pyspark.sql.types import IntegerType, StringType, StructField, StructType +from kedro_datasets import KedroDeprecationWarning from kedro_datasets._io import DatasetError from kedro_datasets.databricks import ManagedTableDataset from kedro_datasets.databricks.managed_table_dataset import _DEPRECATED_CLASSES @@ -179,7 +180,9 @@ def expected_upsert_multiple_primary_spark_df(spark_session: SparkSession): ) @pytest.mark.parametrize("class_name", _DEPRECATED_CLASSES) def test_deprecation(module_name, class_name): - with pytest.warns(DeprecationWarning, match=f"{repr(class_name)} has been renamed"): + with pytest.warns( + KedroDeprecationWarning, match=f"{repr(class_name)} has been renamed" + ): getattr(importlib.import_module(module_name), class_name) diff --git a/kedro-datasets/tests/email/test_message_dataset.py b/kedro-datasets/tests/email/test_message_dataset.py index bb65304df..423741c9c 100644 --- a/kedro-datasets/tests/email/test_message_dataset.py +++ b/kedro-datasets/tests/email/test_message_dataset.py @@ -10,6 +10,7 @@ from kedro.io.core import PROTOCOL_DELIMITER, Version from s3fs.core import S3FileSystem +from kedro_datasets import KedroDeprecationWarning from kedro_datasets._io import DatasetError from kedro_datasets.email import EmailMessageDataset from kedro_datasets.email.message_dataset import _DEPRECATED_CLASSES @@ -56,7 +57,9 @@ def dummy_msg(): ) @pytest.mark.parametrize("class_name", _DEPRECATED_CLASSES) def test_deprecation(module_name, class_name): - with pytest.warns(DeprecationWarning, match=f"{repr(class_name)} has been renamed"): + with pytest.warns( + KedroDeprecationWarning, match=f"{repr(class_name)} has been renamed" + ): getattr(importlib.import_module(module_name), class_name) diff --git a/kedro-datasets/tests/geopandas/test_geojson_dataset.py b/kedro-datasets/tests/geopandas/test_geojson_dataset.py index 42131f1f4..0bf32552a 100644 --- a/kedro-datasets/tests/geopandas/test_geojson_dataset.py +++ b/kedro-datasets/tests/geopandas/test_geojson_dataset.py @@ -11,6 +11,7 @@ from s3fs import S3FileSystem from shapely.geometry import Point +from kedro_datasets import KedroDeprecationWarning from kedro_datasets._io import DatasetError from kedro_datasets.geopandas import GeoJSONDataset from kedro_datasets.geopandas.geojson_dataset import _DEPRECATED_CLASSES @@ -69,7 +70,9 @@ def versioned_geojson_dataset(filepath, load_version, save_version): ) @pytest.mark.parametrize("class_name", _DEPRECATED_CLASSES) def test_deprecation(module_name, class_name): - with pytest.warns(DeprecationWarning, match=f"{repr(class_name)} has been renamed"): + with pytest.warns( + KedroDeprecationWarning, match=f"{repr(class_name)} 
has been renamed" + ): getattr(importlib.import_module(module_name), class_name) diff --git a/kedro-datasets/tests/json/test_json_dataset.py b/kedro-datasets/tests/json/test_json_dataset.py index 6fae0f9ef..e88ac689a 100644 --- a/kedro-datasets/tests/json/test_json_dataset.py +++ b/kedro-datasets/tests/json/test_json_dataset.py @@ -8,6 +8,7 @@ from kedro.io.core import PROTOCOL_DELIMITER, Version from s3fs.core import S3FileSystem +from kedro_datasets import KedroDeprecationWarning from kedro_datasets._io import DatasetError from kedro_datasets.json import JSONDataset from kedro_datasets.json.json_dataset import _DEPRECATED_CLASSES @@ -40,7 +41,9 @@ def dummy_data(): ) @pytest.mark.parametrize("class_name", _DEPRECATED_CLASSES) def test_deprecation(module_name, class_name): - with pytest.warns(DeprecationWarning, match=f"{repr(class_name)} has been renamed"): + with pytest.warns( + KedroDeprecationWarning, match=f"{repr(class_name)} has been renamed" + ): getattr(importlib.import_module(module_name), class_name) diff --git a/kedro-datasets/tests/networkx/test_gml_dataset.py b/kedro-datasets/tests/networkx/test_gml_dataset.py index 903e2019e..5fe193a57 100644 --- a/kedro-datasets/tests/networkx/test_gml_dataset.py +++ b/kedro-datasets/tests/networkx/test_gml_dataset.py @@ -10,6 +10,7 @@ from kedro.io.core import PROTOCOL_DELIMITER from s3fs.core import S3FileSystem +from kedro_datasets import KedroDeprecationWarning from kedro_datasets._io import DatasetError from kedro_datasets.networkx import GMLDataset from kedro_datasets.networkx.gml_dataset import _DEPRECATED_CLASSES @@ -57,7 +58,9 @@ def dummy_graph_data(): ) @pytest.mark.parametrize("class_name", _DEPRECATED_CLASSES) def test_deprecation(module_name, class_name): - with pytest.warns(DeprecationWarning, match=f"{repr(class_name)} has been renamed"): + with pytest.warns( + KedroDeprecationWarning, match=f"{repr(class_name)} has been renamed" + ): getattr(importlib.import_module(module_name), class_name) diff --git a/kedro-datasets/tests/networkx/test_graphml_dataset.py b/kedro-datasets/tests/networkx/test_graphml_dataset.py index 69e6269f5..5c60beee7 100644 --- a/kedro-datasets/tests/networkx/test_graphml_dataset.py +++ b/kedro-datasets/tests/networkx/test_graphml_dataset.py @@ -10,6 +10,7 @@ from kedro.io.core import PROTOCOL_DELIMITER from s3fs.core import S3FileSystem +from kedro_datasets import KedroDeprecationWarning from kedro_datasets._io import DatasetError from kedro_datasets.networkx import GraphMLDataset from kedro_datasets.networkx.graphml_dataset import _DEPRECATED_CLASSES @@ -58,7 +59,9 @@ def dummy_graph_data(): ) @pytest.mark.parametrize("class_name", _DEPRECATED_CLASSES) def test_deprecation(module_name, class_name): - with pytest.warns(DeprecationWarning, match=f"{repr(class_name)} has been renamed"): + with pytest.warns( + KedroDeprecationWarning, match=f"{repr(class_name)} has been renamed" + ): getattr(importlib.import_module(module_name), class_name) diff --git a/kedro-datasets/tests/networkx/test_json_dataset.py b/kedro-datasets/tests/networkx/test_json_dataset.py index 91b221e0a..e649bc1fb 100644 --- a/kedro-datasets/tests/networkx/test_json_dataset.py +++ b/kedro-datasets/tests/networkx/test_json_dataset.py @@ -10,6 +10,7 @@ from kedro.io.core import PROTOCOL_DELIMITER from s3fs.core import S3FileSystem +from kedro_datasets import KedroDeprecationWarning from kedro_datasets._io import DatasetError from kedro_datasets.networkx import JSONDataset from kedro_datasets.networkx.json_dataset import 
_DEPRECATED_CLASSES @@ -57,7 +58,9 @@ def dummy_graph_data(): ) @pytest.mark.parametrize("class_name", _DEPRECATED_CLASSES) def test_deprecation(module_name, class_name): - with pytest.warns(DeprecationWarning, match=f"{repr(class_name)} has been renamed"): + with pytest.warns( + KedroDeprecationWarning, match=f"{repr(class_name)} has been renamed" + ): getattr(importlib.import_module(module_name), class_name) diff --git a/kedro-datasets/tests/pandas/test_csv_dataset.py b/kedro-datasets/tests/pandas/test_csv_dataset.py index 623d1cf29..5364ff19c 100644 --- a/kedro-datasets/tests/pandas/test_csv_dataset.py +++ b/kedro-datasets/tests/pandas/test_csv_dataset.py @@ -16,6 +16,7 @@ from pandas.testing import assert_frame_equal from s3fs.core import S3FileSystem +from kedro_datasets import KedroDeprecationWarning from kedro_datasets._io import DatasetError from kedro_datasets.pandas import CSVDataset from kedro_datasets.pandas.csv_dataset import _DEPRECATED_CLASSES @@ -92,7 +93,9 @@ def mocked_csv_in_s3(mocked_s3_bucket, mocked_dataframe): ) @pytest.mark.parametrize("class_name", _DEPRECATED_CLASSES) def test_deprecation(module_name, class_name): - with pytest.warns(DeprecationWarning, match=f"{repr(class_name)} has been renamed"): + with pytest.warns( + KedroDeprecationWarning, match=f"{repr(class_name)} has been renamed" + ): getattr(importlib.import_module(module_name), class_name) diff --git a/kedro-datasets/tests/pandas/test_deltatable_dataset.py b/kedro-datasets/tests/pandas/test_deltatable_dataset.py index 9665f7e36..eaed4b4fe 100644 --- a/kedro-datasets/tests/pandas/test_deltatable_dataset.py +++ b/kedro-datasets/tests/pandas/test_deltatable_dataset.py @@ -5,6 +5,7 @@ from deltalake import DataCatalog, Metadata from pandas.testing import assert_frame_equal +from kedro_datasets import KedroDeprecationWarning from kedro_datasets._io import DatasetError from kedro_datasets.pandas import DeltaTableDataset from kedro_datasets.pandas.deltatable_dataset import _DEPRECATED_CLASSES @@ -35,7 +36,9 @@ def deltatable_dataset_from_path(filepath, load_args, save_args, fs_args): ) @pytest.mark.parametrize("class_name", _DEPRECATED_CLASSES) def test_deprecation(module_name, class_name): - with pytest.warns(DeprecationWarning, match=f"{repr(class_name)} has been renamed"): + with pytest.warns( + KedroDeprecationWarning, match=f"{repr(class_name)} has been renamed" + ): getattr(importlib.import_module(module_name), class_name) diff --git a/kedro-datasets/tests/pandas/test_excel_dataset.py b/kedro-datasets/tests/pandas/test_excel_dataset.py index 9a299028c..a80a299b0 100644 --- a/kedro-datasets/tests/pandas/test_excel_dataset.py +++ b/kedro-datasets/tests/pandas/test_excel_dataset.py @@ -10,6 +10,7 @@ from pandas.testing import assert_frame_equal from s3fs.core import S3FileSystem +from kedro_datasets import KedroDeprecationWarning from kedro_datasets._io import DatasetError from kedro_datasets.pandas import ExcelDataset from kedro_datasets.pandas.excel_dataset import _DEPRECATED_CLASSES @@ -63,7 +64,9 @@ def another_dummy_dataframe(): ) @pytest.mark.parametrize("class_name", _DEPRECATED_CLASSES) def test_deprecation(module_name, class_name): - with pytest.warns(DeprecationWarning, match=f"{repr(class_name)} has been renamed"): + with pytest.warns( + KedroDeprecationWarning, match=f"{repr(class_name)} has been renamed" + ): getattr(importlib.import_module(module_name), class_name) diff --git a/kedro-datasets/tests/pandas/test_feather_dataset.py b/kedro-datasets/tests/pandas/test_feather_dataset.py index 
e2903aefc..66eef5b88 100644 --- a/kedro-datasets/tests/pandas/test_feather_dataset.py +++ b/kedro-datasets/tests/pandas/test_feather_dataset.py @@ -10,6 +10,7 @@ from pandas.testing import assert_frame_equal from s3fs.core import S3FileSystem +from kedro_datasets import KedroDeprecationWarning from kedro_datasets._io import DatasetError from kedro_datasets.pandas import FeatherDataset from kedro_datasets.pandas.feather_dataset import _DEPRECATED_CLASSES @@ -44,7 +45,9 @@ def dummy_dataframe(): ) @pytest.mark.parametrize("class_name", _DEPRECATED_CLASSES) def test_deprecation(module_name, class_name): - with pytest.warns(DeprecationWarning, match=f"{repr(class_name)} has been renamed"): + with pytest.warns( + KedroDeprecationWarning, match=f"{repr(class_name)} has been renamed" + ): getattr(importlib.import_module(module_name), class_name) diff --git a/kedro-datasets/tests/pandas/test_gbq_dataset.py b/kedro-datasets/tests/pandas/test_gbq_dataset.py index be4d65942..d340c5a7d 100644 --- a/kedro-datasets/tests/pandas/test_gbq_dataset.py +++ b/kedro-datasets/tests/pandas/test_gbq_dataset.py @@ -6,6 +6,7 @@ from google.cloud.exceptions import NotFound from pandas.testing import assert_frame_equal +from kedro_datasets import KedroDeprecationWarning from kedro_datasets._io import DatasetError from kedro_datasets.pandas import GBQQueryDataset, GBQTableDataset from kedro_datasets.pandas.gbq_dataset import _DEPRECATED_CLASSES @@ -71,7 +72,9 @@ def gbq_sql_file_dataset(load_args, sql_file, mock_bigquery_client): ) @pytest.mark.parametrize("class_name", _DEPRECATED_CLASSES) def test_deprecation(module_name, class_name): - with pytest.warns(DeprecationWarning, match=f"{repr(class_name)} has been renamed"): + with pytest.warns( + KedroDeprecationWarning, match=f"{repr(class_name)} has been renamed" + ): getattr(importlib.import_module(module_name), class_name) diff --git a/kedro-datasets/tests/pandas/test_generic_dataset.py b/kedro-datasets/tests/pandas/test_generic_dataset.py index 8cacaa5bc..817d98720 100644 --- a/kedro-datasets/tests/pandas/test_generic_dataset.py +++ b/kedro-datasets/tests/pandas/test_generic_dataset.py @@ -13,6 +13,7 @@ from pandas._testing import assert_frame_equal from s3fs import S3FileSystem +from kedro_datasets import KedroDeprecationWarning from kedro_datasets._io import DatasetError from kedro_datasets.pandas import GenericDataset from kedro_datasets.pandas.generic_dataset import _DEPRECATED_CLASSES @@ -97,7 +98,9 @@ def dummy_dataframe(): ) @pytest.mark.parametrize("class_name", _DEPRECATED_CLASSES) def test_deprecation(module_name, class_name): - with pytest.warns(DeprecationWarning, match=f"{repr(class_name)} has been renamed"): + with pytest.warns( + KedroDeprecationWarning, match=f"{repr(class_name)} has been renamed" + ): getattr(importlib.import_module(module_name), class_name) diff --git a/kedro-datasets/tests/pandas/test_hdf_dataset.py b/kedro-datasets/tests/pandas/test_hdf_dataset.py index 74b3fee86..c43528e6a 100644 --- a/kedro-datasets/tests/pandas/test_hdf_dataset.py +++ b/kedro-datasets/tests/pandas/test_hdf_dataset.py @@ -10,6 +10,7 @@ from pandas.testing import assert_frame_equal from s3fs.core import S3FileSystem +from kedro_datasets import KedroDeprecationWarning from kedro_datasets._io import DatasetError from kedro_datasets.pandas import HDFDataset from kedro_datasets.pandas.hdf_dataset import _DEPRECATED_CLASSES @@ -51,7 +52,9 @@ def dummy_dataframe(): ) @pytest.mark.parametrize("class_name", _DEPRECATED_CLASSES) def test_deprecation(module_name, 
class_name): - with pytest.warns(DeprecationWarning, match=f"{repr(class_name)} has been renamed"): + with pytest.warns( + KedroDeprecationWarning, match=f"{repr(class_name)} has been renamed" + ): getattr(importlib.import_module(module_name), class_name) diff --git a/kedro-datasets/tests/pandas/test_json_dataset.py b/kedro-datasets/tests/pandas/test_json_dataset.py index 0b246b3fe..e20366eaf 100644 --- a/kedro-datasets/tests/pandas/test_json_dataset.py +++ b/kedro-datasets/tests/pandas/test_json_dataset.py @@ -11,6 +11,7 @@ from pandas.testing import assert_frame_equal from s3fs.core import S3FileSystem +from kedro_datasets import KedroDeprecationWarning from kedro_datasets._io import DatasetError from kedro_datasets.pandas import JSONDataset from kedro_datasets.pandas.json_dataset import _DEPRECATED_CLASSES @@ -48,7 +49,9 @@ def dummy_dataframe(): ) @pytest.mark.parametrize("class_name", _DEPRECATED_CLASSES) def test_deprecation(module_name, class_name): - with pytest.warns(DeprecationWarning, match=f"{repr(class_name)} has been renamed"): + with pytest.warns( + KedroDeprecationWarning, match=f"{repr(class_name)} has been renamed" + ): getattr(importlib.import_module(module_name), class_name) diff --git a/kedro-datasets/tests/pandas/test_parquet_dataset.py b/kedro-datasets/tests/pandas/test_parquet_dataset.py index 64a497725..83f0695fb 100644 --- a/kedro-datasets/tests/pandas/test_parquet_dataset.py +++ b/kedro-datasets/tests/pandas/test_parquet_dataset.py @@ -11,6 +11,7 @@ from pyarrow.fs import FSSpecHandler, PyFileSystem from s3fs.core import S3FileSystem +from kedro_datasets import KedroDeprecationWarning from kedro_datasets._io import DatasetError from kedro_datasets.pandas import ParquetDataset from kedro_datasets.pandas.parquet_dataset import _DEPRECATED_CLASSES @@ -50,7 +51,9 @@ def dummy_dataframe(): ) @pytest.mark.parametrize("class_name", _DEPRECATED_CLASSES) def test_deprecation(module_name, class_name): - with pytest.warns(DeprecationWarning, match=f"{repr(class_name)} has been renamed"): + with pytest.warns( + KedroDeprecationWarning, match=f"{repr(class_name)} has been renamed" + ): getattr(importlib.import_module(module_name), class_name) diff --git a/kedro-datasets/tests/pandas/test_sql_dataset.py b/kedro-datasets/tests/pandas/test_sql_dataset.py index 26f7e0bd4..a90cff0b7 100644 --- a/kedro-datasets/tests/pandas/test_sql_dataset.py +++ b/kedro-datasets/tests/pandas/test_sql_dataset.py @@ -6,6 +6,7 @@ import pytest import sqlalchemy +from kedro_datasets import KedroDeprecationWarning from kedro_datasets._io import DatasetError from kedro_datasets.pandas import SQLQueryDataset, SQLTableDataset from kedro_datasets.pandas.sql_dataset import _DEPRECATED_CLASSES @@ -66,7 +67,9 @@ def query_file_dataset(request, sql_file): ) @pytest.mark.parametrize("class_name", _DEPRECATED_CLASSES) def test_deprecation(module_name, class_name): - with pytest.warns(DeprecationWarning, match=f"{repr(class_name)} has been renamed"): + with pytest.warns( + KedroDeprecationWarning, match=f"{repr(class_name)} has been renamed" + ): getattr(importlib.import_module(module_name), class_name) diff --git a/kedro-datasets/tests/pandas/test_xml_dataset.py b/kedro-datasets/tests/pandas/test_xml_dataset.py index 9a54174e4..345dfcdbd 100644 --- a/kedro-datasets/tests/pandas/test_xml_dataset.py +++ b/kedro-datasets/tests/pandas/test_xml_dataset.py @@ -11,6 +11,7 @@ from pandas.testing import assert_frame_equal from s3fs.core import S3FileSystem +from kedro_datasets import KedroDeprecationWarning from 
kedro_datasets._io import DatasetError from kedro_datasets.pandas import XMLDataset from kedro_datasets.pandas.xml_dataset import _DEPRECATED_CLASSES @@ -48,7 +49,9 @@ def dummy_dataframe(): ) @pytest.mark.parametrize("class_name", _DEPRECATED_CLASSES) def test_deprecation(module_name, class_name): - with pytest.warns(DeprecationWarning, match=f"{repr(class_name)} has been renamed"): + with pytest.warns( + KedroDeprecationWarning, match=f"{repr(class_name)} has been renamed" + ): getattr(importlib.import_module(module_name), class_name) diff --git a/kedro-datasets/tests/pickle/test_pickle_dataset.py b/kedro-datasets/tests/pickle/test_pickle_dataset.py index 4cc547e90..be09d6291 100644 --- a/kedro-datasets/tests/pickle/test_pickle_dataset.py +++ b/kedro-datasets/tests/pickle/test_pickle_dataset.py @@ -11,6 +11,7 @@ from pandas.testing import assert_frame_equal from s3fs.core import S3FileSystem +from kedro_datasets import KedroDeprecationWarning from kedro_datasets._io import DatasetError from kedro_datasets.pickle import PickleDataset from kedro_datasets.pickle.pickle_dataset import _DEPRECATED_CLASSES @@ -54,7 +55,9 @@ def dummy_dataframe(): ) @pytest.mark.parametrize("class_name", _DEPRECATED_CLASSES) def test_deprecation(module_name, class_name): - with pytest.warns(DeprecationWarning, match=f"{repr(class_name)} has been renamed"): + with pytest.warns( + KedroDeprecationWarning, match=f"{repr(class_name)} has been renamed" + ): getattr(importlib.import_module(module_name), class_name) diff --git a/kedro-datasets/tests/pillow/test_image_dataset.py b/kedro-datasets/tests/pillow/test_image_dataset.py index e2c970835..8ce6bf825 100644 --- a/kedro-datasets/tests/pillow/test_image_dataset.py +++ b/kedro-datasets/tests/pillow/test_image_dataset.py @@ -9,6 +9,7 @@ from PIL import Image, ImageChops from s3fs.core import S3FileSystem +from kedro_datasets import KedroDeprecationWarning from kedro_datasets._io import DatasetError from kedro_datasets.pillow import ImageDataset from kedro_datasets.pillow.image_dataset import _DEPRECATED_CLASSES @@ -47,7 +48,9 @@ def images_equal(image_1, image_2): ) @pytest.mark.parametrize("class_name", _DEPRECATED_CLASSES) def test_deprecation(module_name, class_name): - with pytest.warns(DeprecationWarning, match=f"{repr(class_name)} has been renamed"): + with pytest.warns( + KedroDeprecationWarning, match=f"{repr(class_name)} has been renamed" + ): getattr(importlib.import_module(module_name), class_name) diff --git a/kedro-datasets/tests/plotly/test_json_dataset.py b/kedro-datasets/tests/plotly/test_json_dataset.py index 52cda8d07..2525a9a73 100644 --- a/kedro-datasets/tests/plotly/test_json_dataset.py +++ b/kedro-datasets/tests/plotly/test_json_dataset.py @@ -10,6 +10,7 @@ from kedro.io.core import PROTOCOL_DELIMITER from s3fs.core import S3FileSystem +from kedro_datasets import KedroDeprecationWarning from kedro_datasets._io import DatasetError from kedro_datasets.plotly import JSONDataset from kedro_datasets.plotly.json_dataset import _DEPRECATED_CLASSES @@ -40,7 +41,9 @@ def dummy_plot(): ) @pytest.mark.parametrize("class_name", _DEPRECATED_CLASSES) def test_deprecation(module_name, class_name): - with pytest.warns(DeprecationWarning, match=f"{repr(class_name)} has been renamed"): + with pytest.warns( + KedroDeprecationWarning, match=f"{repr(class_name)} has been renamed" + ): getattr(importlib.import_module(module_name), class_name) diff --git a/kedro-datasets/tests/plotly/test_plotly_dataset.py b/kedro-datasets/tests/plotly/test_plotly_dataset.py index 
9a7c9d3a1..4f43b34ee 100644 --- a/kedro-datasets/tests/plotly/test_plotly_dataset.py +++ b/kedro-datasets/tests/plotly/test_plotly_dataset.py @@ -12,6 +12,7 @@ from plotly.graph_objs import Scatter from s3fs.core import S3FileSystem +from kedro_datasets import KedroDeprecationWarning from kedro_datasets._io import DatasetError from kedro_datasets.plotly import PlotlyDataset from kedro_datasets.plotly.plotly_dataset import _DEPRECATED_CLASSES @@ -52,7 +53,9 @@ def dummy_dataframe(): ) @pytest.mark.parametrize("class_name", _DEPRECATED_CLASSES) def test_deprecation(module_name, class_name): - with pytest.warns(DeprecationWarning, match=f"{repr(class_name)} has been renamed"): + with pytest.warns( + KedroDeprecationWarning, match=f"{repr(class_name)} has been renamed" + ): getattr(importlib.import_module(module_name), class_name) diff --git a/kedro-datasets/tests/polars/test_csv_dataset.py b/kedro-datasets/tests/polars/test_csv_dataset.py index e0519dd46..9226a89a1 100644 --- a/kedro-datasets/tests/polars/test_csv_dataset.py +++ b/kedro-datasets/tests/polars/test_csv_dataset.py @@ -16,6 +16,7 @@ from polars.testing import assert_frame_equal from s3fs.core import S3FileSystem +from kedro_datasets import KedroDeprecationWarning from kedro_datasets._io import DatasetError from kedro_datasets.polars import CSVDataset from kedro_datasets.polars.csv_dataset import _DEPRECATED_CLASSES @@ -95,7 +96,9 @@ def mocked_csv_in_s3(mocked_s3_bucket, mocked_dataframe: pl.DataFrame): ) @pytest.mark.parametrize("class_name", _DEPRECATED_CLASSES) def test_deprecation(module_name, class_name): - with pytest.warns(DeprecationWarning, match=f"{repr(class_name)} has been renamed"): + with pytest.warns( + KedroDeprecationWarning, match=f"{repr(class_name)} has been renamed" + ): getattr(importlib.import_module(module_name), class_name) diff --git a/kedro-datasets/tests/polars/test_generic_dataset.py b/kedro-datasets/tests/polars/test_generic_dataset.py index 2c7769b14..b300cfd78 100644 --- a/kedro-datasets/tests/polars/test_generic_dataset.py +++ b/kedro-datasets/tests/polars/test_generic_dataset.py @@ -14,6 +14,7 @@ from polars.testing import assert_frame_equal from s3fs import S3FileSystem +from kedro_datasets import KedroDeprecationWarning from kedro_datasets._io import DatasetError from kedro_datasets.polars import GenericDataset from kedro_datasets.polars.generic_dataset import _DEPRECATED_CLASSES @@ -109,7 +110,9 @@ def excel_dataset(dummy_dataframe: pl.DataFrame, filepath_excel): ) @pytest.mark.parametrize("class_name", _DEPRECATED_CLASSES) def test_deprecation(module_name, class_name): - with pytest.warns(DeprecationWarning, match=f"{repr(class_name)} has been renamed"): + with pytest.warns( + KedroDeprecationWarning, match=f"{repr(class_name)} has been renamed" + ): getattr(importlib.import_module(module_name), class_name) diff --git a/kedro-datasets/tests/redis/test_redis_dataset.py b/kedro-datasets/tests/redis/test_redis_dataset.py index 8b879edd6..a2ec3bf83 100644 --- a/kedro-datasets/tests/redis/test_redis_dataset.py +++ b/kedro-datasets/tests/redis/test_redis_dataset.py @@ -8,6 +8,7 @@ import redis from pandas.testing import assert_frame_equal +from kedro_datasets import KedroDeprecationWarning from kedro_datasets._io import DatasetError from kedro_datasets.redis import PickleDataset from kedro_datasets.redis.redis_dataset import _DEPRECATED_CLASSES @@ -63,7 +64,9 @@ def pickle_data_set(mocker, key, backend, load_args, save_args, redis_args): ) @pytest.mark.parametrize("class_name", _DEPRECATED_CLASSES) 
def test_deprecation(module_name, class_name): - with pytest.warns(DeprecationWarning, match=f"{repr(class_name)} has been renamed"): + with pytest.warns( + KedroDeprecationWarning, match=f"{repr(class_name)} has been renamed" + ): getattr(importlib.import_module(module_name), class_name) diff --git a/kedro-datasets/tests/snowflake/test_snowpark_dataset.py b/kedro-datasets/tests/snowflake/test_snowpark_dataset.py index 1423fbc12..4d5e473e9 100644 --- a/kedro-datasets/tests/snowflake/test_snowpark_dataset.py +++ b/kedro-datasets/tests/snowflake/test_snowpark_dataset.py @@ -4,6 +4,7 @@ import pytest +from kedro_datasets import KedroDeprecationWarning from kedro_datasets._io import DatasetError try: @@ -147,7 +148,9 @@ def sf_session(): @pytest.mark.parametrize("class_name", _DEPRECATED_CLASSES) @pytest.mark.snowflake def test_deprecation(module_name, class_name): - with pytest.warns(DeprecationWarning, match=f"{repr(class_name)} has been renamed"): + with pytest.warns( + KedroDeprecationWarning, match=f"{repr(class_name)} has been renamed" + ): getattr(importlib.import_module(module_name), class_name) diff --git a/kedro-datasets/tests/spark/test_deltatable_dataset.py b/kedro-datasets/tests/spark/test_deltatable_dataset.py index cc2d57adc..58940f5ce 100644 --- a/kedro-datasets/tests/spark/test_deltatable_dataset.py +++ b/kedro-datasets/tests/spark/test_deltatable_dataset.py @@ -12,6 +12,7 @@ from pyspark.sql.types import IntegerType, StringType, StructField, StructType from pyspark.sql.utils import AnalysisException +from kedro_datasets import KedroDeprecationWarning from kedro_datasets._io import DatasetError from kedro_datasets.spark import DeltaTableDataset, SparkDataset from kedro_datasets.spark.deltatable_dataset import _DEPRECATED_CLASSES @@ -38,7 +39,9 @@ def sample_spark_df(): ) @pytest.mark.parametrize("class_name", _DEPRECATED_CLASSES) def test_deprecation(module_name, class_name): - with pytest.warns(DeprecationWarning, match=f"{repr(class_name)} has been renamed"): + with pytest.warns( + KedroDeprecationWarning, match=f"{repr(class_name)} has been renamed" + ): getattr(importlib.import_module(module_name), class_name) diff --git a/kedro-datasets/tests/spark/test_spark_dataset.py b/kedro-datasets/tests/spark/test_spark_dataset.py index 393b401f5..7970b4ce9 100644 --- a/kedro-datasets/tests/spark/test_spark_dataset.py +++ b/kedro-datasets/tests/spark/test_spark_dataset.py @@ -26,6 +26,7 @@ ) from pyspark.sql.utils import AnalysisException +from kedro_datasets import KedroDeprecationWarning from kedro_datasets._io import DatasetError from kedro_datasets.pandas import CSVDataset, ParquetDataset from kedro_datasets.pickle import PickleDataset @@ -177,7 +178,9 @@ def isDir(self): ) @pytest.mark.parametrize("class_name", _DEPRECATED_CLASSES) def test_deprecation(module_name, class_name): - with pytest.warns(DeprecationWarning, match=f"{repr(class_name)} has been renamed"): + with pytest.warns( + KedroDeprecationWarning, match=f"{repr(class_name)} has been renamed" + ): getattr(importlib.import_module(module_name), class_name) diff --git a/kedro-datasets/tests/spark/test_spark_hive_dataset.py b/kedro-datasets/tests/spark/test_spark_hive_dataset.py index e33ca5cce..202d1ade8 100644 --- a/kedro-datasets/tests/spark/test_spark_hive_dataset.py +++ b/kedro-datasets/tests/spark/test_spark_hive_dataset.py @@ -10,6 +10,7 @@ from pyspark.sql import SparkSession from pyspark.sql.types import IntegerType, StringType, StructField, StructType +from kedro_datasets import KedroDeprecationWarning from 
kedro_datasets._io import DatasetError from kedro_datasets.spark import SparkHiveDataset from kedro_datasets.spark.spark_hive_dataset import _DEPRECATED_CLASSES @@ -139,7 +140,9 @@ def _generate_spark_df_upsert_expected(): ) @pytest.mark.parametrize("class_name", _DEPRECATED_CLASSES) def test_deprecation(module_name, class_name): - with pytest.warns(DeprecationWarning, match=f"{repr(class_name)} has been renamed"): + with pytest.warns( + KedroDeprecationWarning, match=f"{repr(class_name)} has been renamed" + ): getattr(importlib.import_module(module_name), class_name) diff --git a/kedro-datasets/tests/spark/test_spark_jdbc_dataset.py b/kedro-datasets/tests/spark/test_spark_jdbc_dataset.py index 9f869cf1d..e9bb33ddb 100644 --- a/kedro-datasets/tests/spark/test_spark_jdbc_dataset.py +++ b/kedro-datasets/tests/spark/test_spark_jdbc_dataset.py @@ -2,6 +2,7 @@ import pytest +from kedro_datasets import KedroDeprecationWarning from kedro_datasets._io import DatasetError from kedro_datasets.spark import SparkJDBCDataset from kedro_datasets.spark.spark_jdbc_dataset import _DEPRECATED_CLASSES @@ -41,7 +42,9 @@ def spark_jdbc_args_save_load(spark_jdbc_args): ) @pytest.mark.parametrize("class_name", _DEPRECATED_CLASSES) def test_deprecation(module_name, class_name): - with pytest.warns(DeprecationWarning, match=f"{repr(class_name)} has been renamed"): + with pytest.warns( + KedroDeprecationWarning, match=f"{repr(class_name)} has been renamed" + ): getattr(importlib.import_module(module_name), class_name) diff --git a/kedro-datasets/tests/spark/test_spark_streaming_dataset.py b/kedro-datasets/tests/spark/test_spark_streaming_dataset.py index cb36fb7a4..d199df812 100644 --- a/kedro-datasets/tests/spark/test_spark_streaming_dataset.py +++ b/kedro-datasets/tests/spark/test_spark_streaming_dataset.py @@ -10,6 +10,7 @@ from pyspark.sql.types import IntegerType, StringType, StructField, StructType from pyspark.sql.utils import AnalysisException +from kedro_datasets import KedroDeprecationWarning from kedro_datasets._io import DatasetError from kedro_datasets.spark import SparkDataset, SparkStreamingDataset from kedro_datasets.spark.spark_streaming_dataset import _DEPRECATED_CLASSES @@ -96,7 +97,9 @@ def mocked_s3_schema(tmp_path, mocked_s3_bucket, sample_spark_df_schema: StructT ) @pytest.mark.parametrize("class_name", _DEPRECATED_CLASSES) def test_deprecation(module_name, class_name): - with pytest.warns(DeprecationWarning, match=f"{repr(class_name)} has been renamed"): + with pytest.warns( + KedroDeprecationWarning, match=f"{repr(class_name)} has been renamed" + ): getattr(importlib.import_module(module_name), class_name) diff --git a/kedro-datasets/tests/svmlight/test_svmlight_dataset.py b/kedro-datasets/tests/svmlight/test_svmlight_dataset.py index c16555c8f..63596d6d5 100644 --- a/kedro-datasets/tests/svmlight/test_svmlight_dataset.py +++ b/kedro-datasets/tests/svmlight/test_svmlight_dataset.py @@ -9,6 +9,7 @@ from kedro.io.core import PROTOCOL_DELIMITER, Version from s3fs.core import S3FileSystem +from kedro_datasets import KedroDeprecationWarning from kedro_datasets._io import DatasetError from kedro_datasets.svmlight import SVMLightDataset from kedro_datasets.svmlight.svmlight_dataset import _DEPRECATED_CLASSES @@ -46,7 +47,9 @@ def dummy_data(): ) @pytest.mark.parametrize("class_name", _DEPRECATED_CLASSES) def test_deprecation(module_name, class_name): - with pytest.warns(DeprecationWarning, match=f"{repr(class_name)} has been renamed"): + with pytest.warns( + KedroDeprecationWarning, 
match=f"{repr(class_name)} has been renamed" + ): getattr(importlib.import_module(module_name), class_name) diff --git a/kedro-datasets/tests/tensorflow/test_tensorflow_model_dataset.py b/kedro-datasets/tests/tensorflow/test_tensorflow_model_dataset.py index ffeafe321..bedaf8eab 100644 --- a/kedro-datasets/tests/tensorflow/test_tensorflow_model_dataset.py +++ b/kedro-datasets/tests/tensorflow/test_tensorflow_model_dataset.py @@ -9,6 +9,7 @@ from kedro.io.core import PROTOCOL_DELIMITER, Version from s3fs import S3FileSystem +from kedro_datasets import KedroDeprecationWarning from kedro_datasets._io import DatasetError @@ -140,7 +141,9 @@ def call(self, inputs, training=None, mask=None): # pragma: no cover ) @pytest.mark.parametrize("class_name", ["TensorFlowModelDataSet"]) def test_deprecation(module_name, class_name): - with pytest.warns(DeprecationWarning, match=f"{repr(class_name)} has been renamed"): + with pytest.warns( + KedroDeprecationWarning, match=f"{repr(class_name)} has been renamed" + ): getattr(importlib.import_module(module_name), class_name) diff --git a/kedro-datasets/tests/text/test_text_dataset.py b/kedro-datasets/tests/text/test_text_dataset.py index a6f173dfc..ae90dd343 100644 --- a/kedro-datasets/tests/text/test_text_dataset.py +++ b/kedro-datasets/tests/text/test_text_dataset.py @@ -8,6 +8,7 @@ from kedro.io.core import PROTOCOL_DELIMITER, Version from s3fs.core import S3FileSystem +from kedro_datasets import KedroDeprecationWarning from kedro_datasets._io import DatasetError from kedro_datasets.text import TextDataset from kedro_datasets.text.text_dataset import _DEPRECATED_CLASSES @@ -37,7 +38,9 @@ def versioned_txt_dataset(filepath_txt, load_version, save_version): ) @pytest.mark.parametrize("class_name", _DEPRECATED_CLASSES) def test_deprecation(module_name, class_name): - with pytest.warns(DeprecationWarning, match=f"{repr(class_name)} has been renamed"): + with pytest.warns( + KedroDeprecationWarning, match=f"{repr(class_name)} has been renamed" + ): getattr(importlib.import_module(module_name), class_name) diff --git a/kedro-datasets/tests/tracking/test_json_dataset.py b/kedro-datasets/tests/tracking/test_json_dataset.py index f22789469..218521349 100644 --- a/kedro-datasets/tests/tracking/test_json_dataset.py +++ b/kedro-datasets/tests/tracking/test_json_dataset.py @@ -8,6 +8,7 @@ from kedro.io.core import PROTOCOL_DELIMITER, Version from s3fs.core import S3FileSystem +from kedro_datasets import KedroDeprecationWarning from kedro_datasets._io import DatasetError from kedro_datasets.tracking import JSONDataset from kedro_datasets.tracking.json_dataset import _DEPRECATED_CLASSES @@ -40,7 +41,9 @@ def dummy_data(): ) @pytest.mark.parametrize("class_name", _DEPRECATED_CLASSES) def test_deprecation(module_name, class_name): - with pytest.warns(DeprecationWarning, match=f"{repr(class_name)} has been renamed"): + with pytest.warns( + KedroDeprecationWarning, match=f"{repr(class_name)} has been renamed" + ): getattr(importlib.import_module(module_name), class_name) diff --git a/kedro-datasets/tests/tracking/test_metrics_dataset.py b/kedro-datasets/tests/tracking/test_metrics_dataset.py index 2b50617e1..a78664756 100644 --- a/kedro-datasets/tests/tracking/test_metrics_dataset.py +++ b/kedro-datasets/tests/tracking/test_metrics_dataset.py @@ -8,6 +8,7 @@ from kedro.io.core import PROTOCOL_DELIMITER, Version from s3fs.core import S3FileSystem +from kedro_datasets import KedroDeprecationWarning from kedro_datasets._io import DatasetError from kedro_datasets.tracking 
import MetricsDataset from kedro_datasets.tracking.metrics_dataset import _DEPRECATED_CLASSES @@ -41,7 +42,9 @@ def dummy_data(): ) @pytest.mark.parametrize("class_name", _DEPRECATED_CLASSES) def test_deprecation(module_name, class_name): - with pytest.warns(DeprecationWarning, match=f"{repr(class_name)} has been renamed"): + with pytest.warns( + KedroDeprecationWarning, match=f"{repr(class_name)} has been renamed" + ): getattr(importlib.import_module(module_name), class_name) diff --git a/kedro-datasets/tests/video/test_video_dataset.py b/kedro-datasets/tests/video/test_video_dataset.py index 74c387889..94442aa1c 100644 --- a/kedro-datasets/tests/video/test_video_dataset.py +++ b/kedro-datasets/tests/video/test_video_dataset.py @@ -5,6 +5,7 @@ from moto import mock_s3 from utils import TEST_FPS, assert_videos_equal +from kedro_datasets import KedroDeprecationWarning from kedro_datasets._io import DatasetError from kedro_datasets.video import VideoDataset from kedro_datasets.video.video_dataset import ( @@ -58,7 +59,9 @@ def mocked_s3_bucket(): ) @pytest.mark.parametrize("class_name", _DEPRECATED_CLASSES) def test_deprecation(module_name, class_name): - with pytest.warns(DeprecationWarning, match=f"{repr(class_name)} has been renamed"): + with pytest.warns( + KedroDeprecationWarning, match=f"{repr(class_name)} has been renamed" + ): getattr(importlib.import_module(module_name), class_name) diff --git a/kedro-datasets/tests/yaml/test_yaml_dataset.py b/kedro-datasets/tests/yaml/test_yaml_dataset.py index b439d0e80..dfbc5d923 100644 --- a/kedro-datasets/tests/yaml/test_yaml_dataset.py +++ b/kedro-datasets/tests/yaml/test_yaml_dataset.py @@ -10,6 +10,7 @@ from pandas.testing import assert_frame_equal from s3fs.core import S3FileSystem +from kedro_datasets import KedroDeprecationWarning from kedro_datasets._io import DatasetError from kedro_datasets.yaml import YAMLDataset from kedro_datasets.yaml.yaml_dataset import _DEPRECATED_CLASSES @@ -42,7 +43,9 @@ def dummy_data(): ) @pytest.mark.parametrize("class_name", _DEPRECATED_CLASSES) def test_deprecation(module_name, class_name): - with pytest.warns(DeprecationWarning, match=f"{repr(class_name)} has been renamed"): + with pytest.warns( + KedroDeprecationWarning, match=f"{repr(class_name)} has been renamed" + ): getattr(importlib.import_module(module_name), class_name) From ecb00f53b66035cfce55563ccf0028d468390b52 Mon Sep 17 00:00:00 2001 From: Deepyaman Datta Date: Mon, 2 Oct 2023 22:08:31 -0500 Subject: [PATCH 03/15] docs(datasets): add note about DataSet deprecation (#357) --- kedro-datasets/RELEASE.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kedro-datasets/RELEASE.md b/kedro-datasets/RELEASE.md index 01596c95c..9c6661fda 100755 --- a/kedro-datasets/RELEASE.md +++ b/kedro-datasets/RELEASE.md @@ -1,6 +1,9 @@ # Upcoming Release ## Major features and improvements ## Bug fixes and other changes +## Upcoming deprecations for Kedro-Datasets 2.0.0 +* Renamed dataset and error classes, in accordance with the [Kedro lexicon](https://github.com/kedro-org/kedro/wiki/Kedro-documentation-style-guide#kedro-lexicon). Dataset classes ending with "DataSet" are deprecated and will be removed in 2.0.0. 
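The hunks above all make the same change: the parametrised deprecation tests now expect Kedro's own `KedroDeprecationWarning` rather than the built-in `DeprecationWarning`. As a standalone illustration of that pattern (not part of the patch; `CSVDataSet` is used here as one example of a "DataSet"-suffixed alias listed in a module's `_DEPRECATED_CLASSES`):

```python
import importlib

import pytest

from kedro_datasets import KedroDeprecationWarning


def test_csv_dataset_alias_is_deprecated():
    # Accessing the old "DataSet"-suffixed name should emit KedroDeprecationWarning,
    # mirroring the test_deprecation tests updated throughout this patch.
    with pytest.warns(KedroDeprecationWarning, match="'CSVDataSet' has been renamed"):
        getattr(importlib.import_module("kedro_datasets.pandas"), "CSVDataSet")
```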
+ ## Community contributions # Release 1.7.0: From c4dc6e424fc6c0934c56d129dbd5b870b667ed12 Mon Sep 17 00:00:00 2001 From: Deepyaman Datta Date: Wed, 4 Oct 2023 10:26:39 -0500 Subject: [PATCH 04/15] test(datasets): skip `tensorflow` tests on Windows (#363) Signed-off-by: Deepyaman Datta --- .../tests/tensorflow/test_tensorflow_model_dataset.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/kedro-datasets/tests/tensorflow/test_tensorflow_model_dataset.py b/kedro-datasets/tests/tensorflow/test_tensorflow_model_dataset.py index bedaf8eab..610618a1e 100644 --- a/kedro-datasets/tests/tensorflow/test_tensorflow_model_dataset.py +++ b/kedro-datasets/tests/tensorflow/test_tensorflow_model_dataset.py @@ -1,4 +1,5 @@ import importlib +import sys from pathlib import PurePosixPath import numpy as np @@ -12,6 +13,12 @@ from kedro_datasets import KedroDeprecationWarning from kedro_datasets._io import DatasetError +if sys.platform == "win32": + pytest.skip( + "TensorFlow tests have become inexplicably flaky in Windows CI", + allow_module_level=True, + ) + # In this test module, we wrap tensorflow and TensorFlowModelDataset imports into a module-scoped # fixtures to avoid them being evaluated immediately when a new test process is spawned. From b9c109f0a591b301dd0c666cd0e03de2eb226b9c Mon Sep 17 00:00:00 2001 From: Ankita Katiyar <110245118+ankatiyar@users.noreply.github.com> Date: Thu, 5 Oct 2023 14:44:02 +0100 Subject: [PATCH 05/15] ci: Pin `tables` version (#370) * Pin tables version Signed-off-by: Ankita Katiyar * Also fix kedro-airflow Signed-off-by: Ankita Katiyar * Revert trying to fix airflow Signed-off-by: Ankita Katiyar --------- Signed-off-by: Ankita Katiyar --- kedro-datasets/setup.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/kedro-datasets/setup.py b/kedro-datasets/setup.py index 1535d28dd..340ad5e67 100644 --- a/kedro-datasets/setup.py +++ b/kedro-datasets/setup.py @@ -37,7 +37,8 @@ def _collect_requirements(requires): "pandas.HDFDataSet": [ PANDAS, "tables~=3.6.0; platform_system == 'Windows'", - "tables~=3.6; platform_system != 'Windows'", + "tables~=3.6, <3.9; platform_system != 'Windows' and python_version<'3.9'", + "tables~=3.6; platform_system != 'Windows' and python_version>='3.9'", ], "pandas.JSONDataSet": [PANDAS], "pandas.ParquetDataSet": [PANDAS, "pyarrow>=6.0"], @@ -209,7 +210,8 @@ def _collect_requirements(requires): "SQLAlchemy~=1.2", "tables~=3.6.0; platform_system == 'Windows' and python_version<'3.8'", "tables~=3.8.0; platform_system == 'Windows' and python_version>='3.8'", # Import issues with python 3.8 with pytables pinning to 3.8.0 fixes this https://github.com/PyTables/PyTables/issues/933#issuecomment-1555917593 - "tables~=3.6; platform_system != 'Windows'", + "tables~=3.6, <3.9; platform_system != 'Windows' and python_version<'3.9'", + "tables~=3.6; platform_system != 'Windows' and python_version>='3.9'", "tensorflow-macos~=2.0; platform_system == 'Darwin' and platform_machine == 'arm64'", "tensorflow~=2.0; platform_system != 'Darwin' or platform_machine != 'arm64'", "triad>=0.6.7, <1.0", From 080c73a8b5dfd5688750d60081f3e754668a927b Mon Sep 17 00:00:00 2001 From: Merel Theisen <49397448+merelcht@users.noreply.github.com> Date: Fri, 6 Oct 2023 12:26:18 +0100 Subject: [PATCH 06/15] build(datasets): Release `1.7.1` (#378) Signed-off-by: Merel Theisen --- kedro-datasets/RELEASE.md | 6 ++++++ kedro-datasets/kedro_datasets/__init__.py | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/kedro-datasets/RELEASE.md 
b/kedro-datasets/RELEASE.md index 9c6661fda..0b7ac02cc 100755 --- a/kedro-datasets/RELEASE.md +++ b/kedro-datasets/RELEASE.md @@ -1,6 +1,12 @@ # Upcoming Release ## Major features and improvements ## Bug fixes and other changes +## Upcoming deprecations for Kedro-Datasets 2.0.0 + +# Release 1.7.1 +## Bug fixes and other changes +* Pin `tables` version on `kedro-datasets` for Python < 3.8. + ## Upcoming deprecations for Kedro-Datasets 2.0.0 * Renamed dataset and error classes, in accordance with the [Kedro lexicon](https://github.com/kedro-org/kedro/wiki/Kedro-documentation-style-guide#kedro-lexicon). Dataset classes ending with "DataSet" are deprecated and will be removed in 2.0.0. diff --git a/kedro-datasets/kedro_datasets/__init__.py b/kedro-datasets/kedro_datasets/__init__.py index 13f456ad3..60aa4afb2 100644 --- a/kedro-datasets/kedro_datasets/__init__.py +++ b/kedro-datasets/kedro_datasets/__init__.py @@ -1,7 +1,7 @@ """``kedro_datasets`` is where you can find all of Kedro's data connectors.""" __all__ = ["KedroDeprecationWarning"] -__version__ = "1.7.0" +__version__ = "1.7.1" import sys import warnings From 7cbc2654fcfbc51d92386cedfed05f27c0a8cf73 Mon Sep 17 00:00:00 2001 From: Ankita Katiyar <110245118+ankatiyar@users.noreply.github.com> Date: Fri, 6 Oct 2023 12:54:16 +0100 Subject: [PATCH 07/15] docs: Update CONTRIBUTING.md and add one for `kedro-datasets` (#379) Update CONTRIBUTING.md + add one for kedro-datasets Signed-off-by: Ankita Katiyar --- CONTRIBUTING.md | 7 +-- Makefile | 2 +- kedro-airflow/CONTRIBUTING.md | 23 +++---- kedro-datasets/CONTRIBUTING.md | 106 +++++++++++++++++++++++++++++++++ kedro-docker/CONTRIBUTING.md | 25 ++++---- 5 files changed, 135 insertions(+), 28 deletions(-) create mode 100644 kedro-datasets/CONTRIBUTING.md diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index b7b8abb6a..532de5048 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -15,20 +15,19 @@ We also curate a [GitHub repo that lists content created by the Kedro community] ## Contribute to the project There are quite a few ways to contribute to Kedro, such as answering questions about Kedro to help others, fixing a typo on the documentation, reporting a bug, reviewing pull requests or adding a feature. -ls Take a look at some of our [contribution suggestions on the Kedro GitHub Wiki](https://github.com/kedro-org/kedro/wiki/Contribute-to-Kedro)! ## Which plugin contributions are likely to be accepted? Dataset contributions to the [Kedro-Datasets](https://github.com/kedro-org/kedro-plugins/tree/main/kedro-datasets) plugin are the most frequently accepted, since they do not require any changes to the framework itself. -However, we accept contributions to any of the other [Kedro-Plugins](https://github.com/kedro-org/kedro-plugins) or the framework or `Kedro-Viz`. As a guide, contributions based on existing issues from the Kedro team, or issues that the team has deemed useful, are most likely to be accepted. Any contributions that affect fundamental changes to the Kedro Framework would require discussion first. In this case, we recommend opening an issue instead of a pull request. +However, we accept contributions to any of the other [Kedro-Plugins](https://github.com/kedro-org/kedro-plugins) or the framework or [`Kedro-Viz`](https://github.com/kedro-org/kedro-viz). As a guide, contributions based on existing issues from the Kedro team, or issues that the team has deemed useful, are most likely to be accepted. 
Any contributions that affect fundamental changes to the Kedro Framework would require discussion first. In this case, we recommend opening an issue instead of a pull request. -Make sure to check out the contributing guides for [Kedro-Docker](https://github.com/kedro-org/kedro-plugins/blob/main/kedro-docker/CONTRIBUTING.md) and [Kedro-Airflow](https://github.com/kedro-org/kedro-plugins/blob/main/kedro-airflow/CONTRIBUTING.md) if you intend to contribute to those specific plugins. +Make sure to check out the contributing guides for [Kedro-Docker](https://github.com/kedro-org/kedro-plugins/blob/main/kedro-docker/CONTRIBUTING.md), [Kedro-Datasets](https://github.com/kedro-org/kedro-plugins/blob/main/kedro-datasets/CONTRIBUTING.md) and [Kedro-Airflow](https://github.com/kedro-org/kedro-plugins/blob/main/kedro-airflow/CONTRIBUTING.md) if you intend to contribute to those specific plugins. ## Join the Technical Steering Committee Kedro is an incubating project in [LF AI & Data](https://lfaidata.foundation/), a sub-organisation within the Linux Foundation that focuses on open innovation within the data and AI space. -The project is governed by a group of maintainers, known as the Technical Steering Committee (TSC); read more about the structure of our TSC in our [Technical Charter](./kedro_technical_charter.pdf). +The project is governed by a group of maintainers, known as the Technical Steering Committee (TSC); read more about the structure of our TSC in our [Technical Charter](https://github.com/kedro-org/kedro/blob/main/kedro_technical_charter.pdf). We regularly invite community members to join the TSC and help define the future of the Kedro project. Read the [guidance on becoming a Kedro maintainer](https://docs.kedro.org/en/stable/contribution/technical_steering_committee.html) to understand the process of joining the TSC. diff --git a/Makefile b/Makefile index 03e74bec0..1ab21d7cc 100644 --- a/Makefile +++ b/Makefile @@ -37,7 +37,7 @@ clean: install-test-requirements: cd $(plugin) && pip install ".[test]" -install-pre-commit: install-test-requirements +install-pre-commit: pre-commit install --install-hooks uninstall-pre-commit: diff --git a/kedro-airflow/CONTRIBUTING.md b/kedro-airflow/CONTRIBUTING.md index 0d081ed7f..72f3cce1e 100644 --- a/kedro-airflow/CONTRIBUTING.md +++ b/kedro-airflow/CONTRIBUTING.md @@ -6,15 +6,13 @@ The following sections describe our vision and the contribution process. ## Code of conduct -The Kedro team pledges to foster and maintain a welcoming and friendly community in all of our spaces. All members of our community are expected to follow our [Code of Conduct](CODE_OF_CONDUCT.md) and we will do our best to enforce those principles and build a happy environment where everyone is treated with respect and dignity. +The Kedro team pledges to foster and maintain a welcoming and friendly community in all of our spaces. All members of our community are expected to follow our [Code of Conduct](CODE_OF_CONDUCT.md), and we will do our best to enforce those principles and build a happy environment where everyone is treated with respect and dignity. # Get started We use [GitHub Issues](https://github.com/kedro-org/kedro-plugins/issues) to keep track of known bugs. We keep a close eye on them and try to make it clear when we have an internal fix in progress. Before reporting a new issue, please do your best to ensure your problem hasn't already been reported. If so, it's often better to just leave a comment on an existing issue, rather than create a new one. 
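One practical effect of the Makefile change above is that `install-pre-commit` no longer pulls in the test requirements, so the two targets are now run explicitly one after the other. A minimal sketch of the resulting local setup, using kedro-airflow as the example plugin (the same commands are added to the contributing guides later in this patch):

```bash
# Install the plugin's test requirements (runs `pip install ".[test]"` inside the plugin)
make plugin=kedro-airflow install-test-requirements

# Install the pre-commit hooks (runs `pre-commit install --install-hooks`)
make install-pre-commit
```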
Old issues also can often include helpful tips and solutions to common problems. -If you are looking for help with your code, please consider posting a question on [Stack Overflow](https://stackoverflow.com/questions/tagged/kedro-airflow). If you tag it `kedro-airflow`, `kedro` and `python`, more people will see it and may be able to help. We are unable to provide individual support via email. In the interest of community engagement we also believe that help is much more valuable if it's shared publicly, so that more people can benefit from it. - -If you're over on Stack Overflow and want to boost your points, take a look at the `kedro-airflow` tag and see if you can help others out by sharing your knowledge. It's another great way to contribute. +If you are looking for help with your code, please consider posting a question on [our Slack organisation](https://slack.kedro.org/). You can post your questions to the `#questions` or the `#plugins-integrations` channel. Past questions and discussions from our Slack organisation are accessible on [Linen](https://linen-slack.kedro.org/). In the interest of community engagement we also believe that help is much more valuable if it's shared publicly, so that more people can benefit from it. If you have already checked the [existing issues](https://github.com/kedro-org/kedro-plugins/issues) on GitHub and are still convinced that you have found odd or erroneous behaviour then please file a [new issue](https://github.com/kedro-org/kedro-plugins/issues/new/choose). We have a template that helps you provide the necessary information we'll need in order to address your query. @@ -22,13 +20,13 @@ If you have already checked the [existing issues](https://github.com/kedro-org/k ### Suggest a new feature -If you have new ideas for Kedro-Airflow functionality then please open a [GitHub issue](https://github.com/kedro-org/kedro-plugins/issues) with the label `Type: Enhancement`. Please describe in your own words the feature you would like to see, why you need it, and how it should work. +If you have new ideas for Kedro-Airflow functionality then please open a [GitHub issue](https://github.com/kedro-org/kedro-plugins/issues) with the label `enhancement`. Please describe in your own words the feature you would like to see, why you need it, and how it should work. ### Contribute a new feature -If you're unsure where to begin contributing to Kedro-Airflow, please start by looking through the `good first issues` and `Request: Help Wanted` on [GitHub](https://github.com/kedro-org/kedro-plugins/issues). +If you're unsure where to begin contributing to Kedro-Airflow, please start by looking through the `good first issue` and `help wanted` on [GitHub](https://github.com/kedro-org/kedro-plugins/issues). -Typically, small contributions to Kedro-Airflow are more preferable due to an easier review process, but we accept any new features if they prove to be essential for the functioning of the plugin or if we believe that they are used by most projects. +Typically, small contributions to `kedro-airflow` are more preferable due to an easier review process, but we accept any new features if they prove to be essential for the functioning of the plugin or if we believe that they are used by most projects. ## Your first contribution @@ -69,15 +67,18 @@ We use a branching model that helps us keep track of branches in a logical, cons ## Plugin contribution process 1. Fork the project - 2. Develop your contribution in a new branch and open a PR against the `master` branch - 3. 
Make sure the CI builds are green (have a look at the section [Running checks locally](#running-checks-locally) below) - 4. Update the PR according to the reviewer's comments + 2. Develop your contribution in a new branch. + 3. Make sure all your commits are signed off by using `-s` flag with `git commit`. + 4. Open a PR against the `main` branch and sure that the PR title follows the [Conventional Commits specs](https://www.conventionalcommits.org/en/v1.0.0/) with the scope `(airflow)`. + 5. Make sure the CI builds are green (have a look at the section [Running checks locally](#running-checks-locally) below) + 6. Update the PR according to the reviewer's comments ## CI / CD and running checks locally To run E2E tests you need to install the test requirements which includes `behave`, do this using the following command: ```bash -pip install ".[test]" +make plugin=kedro-airflow install-test-requirements +make install-pre-commit ``` ### Running checks locally diff --git a/kedro-datasets/CONTRIBUTING.md b/kedro-datasets/CONTRIBUTING.md new file mode 100644 index 000000000..df8ee59cc --- /dev/null +++ b/kedro-datasets/CONTRIBUTING.md @@ -0,0 +1,106 @@ +# Introduction + + +Thank you for considering contributing to Kedro-Datasets! Kedro-Datasets is a collection of [Kedro's](https://github.com/kedro-org/kedro) data connectors. We welcome contributions in the form of pull requests, issues or code reviews. You can contribute new datasets, fix bugs in existing datasets, or simply send us spelling and grammar fixes or extra tests. Contribute anything that you think improves the community for us all! + +The following sections describe our vision and the contribution process. + +## Code of conduct + +The Kedro team pledges to foster and maintain a welcoming and friendly community in all of our spaces. All members of our community are expected to follow our [Code of Conduct](CODE_OF_CONDUCT.md), and we will do our best to enforce those principles and build a happy environment where everyone is treated with respect and dignity. + +# Get started + +We use [GitHub Issues](https://github.com/kedro-org/kedro-plugins/issues) to keep track of known bugs. We keep a close eye on them and try to make it clear when we have an internal fix in progress. Before reporting a new issue, please do your best to ensure your problem hasn't already been reported. If so, it's often better to just leave a comment on an existing issue, rather than create a new one. Old issues also can often include helpful tips and solutions to common problems. + +If you are looking for help with your code, please consider posting a question on [our Slack organisation](https://slack.kedro.org/). You can post your questions to the `#questions` channel. Past questions and discussions from our Slack organisation are accessible on [Linen](https://linen-slack.kedro.org/). In the interest of community engagement we also believe that help is much more valuable if it's shared publicly, so that more people can benefit from it. + +If you have already checked the [existing issues](https://github.com/kedro-org/kedro-plugins/issues) on GitHub and are still convinced that you have found odd or erroneous behaviour then please file a [new issue](https://github.com/kedro-org/kedro-plugins/issues/new/choose). We have a template that helps you provide the necessary information we'll need in order to address your query. 
+ +## Feature requests + +### Suggest a new feature + +If you have new ideas for Kedro-Datasets then please open a [GitHub issue](https://github.com/kedro-org/kedro-plugins/issues) with the label `enhancement`. Please describe in your own words the feature you would like to see, why you need it, and how it should work. + +### Contribute a new dataset + +If you're unsure where to begin contributing to Kedro-Datasets, please start by looking through the `good first issue` and `help wanted` on [GitHub](https://github.com/kedro-org/kedro-plugins/issues). +If you want to contribute a new dataset, read the [tutorial to create and contribute a custom dataset](https://docs.kedro.org/en/stable/data/how_to_create_a_custom_dataset.html) in the Kedro documentation. +Make sure to add the new dataset to `kedro_datasets.rst` so that it shows up in the API documentation and to `static/jsonschema/kedro-catalog-X.json` for IDE validation. + + +## Your first contribution + +Working on your first pull request? You can learn how from these resources: +* [First timers only](https://www.firsttimersonly.com/) +* [How to contribute to an open source project on GitHub](https://egghead.io/courses/how-to-contribute-to-an-open-source-project-on-github) + +### Guidelines + + - Aim for cross-platform compatibility on Windows, macOS and Linux + - We use [Anaconda](https://www.anaconda.com/distribution/) as a preferred virtual environment + - We use [SemVer](https://semver.org/) for versioning + +Our code is designed to be compatible with Python 3.6 onwards and our style guidelines are (in cascading order): + +* [PEP 8 conventions](https://www.python.org/dev/peps/pep-0008/) for all Python code +* [Google docstrings](https://google.github.io/styleguide/pyguide.html#38-comments-and-docstrings) for code comments +* [PEP 484 type hints](https://www.python.org/dev/peps/pep-0484/) for all user-facing functions / class methods e.g. + +``` +def count_truthy(elements: List[Any]) -> int: + return sum(1 for elem in elements if elem) +``` + +> *Note:* We only accept contributions under the [Apache 2.0](https://opensource.org/licenses/Apache-2.0) license, and you should have permission to share the submitted code. + +### Branching conventions + +We use a branching model that helps us keep track of branches in a logical, consistent way. All branches should have the hyphen-separated convention of: `/` e.g. `feature/awesome-new-feature` + +| Types of changes | Description | +| ---------------- | --------------------------------------------------------------------------- | +| `docs` | Changes to the documentation of the plugin | +| `feature` | Non-breaking change which adds functionality | +| `fix` | Non-breaking change which fixes an issue | +| `tests` | Changes to project unit (`tests/`) and / or integration (`features/`) tests | + +## Plugin contribution process + + 1. Fork the project + 2. Develop your contribution in a new branch. + 3. Make sure all your commits are signed off by using `-s` flag with `git commit`. + 4. Open a PR against the `main` branch and sure that the PR title follows the [Conventional Commits specs](https://www.conventionalcommits.org/en/v1.0.0/) with the scope `(datasets)`. + 5. Make sure the CI builds are green (have a look at the section [Running checks locally](#running-checks-locally) below) + 6. 
Update the PR according to the reviewer's comments + +## CI / CD and running checks locally +To run tests you need to install the test requirements, do this using the following command: + +```bash +make plugin=kedro-datasets install-test-requirements +make install-pre-commit +``` + + +### Running checks locally + +All checks run by our CI / CD pipeline can be run locally on your computer. + +#### Linting (`ruff` and `black`) + +```bash +make plugin=kedro-datasets lint +``` + +#### Unit tests, 100% coverage (`pytest`, `pytest-cov`) + +```bash +make plugin=kedro-datasets test +``` + +If the tests in `kedro-datasets/kedro_datasets/spark` are failing, and you are not planning to work on Spark related features, then you can run the reduced test suite that excludes them with this command: +```bash +make test-no-spark +``` diff --git a/kedro-docker/CONTRIBUTING.md b/kedro-docker/CONTRIBUTING.md index 57e92017a..216618325 100644 --- a/kedro-docker/CONTRIBUTING.md +++ b/kedro-docker/CONTRIBUTING.md @@ -6,15 +6,13 @@ The following sections describe our vision and the contribution process. ## Code of conduct -The Kedro team pledges to foster and maintain a welcoming and friendly community in all of our spaces. All members of our community are expected to follow our [Code of Conduct](CODE_OF_CONDUCT.md) and we will do our best to enforce those principles and build a happy environment where everyone is treated with respect and dignity. +The Kedro team pledges to foster and maintain a welcoming and friendly community in all of our spaces. All members of our community are expected to follow our [Code of Conduct](CODE_OF_CONDUCT.md), and we will do our best to enforce those principles and build a happy environment where everyone is treated with respect and dignity. # Get started We use [GitHub Issues](https://github.com/kedro-org/kedro-plugins/issues) to keep track of known bugs. We keep a close eye on them and try to make it clear when we have an internal fix in progress. Before reporting a new issue, please do your best to ensure your problem hasn't already been reported. If so, it's often better to just leave a comment on an existing issue, rather than create a new one. Old issues also can often include helpful tips and solutions to common problems. -If you are looking for help with your code, please consider posting a question on [Stack Overflow](https://stackoverflow.com/questions/tagged/kedro-docker). If you tag it `kedro-docker`, `kedro` and `python`, more people will see it and may be able to help. We are unable to provide individual support via email. In the interest of community engagement we also believe that help is much more valuable if it's shared publicly, so that more people can benefit from it. - -If you're over on Stack Overflow and want to boost your points, take a look at the `kedro-docker` tag and see if you can help others out by sharing your knowledge. It's another great way to contribute. +If you are looking for help with your code, please consider posting a question on [our Slack organisation](https://slack.kedro.org/). You can post your questions to the `#questions` or the `#plugins-integrations` channel. Past questions and discussions from our Slack organisation are accessible on [Linen](https://linen-slack.kedro.org/). In the interest of community engagement we also believe that help is much more valuable if it's shared publicly, so that more people can benefit from it. 
If you have already checked the [existing issues](https://github.com/kedro-org/kedro-plugins/issues) on GitHub and are still convinced that you have found odd or erroneous behaviour then please file a [new issue](https://github.com/kedro-org/kedro-plugins/issues/new/choose). We have a template that helps you provide the necessary information we'll need in order to address your query. @@ -22,11 +20,11 @@ If you have already checked the [existing issues](https://github.com/kedro-org/k ### Suggest a new feature -If you have new ideas for Kedro-Docker functionality then please open a [GitHub issue](https://github.com/kedro-org/kedro-plugins/issues) with the label `Type: Enhancement`. Please describe in your own words the feature you would like to see, why you need it, and how it should work. +If you have new ideas for Kedro-Docker functionality then please open a [GitHub issue](https://github.com/kedro-org/kedro-plugins/issues) with the label `enhancement`. Please describe in your own words the feature you would like to see, why you need it, and how it should work. ### Contribute a new feature -If you're unsure where to begin contributing to Kedro-Docker, please start by looking through the `good first issues` and `Request: Help Wanted` on [GitHub](https://github.com/kedro-org/kedro-plugins/issues). +If you're unsure where to begin contributing to Kedro-Docker, please start by looking through the `good first issue` and `help wanted` on [GitHub](https://github.com/kedro-org/kedro-plugins/issues). Typically, small contributions to Kedro-Docker are more preferable due to an easier review process, but we accept any new features if they prove to be essential for the functioning of the plugin or if we believe that they are used by most projects. @@ -53,7 +51,7 @@ def count_truthy(elements: List[Any]) -> int: return sum(1 for elem in elements if elem) ``` -> *Note:* We only accept contributions under the [Apache 2.0](https://opensource.org/licenses/Apache-2.0) license and you should have permission to share the submitted code. +> *Note:* We only accept contributions under the [Apache 2.0](https://opensource.org/licenses/Apache-2.0) license, and you should have permission to share the submitted code. ### Branching conventions @@ -69,15 +67,18 @@ We use a branching model that helps us keep track of branches in a logical, cons ## Plugin contribution process 1. Fork the project - 2. Develop your contribution in a new branch and open a PR against the `master` branch - 3. Make sure the CI builds are green (have a look at the section [Running checks locally](#running-checks-locally) below) - 4. Update the PR according to the reviewer's comments + 2. Develop your contribution in a new branch. + 3. Make sure all your commits are signed off by using `-s` flag with `git commit`. + 4. Open a PR against the `main` branch and sure that the PR title follows the [Conventional Commits specs](https://www.conventionalcommits.org/en/v1.0.0/) with the scope `(docker)`. + 5. Make sure the CI builds are green (have a look at the section [Running checks locally](#running-checks-locally) below) + 6. 
Update the PR according to the reviewer's comments ## CI / CD and running checks locally To run E2E tests you need to install the test requirements which includes `behave`, do this using the following command: ```bash -pip install ".[test]" +make plugin=kedro-docker install-test-requirements +make install-pre-commit ``` ### Running checks locally @@ -99,5 +100,5 @@ make plugin=kedro-docker test #### End-to-end tests (`behave`) ```bash -make plugin=kedro-docker e2e-tests +make plugin=kedro-docker e2e-tests ``` From 8b47e1da0cffe6ff90b6045a7896d78b67ccb672 Mon Sep 17 00:00:00 2001 From: Merel Theisen <49397448+merelcht@users.noreply.github.com> Date: Fri, 6 Oct 2023 14:15:51 +0100 Subject: [PATCH 08/15] ci(datasets): Run tensorflow tests separately from other dataset tests (#377) Signed-off-by: Merel Theisen --- .github/workflows/unit-tests.yml | 7 +++++-- Makefile | 5 +++++ kedro-datasets/pyproject.toml | 2 +- 3 files changed, 11 insertions(+), 3 deletions(-) diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index 5f479afa5..6e3e2ecb7 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -49,9 +49,12 @@ jobs: pip install ".[test]" - name: pip freeze run: pip freeze - - name: Run unit tests for Linux / all plugins - if: inputs.os != 'windows-latest' + - name: Run unit tests for Linux / kedro-airflow, kedro-docker, kedro-telemetry + if: inputs.os != 'windows-latest' && inputs.plugin != 'kedro-datasets' run: make plugin=${{ inputs.plugin }} test + - name: Run unit tests for Linux / kedro-datasets + if: inputs.os != 'windows-latest' && inputs.plugin == 'kedro-datasets' + run: make dataset-tests - name: Run unit tests for Windows / kedro-airflow, kedro-docker, kedro-telemetry if: inputs.os == 'windows-latest' && inputs.plugin != 'kedro-datasets' run: | diff --git a/Makefile b/Makefile index 1ab21d7cc..1d8d839a2 100644 --- a/Makefile +++ b/Makefile @@ -19,6 +19,11 @@ lint: test: cd $(plugin) && pytest tests --cov-config pyproject.toml --numprocesses 4 --dist loadfile +# Run test_tensorflow_model_dataset separately, because these tests are flaky when run as part of the full test-suite +dataset-tests: + cd kedro-datasets && pytest tests --cov-config pyproject.toml --numprocesses 4 --dist loadfile --ignore tests/tensorflow + cd kedro-datasets && pytest tests/tensorflow/test_tensorflow_model_dataset.py --no-cov + test-sequential: cd $(plugin) && pytest tests --cov-config pyproject.toml diff --git a/kedro-datasets/pyproject.toml b/kedro-datasets/pyproject.toml index d5be97bbc..e485149ed 100644 --- a/kedro-datasets/pyproject.toml +++ b/kedro-datasets/pyproject.toml @@ -31,7 +31,7 @@ version = {attr = "kedro_datasets.__version__"} [tool.coverage.report] fail_under = 100 show_missing = true -omit = ["tests/*", "kedro_datasets/holoviews/*", "kedro_datasets/snowflake/*"] +omit = ["tests/*", "kedro_datasets/holoviews/*", "kedro_datasets/snowflake/*", "kedro_datasets/tensorflow/*"] exclude_lines = ["pragma: no cover", "raise NotImplementedError"] [tool.pytest.ini_options] From 279da244fadc309c4deec87f29b65d084e2c5273 Mon Sep 17 00:00:00 2001 From: Simon Brugman Date: Mon, 9 Oct 2023 14:10:29 +0200 Subject: [PATCH 09/15] feat: Kedro-Airflow convert all pipelines option (#335) * feat: kedro airflow convert --all option Signed-off-by: Simon Brugman * docs: release docs Signed-off-by: Simon Brugman --------- Signed-off-by: Simon Brugman --- kedro-airflow/README.md | 5 +- kedro-airflow/RELEASE.md | 7 ++ kedro-airflow/kedro_airflow/plugin.py | 137 
++++++++++++------ kedro-airflow/tests/test_plugin.py | 39 ++++++++ 4 files changed, 125 insertions(+), 63 deletions(-) diff --git a/kedro-airflow/README.md b/kedro-airflow/README.md index b61ed141d..9cc006bb3 100644 --- a/kedro-airflow/README.md +++ b/kedro-airflow/README.md @@ -32,10 +32,12 @@ kedro airflow create This command will generate an Airflow DAG file located in the `airflow_dags/` directory in your project. You can pass a `--pipeline` flag to generate the DAG file for a specific Kedro pipeline and an `--env` flag to generate the DAG file for a specific Kedro environment. +Passing `--all` will convert all registered Kedro pipelines to Airflow DAGs. ### Step 2: Copy the DAG file to the Airflow DAGs folder. For more information about the DAGs folder, please visit [Airflow documentation](https://airflow.apache.org/docs/stable/concepts.html#dags). +The Airflow DAG configuration can be customized by editing this file. ### Step 3: Package and install the Kedro pipeline in the Airflow executor's environment @@ -101,8 +103,9 @@ For instance, if you would like to use the name `scheduler`, then change the fil CONFIG_LOADER_ARGS = { "config_patterns": {"airflow": ["scheduler*", "scheduler/**"]} } +``` -Follow Kedro's official documentation, to see how to add templating, custom resolvers etc. (https://docs.kedro.org/en/stable/configuration/advanced_configuration.html#how-to-do-templating-with-the-omegaconfigloader)[https://docs.kedro.org/en/stable/configuration/advanced_configuration.html#how-to-do-templating-with-the-omegaconfigloader] +Follow Kedro's [official documentation](https://docs.kedro.org/en/stable/configuration/advanced_configuration.html#how-to-do-templating-with-the-omegaconfigloader) to see how to add templating, custom resolvers, etc. #### What if I want to pass different arguments? diff --git a/kedro-airflow/RELEASE.md b/kedro-airflow/RELEASE.md index 0ea332f2b..32f705069 100755 --- a/kedro-airflow/RELEASE.md +++ b/kedro-airflow/RELEASE.md @@ -1,5 +1,12 @@ # Upcoming Release * Added support for Python 3.11 +* Added the `--all` CLI argument to `kedro-airflow` to convert all registered pipelines at once. +* Simplified the output of the `kedro airflow create` command. + +## Community contributions +Many thanks to the following Kedroids for contributing PRs to this release: + +* [sbrugman](https://github.com/sbrugman) # Release 0.6.0 * Change reference to `kedro.pipeline.Pipeline` object throughout test suite with `kedro.modular_pipeline.pipeline` factory. diff --git a/kedro-airflow/kedro_airflow/plugin.py b/kedro-airflow/kedro_airflow/plugin.py index 921643c8e..ba998dabc 100644 --- a/kedro-airflow/kedro_airflow/plugin.py +++ b/kedro-airflow/kedro_airflow/plugin.py @@ -18,7 +18,10 @@ from slugify import slugify PIPELINE_ARG_HELP = """Name of the registered pipeline to convert. -If not set, the '__default__' pipeline is used.""" +If not set, the '__default__' pipeline is used. This argument supports +passing multiple values using `--pipeline [p1] --pipeline [p2]`.
+Use the `--all` flag to convert all registered pipelines at once.""" +ALL_ARG_HELP = """Convert all registered pipelines at once.""" @click.group(name="Kedro-Airflow") @@ -32,7 +35,7 @@ def airflow_commands(): pass -def _load_config(context: KedroContext, pipeline_name: str) -> dict[str, Any]: +def _load_config(context: KedroContext) -> dict[str, Any]: # Set the default pattern for `airflow` if not provided in `settings.py` if "airflow" not in context.config_loader.config_patterns.keys(): context.config_loader.config_patterns.update( # pragma: no cover @@ -43,11 +46,13 @@ def _load_config(context: KedroContext, pipeline_name: str) -> dict[str, Any]: # Load the config try: - config_airflow = context.config_loader["airflow"] + return context.config_loader["airflow"] except MissingConfigException: # File does not exist return {} + +def _get_pipeline_config(config_airflow: dict, params: dict, pipeline_name: str): dag_config = {} # Load the default config if specified if "default" in config_airflow: @@ -55,13 +60,23 @@ def _load_config(context: KedroContext, pipeline_name: str) -> dict[str, Any]: # Update with pipeline-specific config if present if pipeline_name in config_airflow: dag_config.update(config_airflow[pipeline_name]) + + # Update with params if provided + dag_config.update(params) return dag_config @airflow_commands.command() @click.option( - "-p", "--pipeline", "pipeline_name", default="__default__", help=PIPELINE_ARG_HELP + "-p", + "--pipeline", + "--pipelines", + "pipeline_names", + multiple=True, + default=("__default__",), + help=PIPELINE_ARG_HELP, ) +@click.option("--all", "convert_all", is_flag=True, help=ALL_ARG_HELP) @click.option("-e", "--env", default="local", help=ENV_HELP) @click.option( "-t", @@ -90,21 +105,24 @@ def _load_config(context: KedroContext, pipeline_name: str) -> dict[str, Any]: @click.pass_obj def create( # noqa: PLR0913 metadata: ProjectMetadata, - pipeline_name, + pipeline_names, env, target_path, jinja_file, params, + convert_all: bool, ): """Create an Airflow DAG for a project""" + if convert_all and pipeline_names != ("__default__",): + raise click.BadParameter( + "The `--all` and `--pipeline` option are mutually exclusive." 
+ ) + project_path = Path.cwd().resolve() bootstrap_project(project_path) with KedroSession.create(project_path=project_path, env=env) as session: context = session.load_context() - dag_config = _load_config(context, pipeline_name) - - # Update with params if provided - dag_config.update(params) + config_airflow = _load_config(context) jinja_file = Path(jinja_file).resolve() loader = jinja2.FileSystemLoader(jinja_file.parent) @@ -112,57 +130,52 @@ def create( # noqa: PLR0913 jinja_env.filters["slugify"] = slugify template = jinja_env.get_template(jinja_file.name) + dags_folder = Path(target_path) + # Ensure that the DAGs folder exists + dags_folder.mkdir(parents=True, exist_ok=True) + secho(f"Location of the Airflow DAG folder: {target_path!s}", fg="green") + package_name = metadata.package_name - dag_filename = ( - f"{package_name}_dag.py" - if pipeline_name == "__default__" - else f"{package_name}_{pipeline_name}_dag.py" - ) - - target_path = Path(target_path) - target_path = target_path / dag_filename - - target_path.parent.mkdir(parents=True, exist_ok=True) - - pipeline = pipelines.get(pipeline_name) - if pipeline is None: - raise KedroCliError(f"Pipeline {pipeline_name} not found.") - - dependencies = defaultdict(list) - for node, parent_nodes in pipeline.node_dependencies.items(): - for parent in parent_nodes: - dependencies[parent].append(node) - - template.stream( - dag_name=package_name, - dependencies=dependencies, - env=env, - pipeline_name=pipeline_name, - package_name=package_name, - pipeline=pipeline, - **dag_config, - ).dump(str(target_path)) - - secho("") - secho("An Airflow DAG has been generated in:", fg="green") - secho(str(target_path)) - secho("This file should be copied to your Airflow DAG folder.", fg="yellow") - secho( - "The Airflow configuration can be customized by editing this file.", - fg="green", - ) - secho("") - secho( - "This file also contains the path to the config directory, this directory will need to " - "be available to Airflow and any workers.", - fg="yellow", - ) - secho("") - secho( - "Additionally all data sets must have an entry in the data catalog.", - fg="yellow", - ) - secho( - "And all local paths in both the data catalog and log config must be absolute paths.", - fg="yellow", - ) + + if convert_all: + # Convert all pipelines + conversion_pipelines = pipelines + else: + conversion_pipelines = { + pipeline_name: pipelines.get(pipeline_name) + for pipeline_name in pipeline_names + } + + # Convert selected pipelines + for name, pipeline in conversion_pipelines.items(): + dag_config = _get_pipeline_config(config_airflow, params, name) + + if pipeline is None: + raise KedroCliError(f"Pipeline {name} not found.") + + # Obtain the file name + dag_filename = dags_folder / ( + f"{package_name}_dag.py" + if name == "__default__" + else f"{package_name}_{name}_dag.py" + ) + + dependencies = defaultdict(list) + for node, parent_nodes in pipeline.node_dependencies.items(): + for parent in parent_nodes: + dependencies[parent].append(node) + + template.stream( + dag_name=package_name, + dependencies=dependencies, + env=env, + pipeline_name=name, + package_name=package_name, + pipeline=pipeline, + **dag_config, + ).dump(str(dag_filename)) + + secho( + f"Converted pipeline `{name}` to Airflow DAG in the file `{dag_filename.name}`", + fg="green", + ) diff --git a/kedro-airflow/tests/test_plugin.py b/kedro-airflow/tests/test_plugin.py index 2bcdde472..1d282f0c3 100644 --- a/kedro-airflow/tests/test_plugin.py +++ b/kedro-airflow/tests/test_plugin.py @@ -225,3 
+225,42 @@ def test_create_airflow_dag_nonexistent_pipeline(cli_runner, metadata): "kedro.framework.cli.utils.KedroCliError: Pipeline de not found." in result.stdout ) + + +def test_create_airflow_all_dags(cli_runner, metadata): + command = ["airflow", "create", "--all"] + result = cli_runner.invoke(commands, command, obj=metadata) + + assert result.exit_code == 0, (result.exit_code, result.stdout) + print(result.stdout) + + for dag_name, pipeline_name in [ + ("hello_world", "__default__"), + ("hello_world", "ds"), + ]: + dag_file = ( + Path.cwd() + / "airflow_dags" + / ( + f"{dag_name}_dag.py" + if pipeline_name == "__default__" + else f"{dag_name}_{pipeline_name}_dag.py" + ) + ) + assert dag_file.exists() + + expected_airflow_dag = 'tasks["node0"] >> tasks["node1"]' + with dag_file.open(encoding="utf-8") as f: + dag_code = [line.strip() for line in f.read().splitlines()] + assert expected_airflow_dag in dag_code + dag_file.unlink() + + +def test_create_airflow_all_and_pipeline(cli_runner, metadata): + command = ["airflow", "create", "--all", "-p", "ds"] + result = cli_runner.invoke(commands, command, obj=metadata) + assert result.exit_code == 2 + assert ( + "Error: Invalid value: The `--all` and `--pipeline` option are mutually exclusive." + in result.stdout + ) From 78913d8d9fd78f80a58bc6dc27427831c4bab95d Mon Sep 17 00:00:00 2001 From: Deepyaman Datta Date: Tue, 10 Oct 2023 02:51:34 -0600 Subject: [PATCH 10/15] docs(datasets): blacken code in rst literal blocks (#362) Signed-off-by: Deepyaman Datta --- .pre-commit-config.yaml | 9 +++ .../kedro_datasets/api/api_dataset.py | 14 ++--- .../biosequence/biosequence_dataset.py | 12 ++-- .../kedro_datasets/dask/parquet_dataset.py | 14 ++--- .../databricks/managed_table_dataset.py | 11 ++-- .../kedro_datasets/email/message_dataset.py | 3 +- .../geopandas/geojson_dataset.py | 9 ++- .../holoviews/holoviews_writer.py | 3 +- .../kedro_datasets/json/json_dataset.py | 5 +- .../matplotlib/matplotlib_writer.py | 24 ++++--- .../kedro_datasets/networkx/gml_dataset.py | 3 +- .../networkx/graphml_dataset.py | 3 +- .../kedro_datasets/networkx/json_dataset.py | 3 +- .../kedro_datasets/pandas/csv_dataset.py | 6 +- .../pandas/deltatable_dataset.py | 7 ++- .../kedro_datasets/pandas/excel_dataset.py | 14 ++--- .../kedro_datasets/pandas/feather_dataset.py | 6 +- .../kedro_datasets/pandas/gbq_dataset.py | 15 +++-- .../kedro_datasets/pandas/generic_dataset.py | 8 +-- .../kedro_datasets/pandas/hdf_dataset.py | 8 +-- .../kedro_datasets/pandas/json_dataset.py | 6 +- .../kedro_datasets/pandas/parquet_dataset.py | 6 +- .../kedro_datasets/pandas/sql_dataset.py | 63 ++++++++++--------- .../kedro_datasets/pandas/xml_dataset.py | 6 +- .../kedro_datasets/pickle/pickle_dataset.py | 16 ++--- .../kedro_datasets/pillow/image_dataset.py | 3 +- .../kedro_datasets/plotly/json_dataset.py | 3 +- .../kedro_datasets/plotly/plotly_dataset.py | 15 ++--- .../kedro_datasets/polars/csv_dataset.py | 8 +-- .../kedro_datasets/polars/generic_dataset.py | 8 +-- .../kedro_datasets/redis/redis_dataset.py | 6 +- kedro-datasets/kedro_datasets/spark/README.md | 1 + .../spark/deltatable_dataset.py | 13 ++-- .../kedro_datasets/spark/spark_dataset.py | 16 ++--- .../spark/spark_hive_dataset.py | 18 +++--- .../spark/spark_jdbc_dataset.py | 25 ++++---- .../svmlight/svmlight_dataset.py | 3 +- .../tensorflow/tensorflow_model_dataset.py | 3 +- .../kedro_datasets/text/text_dataset.py | 3 +- .../kedro_datasets/tracking/json_dataset.py | 5 +- .../tracking/metrics_dataset.py | 5 +- 
.../kedro_datasets/video/video_dataset.py | 15 +++-- .../kedro_datasets/yaml/yaml_dataset.py | 5 +- 43 files changed, 236 insertions(+), 193 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index e8804f2cb..38f6e6bfe 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -15,6 +15,15 @@ repos: - id: check-merge-conflict # Check for files that contain merge conflict strings. - id: debug-statements # Check for debugger imports and py37+ `breakpoint()` calls in python source. + - repo: https://github.com/adamchainz/blacken-docs + rev: 1.16.0 + hooks: + - id: blacken-docs + args: + - "--rst-literal-blocks" + additional_dependencies: + - black==22.12.0 + - repo: local hooks: - id: ruff-kedro-datasets diff --git a/kedro-datasets/kedro_datasets/api/api_dataset.py b/kedro-datasets/kedro_datasets/api/api_dataset.py index b40ab1640..438f3b976 100644 --- a/kedro-datasets/kedro_datasets/api/api_dataset.py +++ b/kedro-datasets/kedro_datasets/api/api_dataset.py @@ -37,7 +37,8 @@ class APIDataset(AbstractDataset[None, requests.Response]): Example usage for the `Python API `_: - :: + + .. code-block:: pycon >>> from kedro_datasets.api import APIDataset >>> @@ -51,23 +52,22 @@ class APIDataset(AbstractDataset[None, requests.Response]): ... "commodity_desc": "CORN", ... "statisticcat_des": "YIELD", ... "agg_level_desc": "STATE", - ... "year": 2000 + ... "year": 2000, ... } ... }, - ... credentials=("username", "password") + ... credentials=("username", "password"), ... ) >>> data = dataset.load() ``APIDataset`` can also be used to save output on a remote server using HTTP(S) methods. - :: + + .. code-block:: pycon >>> example_table = '{"col1":["val1", "val2"], "col2":["val3", "val4"]}' >>> >>> dataset = APIDataset( - ... method = "POST", - ... url = "url_of_remote_server", - ... save_args = {"chunk_size":1} + ... method="POST", url="url_of_remote_server", save_args={"chunk_size": 1} ... ) >>> dataset.save(example_table) diff --git a/kedro-datasets/kedro_datasets/biosequence/biosequence_dataset.py b/kedro-datasets/kedro_datasets/biosequence/biosequence_dataset.py index ebd0722f5..89ea37fce 100644 --- a/kedro-datasets/kedro_datasets/biosequence/biosequence_dataset.py +++ b/kedro-datasets/kedro_datasets/biosequence/biosequence_dataset.py @@ -18,7 +18,8 @@ class BioSequenceDataset(AbstractDataset[List, List]): r"""``BioSequenceDataset`` loads and saves data to a sequence file. Example: - :: + + .. code-block:: pycon >>> from kedro_datasets.biosequence import BioSequenceDataset >>> from io import StringIO @@ -28,10 +29,13 @@ class BioSequenceDataset(AbstractDataset[List, List]): >>> raw_data = [] >>> for record in SeqIO.parse(StringIO(data), "fasta"): ... raw_data.append(record) + ... >>> - >>> dataset = BioSequenceDataset(filepath="ls_orchid.fasta", - ... load_args={"format": "fasta"}, - ... save_args={"format": "fasta"}) + >>> dataset = BioSequenceDataset( + ... filepath="ls_orchid.fasta", + ... load_args={"format": "fasta"}, + ... save_args={"format": "fasta"}, + ... ) >>> dataset.save(raw_data) >>> sequence_list = dataset.load() >>> diff --git a/kedro-datasets/kedro_datasets/dask/parquet_dataset.py b/kedro-datasets/kedro_datasets/dask/parquet_dataset.py index 5ec39fed5..3d6626d3d 100644 --- a/kedro-datasets/kedro_datasets/dask/parquet_dataset.py +++ b/kedro-datasets/kedro_datasets/dask/parquet_dataset.py @@ -37,25 +37,25 @@ class ParquetDataset(AbstractDataset[dd.DataFrame, dd.DataFrame]): Example usage for the `Python API `_: - :: + + .. 
code-block:: pycon >>> from kedro.extras.datasets.dask import ParquetDataset >>> import pandas as pd >>> import dask.dataframe as dd >>> - >>> data = pd.DataFrame({'col1': [1, 2], 'col2': [4, 5], - ... 'col3': [[5, 6], [7, 8]]}) + >>> data = pd.DataFrame({"col1": [1, 2], "col2": [4, 5], "col3": [[5, 6], [7, 8]]}) >>> ddf = dd.from_pandas(data, npartitions=2) >>> >>> dataset = ParquetDataset( ... filepath="s3://bucket_name/path/to/folder", ... credentials={ - ... 'client_kwargs':{ - ... 'aws_access_key_id': 'YOUR_KEY', - ... 'aws_secret_access_key': 'YOUR SECRET', + ... "client_kwargs": { + ... "aws_access_key_id": "YOUR_KEY", + ... "aws_secret_access_key": "YOUR SECRET", ... } ... }, - ... save_args={"compression": "GZIP"} + ... save_args={"compression": "GZIP"}, ... ) >>> dataset.save(ddf) >>> reloaded = dataset.load() diff --git a/kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py b/kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py index 33c7ef1d1..dd119559d 100644 --- a/kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py +++ b/kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py @@ -176,12 +176,13 @@ class ManagedTableDataset(AbstractVersionedDataset): .. code-block:: python from pyspark.sql import SparkSession - from pyspark.sql.types import (StructField, StringType, - IntegerType, StructType) + from pyspark.sql.types import StructField, StringType, IntegerType, StructType from kedro_datasets.databricks import ManagedTableDataset - schema = StructType([StructField("name", StringType(), True), - StructField("age", IntegerType(), True)]) - data = [('Alex', 31), ('Bob', 12), ('Clarke', 65), ('Dave', 29)] + + schema = StructType( + [StructField("name", StringType(), True), StructField("age", IntegerType(), True)] + ) + data = [("Alex", 31), ("Bob", 12), ("Clarke", 65), ("Dave", 29)] spark_df = SparkSession.builder.getOrCreate().createDataFrame(data, schema) dataset = ManagedTableDataset(table="names_and_ages") dataset.save(spark_df) diff --git a/kedro-datasets/kedro_datasets/email/message_dataset.py b/kedro-datasets/kedro_datasets/email/message_dataset.py index 1f45042fd..076bfd492 100644 --- a/kedro-datasets/kedro_datasets/email/message_dataset.py +++ b/kedro-datasets/kedro_datasets/email/message_dataset.py @@ -26,7 +26,8 @@ class EmailMessageDataset(AbstractVersionedDataset[Message, Message]): Note that ``EmailMessageDataset`` doesn't handle sending email messages. Example: - :: + + .. code-block:: pycon >>> from email.message import EmailMessage >>> diff --git a/kedro-datasets/kedro_datasets/geopandas/geojson_dataset.py b/kedro-datasets/kedro_datasets/geopandas/geojson_dataset.py index 56a8890a7..ab1e0e620 100644 --- a/kedro-datasets/kedro_datasets/geopandas/geojson_dataset.py +++ b/kedro-datasets/kedro_datasets/geopandas/geojson_dataset.py @@ -26,14 +26,17 @@ class GeoJSONDataset( allowed geopandas (pandas) options for loading and saving GeoJSON files. Example: - :: + + .. code-block:: pycon >>> import geopandas as gpd >>> from shapely.geometry import Point >>> from kedro_datasets.geopandas import GeoJSONDataset >>> - >>> data = gpd.GeoDataFrame({'col1': [1, 2], 'col2': [4, 5], - ... 'col3': [5, 6]}, geometry=[Point(1,1), Point(2,4)]) + >>> data = gpd.GeoDataFrame( + ... {"col1": [1, 2], "col2": [4, 5], "col3": [5, 6]}, + ... geometry=[Point(1, 1), Point(2, 4)], + ... 
) >>> dataset = GeoJSONDataset(filepath="test.geojson", save_args=None) >>> dataset.save(data) >>> reloaded = dataset.load() diff --git a/kedro-datasets/kedro_datasets/holoviews/holoviews_writer.py b/kedro-datasets/kedro_datasets/holoviews/holoviews_writer.py index 7d64b8bf6..18e817c9b 100644 --- a/kedro-datasets/kedro_datasets/holoviews/holoviews_writer.py +++ b/kedro-datasets/kedro_datasets/holoviews/holoviews_writer.py @@ -21,7 +21,8 @@ class HoloviewsWriter(AbstractVersionedDataset[HoloViews, NoReturn]): filesystem (e.g. local, S3, GCS). Example: - :: + + .. code-block:: pycon >>> import holoviews as hv >>> from kedro_datasets.holoviews import HoloviewsWriter diff --git a/kedro-datasets/kedro_datasets/json/json_dataset.py b/kedro-datasets/kedro_datasets/json/json_dataset.py index 341e13933..418355ea9 100644 --- a/kedro-datasets/kedro_datasets/json/json_dataset.py +++ b/kedro-datasets/kedro_datasets/json/json_dataset.py @@ -34,11 +34,12 @@ class JSONDataset(AbstractVersionedDataset[Any, Any]): Example usage for the `Python API `_: - :: + + .. code-block:: pycon >>> from kedro_datasets.json import JSONDataset >>> - >>> data = {'col1': [1, 2], 'col2': [4, 5], 'col3': [5, 6]} + >>> data = {"col1": [1, 2], "col2": [4, 5], "col3": [5, 6]} >>> >>> dataset = JSONDataset(filepath="test.json") >>> dataset.save(data) diff --git a/kedro-datasets/kedro_datasets/matplotlib/matplotlib_writer.py b/kedro-datasets/kedro_datasets/matplotlib/matplotlib_writer.py index 568928caf..bea1cde1c 100644 --- a/kedro-datasets/kedro_datasets/matplotlib/matplotlib_writer.py +++ b/kedro-datasets/kedro_datasets/matplotlib/matplotlib_writer.py @@ -37,21 +37,21 @@ class MatplotlibWriter( Example usage for the `Python API `_: - :: + + .. code-block:: pycon >>> import matplotlib.pyplot as plt >>> from kedro_datasets.matplotlib import MatplotlibWriter >>> >>> fig = plt.figure() >>> plt.plot([1, 2, 3]) - >>> plot_writer = MatplotlibWriter( - ... filepath="data/08_reporting/output_plot.png" - ... ) + >>> plot_writer = MatplotlibWriter(filepath="data/08_reporting/output_plot.png") >>> plt.close() >>> plot_writer.save(fig) Example saving a plot as a PDF file: - :: + + .. code-block:: pycon >>> import matplotlib.pyplot as plt >>> from kedro_datasets.matplotlib import MatplotlibWriter @@ -66,7 +66,8 @@ class MatplotlibWriter( >>> pdf_plot_writer.save(fig) Example saving multiple plots in a folder, using a dictionary: - :: + + .. code-block:: pycon >>> import matplotlib.pyplot as plt >>> from kedro_datasets.matplotlib import MatplotlibWriter @@ -77,13 +78,12 @@ class MatplotlibWriter( ... plt.plot([1, 2, 3], color=colour) ... >>> plt.close("all") - >>> dict_plot_writer = MatplotlibWriter( - ... filepath="data/08_reporting/plots" - ... ) + >>> dict_plot_writer = MatplotlibWriter(filepath="data/08_reporting/plots") >>> dict_plot_writer.save(plots_dict) Example saving multiple plots in a folder, using a list: - :: + + .. code-block:: pycon >>> import matplotlib.pyplot as plt >>> from kedro_datasets.matplotlib import MatplotlibWriter @@ -94,9 +94,7 @@ class MatplotlibWriter( ... plt.plot([i, i + 1, i + 2]) ... >>> plt.close("all") - >>> list_plot_writer = MatplotlibWriter( - ... filepath="data/08_reporting/plots" - ... 
) + >>> list_plot_writer = MatplotlibWriter(filepath="data/08_reporting/plots") >>> list_plot_writer.save(plots_list) """ diff --git a/kedro-datasets/kedro_datasets/networkx/gml_dataset.py b/kedro-datasets/kedro_datasets/networkx/gml_dataset.py index f4d63e87e..1fd26a7d3 100644 --- a/kedro-datasets/kedro_datasets/networkx/gml_dataset.py +++ b/kedro-datasets/kedro_datasets/networkx/gml_dataset.py @@ -22,7 +22,8 @@ class GMLDataset(AbstractVersionedDataset[networkx.Graph, networkx.Graph]): See https://networkx.org/documentation/stable/tutorial.html for details. Example: - :: + + .. code-block:: pycon >>> from kedro_datasets.networkx import GMLDataset >>> import networkx as nx diff --git a/kedro-datasets/kedro_datasets/networkx/graphml_dataset.py b/kedro-datasets/kedro_datasets/networkx/graphml_dataset.py index 0a368f505..a797b948d 100644 --- a/kedro-datasets/kedro_datasets/networkx/graphml_dataset.py +++ b/kedro-datasets/kedro_datasets/networkx/graphml_dataset.py @@ -21,7 +21,8 @@ class GraphMLDataset(AbstractVersionedDataset[networkx.Graph, networkx.Graph]): See https://networkx.org/documentation/stable/tutorial.html for details. Example: - :: + + .. code-block:: pycon >>> from kedro_datasets.networkx import GraphMLDataset >>> import networkx as nx diff --git a/kedro-datasets/kedro_datasets/networkx/json_dataset.py b/kedro-datasets/kedro_datasets/networkx/json_dataset.py index 4a41f9a67..55301faca 100644 --- a/kedro-datasets/kedro_datasets/networkx/json_dataset.py +++ b/kedro-datasets/kedro_datasets/networkx/json_dataset.py @@ -22,7 +22,8 @@ class JSONDataset(AbstractVersionedDataset[networkx.Graph, networkx.Graph]): See https://networkx.org/documentation/stable/tutorial.html for details. Example: - :: + + .. code-block:: pycon >>> from kedro_datasets.networkx import JSONDataset >>> import networkx as nx diff --git a/kedro-datasets/kedro_datasets/pandas/csv_dataset.py b/kedro-datasets/kedro_datasets/pandas/csv_dataset.py index 543035238..557beaf4f 100644 --- a/kedro-datasets/kedro_datasets/pandas/csv_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/csv_dataset.py @@ -52,13 +52,13 @@ class CSVDataset(AbstractVersionedDataset[pd.DataFrame, pd.DataFrame]): Example usage for the `Python API `_: - :: + + .. code-block:: pycon >>> from kedro_datasets.pandas import CSVDataset >>> import pandas as pd >>> - >>> data = pd.DataFrame({'col1': [1, 2], 'col2': [4, 5], - ... 'col3': [5, 6]}) + >>> data = pd.DataFrame({"col1": [1, 2], "col2": [4, 5], "col3": [5, 6]}) >>> >>> dataset = CSVDataset(filepath="test.csv") >>> dataset.save(data) diff --git a/kedro-datasets/kedro_datasets/pandas/deltatable_dataset.py b/kedro-datasets/kedro_datasets/pandas/deltatable_dataset.py index 4581312c5..c009fe92d 100644 --- a/kedro-datasets/kedro_datasets/pandas/deltatable_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/deltatable_dataset.py @@ -61,19 +61,20 @@ class DeltaTableDataset(AbstractDataset): Example usage for the `Python API `_: - :: + + .. 
code-block:: pycon >>> from kedro_datasets.pandas import DeltaTableDataset >>> import pandas as pd >>> - >>> data = pd.DataFrame({'col1': [1, 2], 'col2': [4, 5], 'col3': [5, 6]}) + >>> data = pd.DataFrame({"col1": [1, 2], "col2": [4, 5], "col3": [5, 6]}) >>> dataset = DeltaTableDataset(filepath="test") >>> >>> dataset.save(data) >>> reloaded = dataset.load() >>> assert data.equals(reloaded) >>> - >>> new_data = pd.DataFrame({'col1': [7, 8], 'col2': [9, 10], 'col3': [11, 12]}) + >>> new_data = pd.DataFrame({"col1": [7, 8], "col2": [9, 10], "col3": [11, 12]}) >>> dataset.save(new_data) >>> dataset.get_loaded_version() diff --git a/kedro-datasets/kedro_datasets/pandas/excel_dataset.py b/kedro-datasets/kedro_datasets/pandas/excel_dataset.py index 6f4b0ff27..048130464 100644 --- a/kedro-datasets/kedro_datasets/pandas/excel_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/excel_dataset.py @@ -56,13 +56,13 @@ class ExcelDataset( Example usage for the `Python API `_: - :: + + .. code-block:: pycon >>> from kedro_datasets.pandas import ExcelDataset >>> import pandas as pd >>> - >>> data = pd.DataFrame({'col1': [1, 2], 'col2': [4, 5], - ... 'col3': [5, 6]}) + >>> data = pd.DataFrame({"col1": [1, 2], "col2": [4, 5], "col3": [5, 6]}) >>> >>> dataset = ExcelDataset(filepath="test.xlsx") >>> dataset.save(data) @@ -90,16 +90,16 @@ class ExcelDataset( `Python API `_ for a multi-sheet Excel file: - :: + + .. code-block:: pycon >>> from kedro_datasets.pandas import ExcelDataset >>> import pandas as pd >>> - >>> dataframe = pd.DataFrame({'col1': [1, 2], 'col2': [4, 5], - ... 'col3': [5, 6]}) + >>> dataframe = pd.DataFrame({"col1": [1, 2], "col2": [4, 5], "col3": [5, 6]}) >>> another_dataframe = pd.DataFrame({"x": [10, 20], "y": ["hello", "world"]}) >>> multiframe = {"Sheet1": dataframe, "Sheet2": another_dataframe} - >>> dataset = ExcelDataset(filepath="test.xlsx", load_args = {"sheet_name": None}) + >>> dataset = ExcelDataset(filepath="test.xlsx", load_args={"sheet_name": None}) >>> dataset.save(multiframe) >>> reloaded = dataset.load() >>> assert multiframe["Sheet1"].equals(reloaded["Sheet1"]) diff --git a/kedro-datasets/kedro_datasets/pandas/feather_dataset.py b/kedro-datasets/kedro_datasets/pandas/feather_dataset.py index 41995dda4..cfca7ce59 100644 --- a/kedro-datasets/kedro_datasets/pandas/feather_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/feather_dataset.py @@ -51,13 +51,13 @@ class FeatherDataset(AbstractVersionedDataset[pd.DataFrame, pd.DataFrame]): Example usage for the `Python API `_: - :: + + .. code-block:: pycon >>> from kedro_datasets.pandas import FeatherDataset >>> import pandas as pd >>> - >>> data = pd.DataFrame({'col1': [1, 2], 'col2': [4, 5], - ... 'col3': [5, 6]}) + >>> data = pd.DataFrame({"col1": [1, 2], "col2": [4, 5], "col3": [5, 6]}) >>> >>> dataset = FeatherDataset(filepath="test.feather") >>> diff --git a/kedro-datasets/kedro_datasets/pandas/gbq_dataset.py b/kedro-datasets/kedro_datasets/pandas/gbq_dataset.py index d672cae0c..11ace04ee 100644 --- a/kedro-datasets/kedro_datasets/pandas/gbq_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/gbq_dataset.py @@ -45,17 +45,15 @@ class GBQTableDataset(AbstractDataset[None, pd.DataFrame]): Example usage for the `Python API `_: - :: + + .. 
code-block:: pycon >>> from kedro_datasets.pandas import GBQTableDataset >>> import pandas as pd >>> - >>> data = pd.DataFrame({'col1': [1, 2], 'col2': [4, 5], - >>> 'col3': [5, 6]}) + >>> data = pd.DataFrame({"col1": [1, 2], "col2": [4, 5], "col3": [5, 6]}) >>> - >>> dataset = GBQTableDataset('dataset', - >>> 'table_name', - >>> project='my-project') + >>> dataset = GBQTableDataset("dataset", "table_name", project="my-project") >>> dataset.save(data) >>> reloaded = dataset.load() >>> @@ -196,13 +194,14 @@ class GBQQueryDataset(AbstractDataset[None, pd.DataFrame]): Example using Python API: - :: + + .. code-block:: pycon >>> from kedro_datasets.pandas import GBQQueryDataset >>> >>> sql = "SELECT * FROM dataset_1.table_a" >>> - >>> dataset = GBQQueryDataset(sql, project='my-project') + >>> dataset = GBQQueryDataset(sql, project="my-project") >>> >>> sql_data = dataset.load() >>> diff --git a/kedro-datasets/kedro_datasets/pandas/generic_dataset.py b/kedro-datasets/kedro_datasets/pandas/generic_dataset.py index 987d79be7..f8e813a74 100644 --- a/kedro-datasets/kedro_datasets/pandas/generic_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/generic_dataset.py @@ -64,15 +64,15 @@ class GenericDataset(AbstractVersionedDataset[pd.DataFrame, pd.DataFrame]): Example usage for the `Python API `_: - :: + + .. code-block:: pycon >>> from kedro_datasets.pandas import GenericDataset >>> import pandas as pd >>> - >>> data = pd.DataFrame({'col1': [1, 2], 'col2': [4, 5], - ... 'col3': [5, 6]}) + >>> data = pd.DataFrame({"col1": [1, 2], "col2": [4, 5], "col3": [5, 6]}) >>> - >>> dataset = GenericDataset(filepath="test.csv", file_format='csv') + >>> dataset = GenericDataset(filepath="test.csv", file_format="csv") >>> dataset.save(data) >>> reloaded = dataset.load() >>> assert data.equals(reloaded) diff --git a/kedro-datasets/kedro_datasets/pandas/hdf_dataset.py b/kedro-datasets/kedro_datasets/pandas/hdf_dataset.py index 73870e56a..5d9a6bc16 100644 --- a/kedro-datasets/kedro_datasets/pandas/hdf_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/hdf_dataset.py @@ -36,15 +36,15 @@ class HDFDataset(AbstractVersionedDataset[pd.DataFrame, pd.DataFrame]): Example usage for the `Python API `_: - :: + + .. code-block:: pycon >>> from kedro_datasets.pandas import HDFDataset >>> import pandas as pd >>> - >>> data = pd.DataFrame({'col1': [1, 2], 'col2': [4, 5], - ... 'col3': [5, 6]}) + >>> data = pd.DataFrame({"col1": [1, 2], "col2": [4, 5], "col3": [5, 6]}) >>> - >>> dataset = HDFDataset(filepath="test.h5", key='data') + >>> dataset = HDFDataset(filepath="test.h5", key="data") >>> dataset.save(data) >>> reloaded = dataset.load() >>> assert data.equals(reloaded) diff --git a/kedro-datasets/kedro_datasets/pandas/json_dataset.py b/kedro-datasets/kedro_datasets/pandas/json_dataset.py index f480f0754..cfc53d627 100644 --- a/kedro-datasets/kedro_datasets/pandas/json_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/json_dataset.py @@ -47,13 +47,13 @@ class JSONDataset(AbstractVersionedDataset[pd.DataFrame, pd.DataFrame]): Example usage for the `Python API `_: - :: + + .. code-block:: pycon >>> from kedro_datasets.pandas import JSONDataset >>> import pandas as pd >>> - >>> data = pd.DataFrame({'col1': [1, 2], 'col2': [4, 5], - ... 
'col3': [5, 6]}) + >>> data = pd.DataFrame({"col1": [1, 2], "col2": [4, 5], "col3": [5, 6]}) >>> >>> dataset = JSONDataset(filepath="test.json") >>> dataset.save(data) diff --git a/kedro-datasets/kedro_datasets/pandas/parquet_dataset.py b/kedro-datasets/kedro_datasets/pandas/parquet_dataset.py index b132d69b3..6cd862379 100644 --- a/kedro-datasets/kedro_datasets/pandas/parquet_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/parquet_dataset.py @@ -58,13 +58,13 @@ class ParquetDataset(AbstractVersionedDataset[pd.DataFrame, pd.DataFrame]): Example usage for the `Python API `_: - :: + + .. code-block:: pycon >>> from kedro_datasets.pandas import ParquetDataset >>> import pandas as pd >>> - >>> data = pd.DataFrame({'col1': [1, 2], 'col2': [4, 5], - ... 'col3': [5, 6]}) + >>> data = pd.DataFrame({"col1": [1, 2], "col2": [4, 5], "col3": [5, 6]}) >>> >>> dataset = ParquetDataset(filepath="test.parquet") >>> dataset.save(data) diff --git a/kedro-datasets/kedro_datasets/pandas/sql_dataset.py b/kedro-datasets/kedro_datasets/pandas/sql_dataset.py index beb25fb3f..5bad6e98b 100644 --- a/kedro-datasets/kedro_datasets/pandas/sql_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/sql_dataset.py @@ -127,19 +127,16 @@ class SQLTableDataset(AbstractDataset[pd.DataFrame, pd.DataFrame]): Example usage for the `Python API `_: - :: + + .. code-block:: pycon >>> from kedro_datasets.pandas import SQLTableDataset >>> import pandas as pd >>> - >>> data = pd.DataFrame({"col1": [1, 2], "col2": [4, 5], - ... "col3": [5, 6]}) + >>> data = pd.DataFrame({"col1": [1, 2], "col2": [4, 5], "col3": [5, 6]}) >>> table_name = "table_a" - >>> credentials = { - ... "con": "postgresql://scott:tiger@localhost/test" - ... } - >>> data_set = SQLTableDataset(table_name=table_name, - ... credentials=credentials) + >>> credentials = {"con": "postgresql://scott:tiger@localhost/test"} + >>> data_set = SQLTableDataset(table_name=table_name, credentials=credentials) >>> >>> data_set.save(data) >>> reloaded = data_set.load() @@ -311,44 +308,48 @@ class SQLQueryDataset(AbstractDataset[None, pd.DataFrame]): Example usage for the `Python API `_: - :: + + .. code-block:: pycon >>> from kedro_datasets.pandas import SQLQueryDataset >>> import pandas as pd >>> - >>> data = pd.DataFrame({"col1": [1, 2], "col2": [4, 5], - ... "col3": [5, 6]}) + >>> data = pd.DataFrame({"col1": [1, 2], "col2": [4, 5], "col3": [5, 6]}) >>> sql = "SELECT * FROM table_a" - >>> credentials = { - ... "con": "postgresql://scott:tiger@localhost/test" - ... } - >>> data_set = SQLQueryDataset(sql=sql, - ... credentials=credentials) + >>> credentials = {"con": "postgresql://scott:tiger@localhost/test"} + >>> data_set = SQLQueryDataset(sql=sql, credentials=credentials) >>> >>> sql_data = data_set.load() Example of usage for mssql: - :: + + .. code-block:: pycon - >>> credentials = {"server": "localhost", "port": "1433", - ... "database": "TestDB", "user": "SA", - ... "password": "StrongPassword"} + >>> credentials = { + ... "server": "localhost", + ... "port": "1433", + ... "database": "TestDB", + ... "user": "SA", + ... "password": "StrongPassword", + ... } >>> def _make_mssql_connection_str( - ... server: str, port: str, database: str, user: str, password: str + ... server: str, port: str, database: str, user: str, password: str ... ) -> str: - ... import pyodbc # noqa - ... from sqlalchemy.engine import URL # noqa - ... - ... driver = pyodbc.drivers()[-1] - ... connection_str = (f"DRIVER={driver};SERVER={server},{port};DATABASE={database};" - ... 
f"ENCRYPT=yes;UID={user};PWD={password};" - ... f"TrustServerCertificate=yes;") - ... return URL.create("mssql+pyodbc", query={"odbc_connect": connection_str}) + ... import pyodbc # noqa + ... from sqlalchemy.engine import URL # noqa + ... driver = pyodbc.drivers()[-1] + ... connection_str = ( + ... f"DRIVER={driver};SERVER={server},{port};DATABASE={database};" + ... f"ENCRYPT=yes;UID={user};PWD={password};" + ... f"TrustServerCertificate=yes;" + ... ) + ... return URL.create("mssql+pyodbc", query={"odbc_connect": connection_str}) ... >>> connection_str = _make_mssql_connection_str(**credentials) - >>> data_set = SQLQueryDataset(credentials={"con": connection_str}, - ... sql="SELECT TOP 5 * FROM TestTable;") + >>> data_set = SQLQueryDataset( + ... credentials={"con": connection_str}, sql="SELECT TOP 5 * FROM TestTable;" + ... ) >>> df = data_set.load() In addition, here is an example of a catalog with dates parsing: diff --git a/kedro-datasets/kedro_datasets/pandas/xml_dataset.py b/kedro-datasets/kedro_datasets/pandas/xml_dataset.py index fa3fe1de4..70196bd63 100644 --- a/kedro-datasets/kedro_datasets/pandas/xml_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/xml_dataset.py @@ -30,13 +30,13 @@ class XMLDataset(AbstractVersionedDataset[pd.DataFrame, pd.DataFrame]): Example usage for the `Python API `_: - :: + + .. code-block:: pycon >>> from kedro_datasets.pandas import XMLDataset >>> import pandas as pd >>> - >>> data = pd.DataFrame({'col1': [1, 2], 'col2': [4, 5], - ... 'col3': [5, 6]}) + >>> data = pd.DataFrame({"col1": [1, 2], "col2": [4, 5], "col3": [5, 6]}) >>> >>> dataset = XMLDataset(filepath="test.xml") >>> dataset.save(data) diff --git a/kedro-datasets/kedro_datasets/pickle/pickle_dataset.py b/kedro-datasets/kedro_datasets/pickle/pickle_dataset.py index b28103e7e..21d3b8c71 100644 --- a/kedro-datasets/kedro_datasets/pickle/pickle_dataset.py +++ b/kedro-datasets/kedro_datasets/pickle/pickle_dataset.py @@ -44,23 +44,25 @@ class PickleDataset(AbstractVersionedDataset[Any, Any]): Example usage for the `Python API `_: - :: + + .. code-block:: pycon >>> from kedro_datasets.pickle import PickleDataset >>> import pandas as pd >>> - >>> data = pd.DataFrame({'col1': [1, 2], 'col2': [4, 5], - ... 'col3': [5, 6]}) + >>> data = pd.DataFrame({"col1": [1, 2], "col2": [4, 5], "col3": [5, 6]}) >>> >>> dataset = PickleDataset(filepath="test.pkl", backend="pickle") >>> dataset.save(data) >>> reloaded = dataset.load() >>> assert data.equals(reloaded) >>> - >>> dataset = PickleDataset(filepath="test.pickle.lz4", - ... backend="compress_pickle", - ... load_args={"compression":"lz4"}, - ... save_args={"compression":"lz4"}) + >>> dataset = PickleDataset( + ... filepath="test.pickle.lz4", + ... backend="compress_pickle", + ... load_args={"compression": "lz4"}, + ... save_args={"compression": "lz4"}, + ... ) >>> dataset.save(data) >>> reloaded = dataset.load() >>> assert data.equals(reloaded) diff --git a/kedro-datasets/kedro_datasets/pillow/image_dataset.py b/kedro-datasets/kedro_datasets/pillow/image_dataset.py index 161ff9dc5..c7f7fbeaa 100644 --- a/kedro-datasets/kedro_datasets/pillow/image_dataset.py +++ b/kedro-datasets/kedro_datasets/pillow/image_dataset.py @@ -21,7 +21,8 @@ class ImageDataset(AbstractVersionedDataset[Image.Image, Image.Image]): Example usage for the `Python API `_: - :: + + .. 
code-block:: pycon >>> from kedro_datasets.pillow import ImageDataset >>> diff --git a/kedro-datasets/kedro_datasets/plotly/json_dataset.py b/kedro-datasets/kedro_datasets/plotly/json_dataset.py index 68e5ad9a5..cb7f2d1e7 100644 --- a/kedro-datasets/kedro_datasets/plotly/json_dataset.py +++ b/kedro-datasets/kedro_datasets/plotly/json_dataset.py @@ -36,7 +36,8 @@ class JSONDataset( Example usage for the `Python API `_: - :: + + .. code-block:: pycon >>> from kedro_datasets.plotly import JSONDataset >>> import plotly.express as px diff --git a/kedro-datasets/kedro_datasets/plotly/plotly_dataset.py b/kedro-datasets/kedro_datasets/plotly/plotly_dataset.py index a30e62f0d..2983233fe 100644 --- a/kedro-datasets/kedro_datasets/plotly/plotly_dataset.py +++ b/kedro-datasets/kedro_datasets/plotly/plotly_dataset.py @@ -46,24 +46,25 @@ class PlotlyDataset(JSONDataset): Example usage for the `Python API `_: - :: + + .. code-block:: pycon >>> from kedro_datasets.plotly import PlotlyDataset >>> import plotly.express as px >>> import pandas as pd >>> - >>> df_data = pd.DataFrame([[0, 1], [1, 0]], columns=('x1', 'x2')) + >>> df_data = pd.DataFrame([[0, 1], [1, 0]], columns=("x1", "x2")) >>> >>> dataset = PlotlyDataset( - ... filepath='scatter_plot.json', + ... filepath="scatter_plot.json", ... plotly_args={ - ... 'type': 'scatter', - ... 'fig': {'x': 'x1', 'y': 'x2'}, - ... } + ... "type": "scatter", + ... "fig": {"x": "x1", "y": "x2"}, + ... }, ... ) >>> dataset.save(df_data) >>> reloaded = dataset.load() - >>> assert px.scatter(df_data, x='x1', y='x2') == reloaded + >>> assert px.scatter(df_data, x="x1", y="x2") == reloaded """ diff --git a/kedro-datasets/kedro_datasets/polars/csv_dataset.py b/kedro-datasets/kedro_datasets/polars/csv_dataset.py index e2638107f..8a33e09a2 100644 --- a/kedro-datasets/kedro_datasets/polars/csv_dataset.py +++ b/kedro-datasets/kedro_datasets/polars/csv_dataset.py @@ -50,15 +50,15 @@ class CSVDataset(AbstractVersionedDataset[pl.DataFrame, pl.DataFrame]): Example usage for the `Python API `_: - :: + + .. code-block:: pycon >>> from kedro_datasets.polars import CSVDataset >>> import polars as pl >>> - >>> data = pl.DataFrame({'col1': [1, 2], 'col2': [4, 5], - ... 'col3': [5, 6]}) + >>> data = pl.DataFrame({"col1": [1, 2], "col2": [4, 5], "col3": [5, 6]}) >>> - >>> dataset = CSVDataset(filepath='test.csv') + >>> dataset = CSVDataset(filepath="test.csv") >>> dataset.save(data) >>> reloaded = dataset.load() >>> assert data.frame_equal(reloaded) diff --git a/kedro-datasets/kedro_datasets/polars/generic_dataset.py b/kedro-datasets/kedro_datasets/polars/generic_dataset.py index 5deceff44..aa6eedd48 100644 --- a/kedro-datasets/kedro_datasets/polars/generic_dataset.py +++ b/kedro-datasets/kedro_datasets/polars/generic_dataset.py @@ -36,15 +36,15 @@ class GenericDataset(AbstractVersionedDataset[pl.DataFrame, pl.DataFrame]): compression: "snappy" Example using Python API: - :: + + .. code-block:: pycon >>> from kedro_datasets.polars import GenericDataset >>> import polars as pl >>> - >>> data = pl.DataFrame({'col1': [1, 2], 'col2': [4, 5], - ... 
'col3': [5, 6]}) + >>> data = pl.DataFrame({"col1": [1, 2], "col2": [4, 5], "col3": [5, 6]}) >>> - >>> dataset = GenericDataset(filepath='test.parquet', file_format='parquet') + >>> dataset = GenericDataset(filepath="test.parquet", file_format="parquet") >>> dataset.save(data) >>> reloaded = dataset.load() >>> assert data.frame_equal(reloaded) diff --git a/kedro-datasets/kedro_datasets/redis/redis_dataset.py b/kedro-datasets/kedro_datasets/redis/redis_dataset.py index 770ee98af..9979cf386 100644 --- a/kedro-datasets/kedro_datasets/redis/redis_dataset.py +++ b/kedro-datasets/kedro_datasets/redis/redis_dataset.py @@ -43,13 +43,13 @@ class PickleDataset(AbstractDataset[Any, Any]): Example usage for the `Python API `_: - :: + + .. code-block:: pycon >>> from kedro_datasets.redis import PickleDataset >>> import pandas as pd >>> - >>> data = pd.DataFrame({'col1': [1, 2], 'col2': [4, 5], - ... 'col3': [5, 6]}) + >>> data = pd.DataFrame({"col1": [1, 2], "col2": [4, 5], "col3": [5, 6]}) >>> >>> my_data = PickleDataset(key="my_data") >>> my_data.save(data) diff --git a/kedro-datasets/kedro_datasets/spark/README.md b/kedro-datasets/kedro_datasets/spark/README.md index 7400c3c47..a0bcef8e1 100644 --- a/kedro-datasets/kedro_datasets/spark/README.md +++ b/kedro-datasets/kedro_datasets/spark/README.md @@ -23,6 +23,7 @@ Supported file formats are: from kedro.framework.hooks import hook_impl from pyspark.sql import SparkSession + class SparkStreamsHook: @hook_impl def after_pipeline_run(self) -> None: diff --git a/kedro-datasets/kedro_datasets/spark/deltatable_dataset.py b/kedro-datasets/kedro_datasets/spark/deltatable_dataset.py index f1b6a74b5..e5e40a9fe 100644 --- a/kedro-datasets/kedro_datasets/spark/deltatable_dataset.py +++ b/kedro-datasets/kedro_datasets/spark/deltatable_dataset.py @@ -35,18 +35,19 @@ class DeltaTableDataset(AbstractDataset[None, DeltaTable]): Example usage for the `Python API `_: - :: + + .. code-block:: pycon >>> from pyspark.sql import SparkSession - >>> from pyspark.sql.types import (StructField, StringType, - ... IntegerType, StructType) + >>> from pyspark.sql.types import StructField, StringType, IntegerType, StructType >>> >>> from kedro.extras.datasets.spark import DeltaTableDataset, SparkDataset >>> - >>> schema = StructType([StructField("name", StringType(), True), - ... StructField("age", IntegerType(), True)]) + >>> schema = StructType( + ... [StructField("name", StringType(), True), StructField("age", IntegerType(), True)] + ... ) >>> - >>> data = [('Alex', 31), ('Bob', 12), ('Clarke', 65), ('Dave', 29)] + >>> data = [("Alex", 31), ("Bob", 12), ("Clarke", 65), ("Dave", 29)] >>> >>> spark_df = SparkSession.builder.getOrCreate().createDataFrame(data, schema) >>> diff --git a/kedro-datasets/kedro_datasets/spark/spark_dataset.py b/kedro-datasets/kedro_datasets/spark/spark_dataset.py index 221e4e562..d83e3227a 100644 --- a/kedro-datasets/kedro_datasets/spark/spark_dataset.py +++ b/kedro-datasets/kedro_datasets/spark/spark_dataset.py @@ -203,21 +203,21 @@ class SparkDataset(AbstractVersionedDataset[DataFrame, DataFrame]): Example usage for the `Python API `_: - :: + + .. code-block:: pycon >>> from pyspark.sql import SparkSession - >>> from pyspark.sql.types import (StructField, StringType, - ... IntegerType, StructType) + >>> from pyspark.sql.types import StructField, StringType, IntegerType, StructType >>> >>> from kedro_datasets.spark import SparkDataset >>> - >>> schema = StructType([StructField("name", StringType(), True), - ... 
StructField("age", IntegerType(), True)]) + >>> schema = StructType( + ... [StructField("name", StringType(), True), StructField("age", IntegerType(), True)] + ... ) >>> - >>> data = [('Alex', 31), ('Bob', 12), ('Clarke', 65), ('Dave', 29)] + >>> data = [("Alex", 31), ("Bob", 12), ("Clarke", 65), ("Dave", 29)] >>> - >>> spark_df = SparkSession.builder.getOrCreate()\ - ... .createDataFrame(data, schema) + >>> spark_df = SparkSession.builder.getOrCreate().createDataFrame(data, schema) >>> >>> dataset = SparkDataset(filepath="test_data") >>> dataset.save(spark_df) diff --git a/kedro-datasets/kedro_datasets/spark/spark_hive_dataset.py b/kedro-datasets/kedro_datasets/spark/spark_hive_dataset.py index 33cc31f02..b7bd3363c 100644 --- a/kedro-datasets/kedro_datasets/spark/spark_hive_dataset.py +++ b/kedro-datasets/kedro_datasets/spark/spark_hive_dataset.py @@ -42,23 +42,25 @@ class SparkHiveDataset(AbstractDataset[DataFrame, DataFrame]): Example usage for the `Python API `_: - :: + + .. code-block:: pycon >>> from pyspark.sql import SparkSession - >>> from pyspark.sql.types import (StructField, StringType, - ... IntegerType, StructType) + >>> from pyspark.sql.types import StructField, StringType, IntegerType, StructType >>> >>> from kedro_datasets.spark import SparkHiveDataset >>> - >>> schema = StructType([StructField("name", StringType(), True), - ... StructField("age", IntegerType(), True)]) + >>> schema = StructType( + ... [StructField("name", StringType(), True), StructField("age", IntegerType(), True)] + ... ) >>> - >>> data = [('Alex', 31), ('Bob', 12), ('Clarke', 65), ('Dave', 29)] + >>> data = [("Alex", 31), ("Bob", 12), ("Clarke", 65), ("Dave", 29)] >>> >>> spark_df = SparkSession.builder.getOrCreate().createDataFrame(data, schema) >>> - >>> dataset = SparkHiveDataset(database="test_database", table="test_table", - ... write_mode="overwrite") + >>> dataset = SparkHiveDataset( + ... database="test_database", table="test_table", write_mode="overwrite" + ... ) >>> dataset.save(spark_df) >>> reloaded = dataset.load() >>> diff --git a/kedro-datasets/kedro_datasets/spark/spark_jdbc_dataset.py b/kedro-datasets/kedro_datasets/spark/spark_jdbc_dataset.py index 18af44546..029cf15b5 100644 --- a/kedro-datasets/kedro_datasets/spark/spark_jdbc_dataset.py +++ b/kedro-datasets/kedro_datasets/spark/spark_jdbc_dataset.py @@ -37,24 +37,27 @@ class SparkJDBCDataset(AbstractDataset[DataFrame, DataFrame]): Example usage for the `Python API `_: - :: + + .. code-block:: pycon >>> import pandas as pd >>> from kedro_datasets import SparkJBDCDataset >>> from pyspark.sql import SparkSession >>> >>> spark = SparkSession.builder.getOrCreate() - >>> data = spark.createDataFrame(pd.DataFrame({'col1': [1, 2], - ... 'col2': [4, 5], - ... 'col3': [5, 6]})) - >>> url = 'jdbc:postgresql://localhost/test' - >>> table = 'table_a' - >>> connection_properties = {'driver': 'org.postgresql.Driver'} + >>> data = spark.createDataFrame( + ... pd.DataFrame({"col1": [1, 2], "col2": [4, 5], "col3": [5, 6]}) + ... ) + >>> url = "jdbc:postgresql://localhost/test" + >>> table = "table_a" + >>> connection_properties = {"driver": "org.postgresql.Driver"} >>> dataset = SparkJDBCDataset( - ... url=url, table=table, credentials={'user': 'scott', - ... 'password': 'tiger'}, - ... load_args={'properties': connection_properties}, - ... save_args={'properties': connection_properties}) + ... url=url, + ... table=table, + ... credentials={"user": "scott", "password": "tiger"}, + ... load_args={"properties": connection_properties}, + ... 
save_args={"properties": connection_properties}, + ... ) >>> >>> dataset.save(data) >>> reloaded = dataset.load() diff --git a/kedro-datasets/kedro_datasets/svmlight/svmlight_dataset.py b/kedro-datasets/kedro_datasets/svmlight/svmlight_dataset.py index 05edae8a6..a24f40947 100644 --- a/kedro-datasets/kedro_datasets/svmlight/svmlight_dataset.py +++ b/kedro-datasets/kedro_datasets/svmlight/svmlight_dataset.py @@ -68,7 +68,8 @@ class SVMLightDataset(AbstractVersionedDataset[_DI, _DO]): Example usage for the `Python API `_: - :: + + .. code-block:: pycon >>> from kedro_datasets.svmlight import SVMLightDataset >>> import numpy as np diff --git a/kedro-datasets/kedro_datasets/tensorflow/tensorflow_model_dataset.py b/kedro-datasets/kedro_datasets/tensorflow/tensorflow_model_dataset.py index a95b1bfa2..e2ca6f12e 100644 --- a/kedro-datasets/kedro_datasets/tensorflow/tensorflow_model_dataset.py +++ b/kedro-datasets/kedro_datasets/tensorflow/tensorflow_model_dataset.py @@ -41,7 +41,8 @@ class TensorFlowModelDataset(AbstractVersionedDataset[tf.keras.Model, tf.keras.M Example usage for the `Python API `_: - :: + + .. code-block:: pycon >>> from kedro_datasets.tensorflow import TensorFlowModelDataset >>> import tensorflow as tf diff --git a/kedro-datasets/kedro_datasets/text/text_dataset.py b/kedro-datasets/kedro_datasets/text/text_dataset.py index a6d9be17e..3d31dd3dd 100644 --- a/kedro-datasets/kedro_datasets/text/text_dataset.py +++ b/kedro-datasets/kedro_datasets/text/text_dataset.py @@ -30,7 +30,8 @@ class TextDataset(AbstractVersionedDataset[str, str]): Example usage for the `Python API `_: - :: + + .. code-block:: pycon >>> from kedro_datasets.text import TextDataset >>> diff --git a/kedro-datasets/kedro_datasets/tracking/json_dataset.py b/kedro-datasets/kedro_datasets/tracking/json_dataset.py index 943e686fd..2dbe0c9ca 100644 --- a/kedro-datasets/kedro_datasets/tracking/json_dataset.py +++ b/kedro-datasets/kedro_datasets/tracking/json_dataset.py @@ -30,11 +30,12 @@ class JSONDataset(json_dataset.JSONDataset): Example usage for the `Python API `_: - :: + + .. code-block:: pycon >>> from kedro_datasets.tracking import JSONDataset >>> - >>> data = {'col1': 1, 'col2': 0.23, 'col3': 0.002} + >>> data = {"col1": 1, "col2": 0.23, "col3": 0.002} >>> >>> dataset = JSONDataset(filepath="test.json") >>> dataset.save(data) diff --git a/kedro-datasets/kedro_datasets/tracking/metrics_dataset.py b/kedro-datasets/kedro_datasets/tracking/metrics_dataset.py index cfd30d1a4..d4336cf69 100644 --- a/kedro-datasets/kedro_datasets/tracking/metrics_dataset.py +++ b/kedro-datasets/kedro_datasets/tracking/metrics_dataset.py @@ -32,11 +32,12 @@ class MetricsDataset(json_dataset.JSONDataset): Example usage for the `Python API `_: - :: + + .. code-block:: pycon >>> from kedro_datasets.tracking import MetricsDataset >>> - >>> data = {'col1': 1, 'col2': 0.23, 'col3': 0.002} + >>> data = {"col1": 1, "col2": 0.23, "col3": 0.002} >>> >>> dataset = MetricsDataset(filepath="test.json") >>> dataset.save(data) diff --git a/kedro-datasets/kedro_datasets/video/video_dataset.py b/kedro-datasets/kedro_datasets/video/video_dataset.py index de97d7b8e..1e601fb8a 100644 --- a/kedro-datasets/kedro_datasets/video/video_dataset.py +++ b/kedro-datasets/kedro_datasets/video/video_dataset.py @@ -220,24 +220,26 @@ class VideoDataset(AbstractDataset[AbstractVideo, AbstractVideo]): Example usage for the `Python API `_: - :: + + .. 
code-block:: pycon >>> from kedro_datasets.video import VideoDataset >>> import numpy as np >>> - >>> video = VideoDataset(filepath='/video/file/path.mp4').load() + >>> video = VideoDataset(filepath="/video/file/path.mp4").load() >>> frame = video[0] >>> np.sum(np.asarray(frame)) Example creating a video from numpy frames using Python API: - :: + + .. code-block:: pycon >>> from kedro_datasets.video.video_dataset import VideoDataset, SequenceVideo >>> import numpy as np >>> from PIL import Image >>> - >>> frame = np.ones((640,480,3), dtype=np.uint8) * 255 + >>> frame = np.ones((640, 480, 3), dtype=np.uint8) * 255 >>> imgs = [] >>> for i in range(255): ... imgs.append(Image.fromarray(frame)) @@ -248,14 +250,15 @@ class VideoDataset(AbstractDataset[AbstractVideo, AbstractVideo]): Example creating a video from numpy frames using a generator and the Python API: - :: + + .. code-block:: pycon >>> from kedro_datasets.video.video_dataset import VideoDataset, GeneratorVideo >>> import numpy as np >>> from PIL import Image >>> >>> def gen(): - ... frame = np.ones((640,480,3), dtype=np.uint8) * 255 + ... frame = np.ones((640, 480, 3), dtype=np.uint8) * 255 ... for i in range(255): ... yield Image.fromarray(frame) ... frame -= 1 diff --git a/kedro-datasets/kedro_datasets/yaml/yaml_dataset.py b/kedro-datasets/kedro_datasets/yaml/yaml_dataset.py index d9aa536fb..77d3dcf96 100644 --- a/kedro-datasets/kedro_datasets/yaml/yaml_dataset.py +++ b/kedro-datasets/kedro_datasets/yaml/yaml_dataset.py @@ -31,11 +31,12 @@ class YAMLDataset(AbstractVersionedDataset[Dict, Dict]): Example usage for the `Python API `_: - :: + + .. code-block:: pycon >>> from kedro_datasets.yaml import YAMLDataset >>> - >>> data = {'col1': [1, 2], 'col2': [4, 5], 'col3': [5, 6]} + >>> data = {"col1": [1, 2], "col2": [4, 5], "col3": [5, 6]} >>> >>> dataset = YAMLDataset(filepath="test.yaml") >>> dataset.save(data) From 527706d3c512e3f75ae2183afa17757213379e80 Mon Sep 17 00:00:00 2001 From: Felix Wittmann Date: Tue, 10 Oct 2023 12:51:22 +0200 Subject: [PATCH 11/15] docs: cloudpickle is an interesting extension of the pickle functionality (#361) Signed-off-by: H. Felix Wittmann --- kedro-datasets/RELEASE.md | 7 +++++-- kedro-datasets/kedro_datasets/pickle/pickle_dataset.py | 4 ++++ kedro-datasets/kedro_datasets/redis/redis_dataset.py | 5 +++++ kedro-datasets/setup.py | 1 + kedro-datasets/tests/pickle/test_pickle_dataset.py | 1 + kedro-datasets/tests/redis/test_redis_dataset.py | 1 + 6 files changed, 17 insertions(+), 2 deletions(-) diff --git a/kedro-datasets/RELEASE.md b/kedro-datasets/RELEASE.md index 0b7ac02cc..63f6ae91a 100755 --- a/kedro-datasets/RELEASE.md +++ b/kedro-datasets/RELEASE.md @@ -1,8 +1,13 @@ # Upcoming Release ## Major features and improvements ## Bug fixes and other changes +* Updated `PickleDataset` to explicitly mention `cloudpickle` support. ## Upcoming deprecations for Kedro-Datasets 2.0.0 +## Community contributions +Many thanks to the following Kedroids for contributing PRs to this release: +* [Felix Wittmann](https://github.com/hfwittmann) + # Release 1.7.1 ## Bug fixes and other changes * Pin `tables` version on `kedro-datasets` for Python < 3.8. @@ -10,8 +15,6 @@ ## Upcoming deprecations for Kedro-Datasets 2.0.0 * Renamed dataset and error classes, in accordance with the [Kedro lexicon](https://github.com/kedro-org/kedro/wiki/Kedro-documentation-style-guide#kedro-lexicon). Dataset classes ending with "DataSet" are deprecated and will be removed in 2.0.0. 
-## Community contributions - # Release 1.7.0: ## Major features and improvements * Added `polars.GenericDataSet`, a `GenericDataSet` backed by [polars](https://www.pola.rs/), a lightning fast dataframe package built entirely using Rust. diff --git a/kedro-datasets/kedro_datasets/pickle/pickle_dataset.py b/kedro-datasets/kedro_datasets/pickle/pickle_dataset.py index 21d3b8c71..21f97b713 100644 --- a/kedro-datasets/kedro_datasets/pickle/pickle_dataset.py +++ b/kedro-datasets/kedro_datasets/pickle/pickle_dataset.py @@ -110,6 +110,8 @@ def __init__( # noqa: PLR0913 dill.load: https://dill.readthedocs.io/en/latest/index.html#dill.load compress_pickle.load: https://lucianopaz.github.io/compress_pickle/html/api/compress_pickle.html#compress_pickle.compress_pickle.load + cloudpickle.load: + https://github.com/cloudpipe/cloudpickle/blob/0f330b6afe55313fc1efc090a7d350f5ad5c9317/tests/cloudpickle_test.py All defaults are preserved. save_args: Pickle options for saving pickle files. You can pass in arguments that the backend dump function specified accepts, e.g: @@ -118,6 +120,8 @@ def __init__( # noqa: PLR0913 dill.dump: https://dill.readthedocs.io/en/latest/index.html#dill.dump compress_pickle.dump: https://lucianopaz.github.io/compress_pickle/html/api/compress_pickle.html#compress_pickle.compress_pickle.dump + cloudpickle.dump: + https://github.com/cloudpipe/cloudpickle/blob/0f330b6afe55313fc1efc090a7d350f5ad5c9317/tests/cloudpickle_test.py All defaults are preserved. version: If specified, should be an instance of ``kedro.io.core.Version``. If its ``load`` attribute is diff --git a/kedro-datasets/kedro_datasets/redis/redis_dataset.py b/kedro-datasets/kedro_datasets/redis/redis_dataset.py index 9979cf386..1e782059b 100644 --- a/kedro-datasets/kedro_datasets/redis/redis_dataset.py +++ b/kedro-datasets/kedro_datasets/redis/redis_dataset.py @@ -79,6 +79,7 @@ def __init__( # noqa: PLR0913 * `pickle` * `dill` * `compress_pickle` + * `cloudpickle` Example backends that are incompatible: * `torch` @@ -94,6 +95,8 @@ def __init__( # noqa: PLR0913 dill.loads: https://dill.readthedocs.io/en/latest/index.html#dill.loads compress_pickle.loads: https://lucianopaz.github.io/compress_pickle/html/api/compress_pickle.html#compress_pickle.compress_pickle.loads + cloudpickle.loads: + https://github.com/cloudpipe/cloudpickle/blob/0f330b6afe55313fc1efc090a7d350f5ad5c9317/tests/cloudpickle_test.py All defaults are preserved. save_args: Pickle options for saving pickle files. You can pass in arguments that the backend dump function specified accepts, e.g: @@ -101,6 +104,8 @@ def __init__( # noqa: PLR0913 dill.dumps: https://dill.readthedocs.io/en/latest/index.html#dill.dumps compress_pickle.dumps: https://lucianopaz.github.io/compress_pickle/html/api/compress_pickle.html#compress_pickle.compress_pickle.dumps + cloudpickle.dumps: + https://github.com/cloudpipe/cloudpickle/blob/0f330b6afe55313fc1efc090a7d350f5ad5c9317/tests/cloudpickle_test.py All defaults are preserved. credentials: Credentials required to get access to the redis server. E.g. `{"password": None}`. 
diff --git a/kedro-datasets/setup.py b/kedro-datasets/setup.py index 340ad5e67..a22e83f81 100644 --- a/kedro-datasets/setup.py +++ b/kedro-datasets/setup.py @@ -153,6 +153,7 @@ def _collect_requirements(requires): "biopython~=1.73", "blacken-docs==1.9.2", "black~=22.0", + "cloudpickle<=2.0.0", "compress-pickle[lz4]~=2.1.0", "coverage[toml]", "dask[complete]~=2021.10", # pinned by Snyk to avoid a vulnerability diff --git a/kedro-datasets/tests/pickle/test_pickle_dataset.py b/kedro-datasets/tests/pickle/test_pickle_dataset.py index be09d6291..e53a8b675 100644 --- a/kedro-datasets/tests/pickle/test_pickle_dataset.py +++ b/kedro-datasets/tests/pickle/test_pickle_dataset.py @@ -68,6 +68,7 @@ class TestPickleDataset: ("pickle", None, None), ("joblib", None, None), ("dill", None, None), + ("cloudpickle", None, None), ("compress_pickle", {"compression": "lz4"}, {"compression": "lz4"}), ], indirect=True, diff --git a/kedro-datasets/tests/redis/test_redis_dataset.py b/kedro-datasets/tests/redis/test_redis_dataset.py index a2ec3bf83..f569d7d22 100644 --- a/kedro-datasets/tests/redis/test_redis_dataset.py +++ b/kedro-datasets/tests/redis/test_redis_dataset.py @@ -76,6 +76,7 @@ class TestPickleDataset: [ ("a", "pickle", None, None), (1, "dill", None, None), + (2, "cloudpickle", None, None), ("key", "compress_pickle", {"compression": "lz4"}, {"compression": "lz4"}), ], indirect=True, From 358fd952656fe2450f4f6e27abc6c60433d61f8a Mon Sep 17 00:00:00 2001 From: Merel Theisen <49397448+merelcht@users.noreply.github.com> Date: Wed, 11 Oct 2023 10:13:29 +0100 Subject: [PATCH 12/15] fix(datasets): Fix secret scan entropy error (#383) Fix secret scan entropy error Signed-off-by: Merel Theisen --- kedro-datasets/kedro_datasets/pickle/pickle_dataset.py | 4 ++-- kedro-datasets/kedro_datasets/redis/redis_dataset.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/kedro-datasets/kedro_datasets/pickle/pickle_dataset.py b/kedro-datasets/kedro_datasets/pickle/pickle_dataset.py index 21f97b713..05be25733 100644 --- a/kedro-datasets/kedro_datasets/pickle/pickle_dataset.py +++ b/kedro-datasets/kedro_datasets/pickle/pickle_dataset.py @@ -111,7 +111,7 @@ def __init__( # noqa: PLR0913 compress_pickle.load: https://lucianopaz.github.io/compress_pickle/html/api/compress_pickle.html#compress_pickle.compress_pickle.load cloudpickle.load: - https://github.com/cloudpipe/cloudpickle/blob/0f330b6afe55313fc1efc090a7d350f5ad5c9317/tests/cloudpickle_test.py + https://github.com/cloudpipe/cloudpickle/blob/master/tests/cloudpickle_test.py All defaults are preserved. save_args: Pickle options for saving pickle files. You can pass in arguments that the backend dump function specified accepts, e.g: @@ -121,7 +121,7 @@ def __init__( # noqa: PLR0913 compress_pickle.dump: https://lucianopaz.github.io/compress_pickle/html/api/compress_pickle.html#compress_pickle.compress_pickle.dump cloudpickle.dump: - https://github.com/cloudpipe/cloudpickle/blob/0f330b6afe55313fc1efc090a7d350f5ad5c9317/tests/cloudpickle_test.py + https://github.com/cloudpipe/cloudpickle/blob/master/tests/cloudpickle_test.py All defaults are preserved. version: If specified, should be an instance of ``kedro.io.core.Version``. 
If its ``load`` attribute is diff --git a/kedro-datasets/kedro_datasets/redis/redis_dataset.py b/kedro-datasets/kedro_datasets/redis/redis_dataset.py index 1e782059b..8031e6907 100644 --- a/kedro-datasets/kedro_datasets/redis/redis_dataset.py +++ b/kedro-datasets/kedro_datasets/redis/redis_dataset.py @@ -96,7 +96,7 @@ def __init__( # noqa: PLR0913 compress_pickle.loads: https://lucianopaz.github.io/compress_pickle/html/api/compress_pickle.html#compress_pickle.compress_pickle.loads cloudpickle.loads: - https://github.com/cloudpipe/cloudpickle/blob/0f330b6afe55313fc1efc090a7d350f5ad5c9317/tests/cloudpickle_test.py + https://github.com/cloudpipe/cloudpickle/blob/master/tests/cloudpickle_test.py All defaults are preserved. save_args: Pickle options for saving pickle files. You can pass in arguments that the backend dump function specified accepts, e.g: @@ -105,7 +105,7 @@ def __init__( # noqa: PLR0913 compress_pickle.dumps: https://lucianopaz.github.io/compress_pickle/html/api/compress_pickle.html#compress_pickle.compress_pickle.dumps cloudpickle.dumps: - https://github.com/cloudpipe/cloudpickle/blob/0f330b6afe55313fc1efc090a7d350f5ad5c9317/tests/cloudpickle_test.py + https://github.com/cloudpipe/cloudpickle/blob/master/tests/cloudpickle_test.py All defaults are preserved. credentials: Credentials required to get access to the redis server. E.g. `{"password": None}`. From d8f1fd5203d1ef06d0dc7d5c405d86a214705fae Mon Sep 17 00:00:00 2001 From: Merel Theisen <49397448+merelcht@users.noreply.github.com> Date: Wed, 11 Oct 2023 16:20:07 +0100 Subject: [PATCH 13/15] style: Rename mentions of `DataSet` to `Dataset` in `kedro-airflow` and `kedro-telemetry` (#384) Signed-off-by: Merel Theisen --- kedro-airflow/features/steps/cli_steps.py | 24 +++++++++++------------ kedro-telemetry/tests/test_plugin.py | 8 ++++---- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/kedro-airflow/features/steps/cli_steps.py b/kedro-airflow/features/steps/cli_steps.py index 23eb58727..7bfa482ac 100644 --- a/kedro-airflow/features/steps/cli_steps.py +++ b/kedro-airflow/features/steps/cli_steps.py @@ -20,27 +20,27 @@ def init_airflow(context, home_dir): def prepare_old_catalog(context): config = { "example_train_x": { - "type": "PickleLocalDataSet", + "type": "PickleLocalDataset", "filepath": "data/02_intermediate/example_train_x.pkl", }, "example_train_y": { - "type": "PickleLocalDataSet", + "type": "PickleLocalDataset", "filepath": "data/02_intermediate/example_train_y.pkl", }, "example_test_x": { - "type": "PickleLocalDataSet", + "type": "PickleLocalDataset", "filepath": "data/02_intermediate/example_test_x.pkl", }, "example_test_y": { - "type": "PickleLocalDataSet", + "type": "PickleLocalDataset", "filepath": "data/02_intermediate/example_test_y.pkl", }, "example_model": { - "type": "PickleLocalDataSet", + "type": "PickleLocalDataset", "filepath": "data/02_intermediate/example_model.pkl", }, "example_predictions": { - "type": "PickleLocalDataSet", + "type": "PickleLocalDataset", "filepath": "data/02_intermediate/example_predictions.pkl", }, } @@ -53,27 +53,27 @@ def prepare_old_catalog(context): def prepare_catalog(context): config = { "example_train_x": { - "type": "pickle.PickleDataSet", + "type": "pickle.PickleDataset", "filepath": "data/02_intermediate/example_train_x.pkl", }, "example_train_y": { - "type": "pickle.PickleDataSet", + "type": "pickle.PickleDataset", "filepath": "data/02_intermediate/example_train_y.pkl", }, "example_test_x": { - "type": "pickle.PickleDataSet", + "type": 
"pickle.PickleDataset", "filepath": "data/02_intermediate/example_test_x.pkl", }, "example_test_y": { - "type": "pickle.PickleDataSet", + "type": "pickle.PickleDataset", "filepath": "data/02_intermediate/example_test_y.pkl", }, "example_model": { - "type": "pickle.PickleDataSet", + "type": "pickle.PickleDataset", "filepath": "data/02_intermediate/example_model.pkl", }, "example_predictions": { - "type": "pickle.PickleDataSet", + "type": "pickle.PickleDataset", "filepath": "data/02_intermediate/example_predictions.pkl", }, } diff --git a/kedro-telemetry/tests/test_plugin.py b/kedro-telemetry/tests/test_plugin.py index ccbaf8afe..bee020e7d 100644 --- a/kedro-telemetry/tests/test_plugin.py +++ b/kedro-telemetry/tests/test_plugin.py @@ -6,7 +6,7 @@ from kedro import __version__ as kedro_version from kedro.framework.project import pipelines from kedro.framework.startup import ProjectMetadata -from kedro.io import DataCatalog, MemoryDataSet +from kedro.io import DataCatalog, MemoryDataset from kedro.pipeline import node from kedro.pipeline.modular_pipeline import pipeline as modular_pipeline from pytest import fixture @@ -40,9 +40,9 @@ def fake_metadata(tmp_path): @fixture def fake_context(mocker): mock_context = mocker.Mock() - dummy_1 = MemoryDataSet() - dummy_2 = MemoryDataSet() - dummy_3 = MemoryDataSet() + dummy_1 = MemoryDataset() + dummy_2 = MemoryDataset() + dummy_3 = MemoryDataset() mock_context.catalog = DataCatalog( {"dummy_1": dummy_1, "dummy_2": dummy_2, "dummy_3": dummy_3} ) From 6d1fca47dd54ee8fdc14d0fefa47f0cbe7e59efe Mon Sep 17 00:00:00 2001 From: PtrBld <7523956+PtrBld@users.noreply.github.com> Date: Wed, 11 Oct 2023 17:45:57 +0200 Subject: [PATCH 14/15] feat(datasets): Migrated `PartitionedDataSet` and `IncrementalDataSet` from main repository to kedro-datasets (#253) Signed-off-by: Peter Bludau Co-authored-by: Merel Theisen --- kedro-datasets/RELEASE.md | 5 + kedro-datasets/docs/source/kedro_datasets.rst | 2 + .../kedro_datasets/partitions/__init__.py | 11 + .../partitions/incremental_dataset.py | 237 ++++++++ .../partitions/partitioned_dataset.py | 329 +++++++++++ kedro-datasets/tests/partitions/__init__.py | 0 .../partitions/test_incremental_dataset.py | 508 ++++++++++++++++ .../partitions/test_partitioned_dataset.py | 540 ++++++++++++++++++ 8 files changed, 1632 insertions(+) create mode 100644 kedro-datasets/kedro_datasets/partitions/__init__.py create mode 100644 kedro-datasets/kedro_datasets/partitions/incremental_dataset.py create mode 100644 kedro-datasets/kedro_datasets/partitions/partitioned_dataset.py create mode 100644 kedro-datasets/tests/partitions/__init__.py create mode 100644 kedro-datasets/tests/partitions/test_incremental_dataset.py create mode 100644 kedro-datasets/tests/partitions/test_partitioned_dataset.py diff --git a/kedro-datasets/RELEASE.md b/kedro-datasets/RELEASE.md index 63f6ae91a..6769730f7 100755 --- a/kedro-datasets/RELEASE.md +++ b/kedro-datasets/RELEASE.md @@ -1,8 +1,13 @@ # Upcoming Release ## Major features and improvements +* Moved `PartitionedDataSet` and `IncrementalDataSet` from the core Kedro repo to `kedro-datasets` and renamed to `PartitionedDataset` and `IncrementalDataset`. + ## Bug fixes and other changes * Updated `PickleDataset` to explicitly mention `cloudpickle` support. 
## Upcoming deprecations for Kedro-Datasets 2.0.0 +## Community contributions +Many thanks to the following Kedroids for contributing PRs to this release: +* [PtrBld](https://github.com/PtrBld) ## Community contributions Many thanks to the following Kedroids for contributing PRs to this release: diff --git a/kedro-datasets/docs/source/kedro_datasets.rst b/kedro-datasets/docs/source/kedro_datasets.rst index d8db36ee0..67f87e0e3 100644 --- a/kedro-datasets/docs/source/kedro_datasets.rst +++ b/kedro-datasets/docs/source/kedro_datasets.rst @@ -59,6 +59,8 @@ kedro_datasets kedro_datasets.pandas.SQLTableDataset kedro_datasets.pandas.XMLDataSet kedro_datasets.pandas.XMLDataset + kedro_datasets.partitions.IncrementalDataset + kedro_datasets.partitions.PartitionedDataset kedro_datasets.pickle.PickleDataSet kedro_datasets.pickle.PickleDataset kedro_datasets.pillow.ImageDataSet diff --git a/kedro-datasets/kedro_datasets/partitions/__init__.py b/kedro-datasets/kedro_datasets/partitions/__init__.py new file mode 100644 index 000000000..2f464a907 --- /dev/null +++ b/kedro-datasets/kedro_datasets/partitions/__init__.py @@ -0,0 +1,11 @@ +"""``AbstractDataset`` implementation to load/save data in partitions +from/to any underlying Dataset format. +""" + +__all__ = ["PartitionedDataset", "IncrementalDataset"] + +from contextlib import suppress + +with suppress(ImportError): + from .incremental_dataset import IncrementalDataset + from .partitioned_dataset import PartitionedDataset diff --git a/kedro-datasets/kedro_datasets/partitions/incremental_dataset.py b/kedro-datasets/kedro_datasets/partitions/incremental_dataset.py new file mode 100644 index 000000000..59aa9789d --- /dev/null +++ b/kedro-datasets/kedro_datasets/partitions/incremental_dataset.py @@ -0,0 +1,237 @@ +"""``IncrementalDataset`` inherits from ``PartitionedDataset``, which loads +and saves partitioned file-like data using the underlying dataset +definition. ``IncrementalDataset`` also stores the information about the last +processed partition in so-called `checkpoint` that is persisted to the location +of the data partitions by default, so that subsequent pipeline run loads only +new partitions past the checkpoint.It also uses `fsspec` for filesystem level operations. +""" +from __future__ import annotations + +import operator +from copy import deepcopy +from typing import Any, Callable + +from cachetools import cachedmethod +from kedro.io.core import ( + VERSION_KEY, + VERSIONED_FLAG_KEY, + AbstractDataset, + DatasetError, + parse_dataset_definition, +) +from kedro.io.data_catalog import CREDENTIALS_KEY +from kedro.utils import load_obj + +from .partitioned_dataset import KEY_PROPAGATION_WARNING, PartitionedDataset + + +class IncrementalDataset(PartitionedDataset): + """``IncrementalDataset`` inherits from ``PartitionedDataset``, which loads + and saves partitioned file-like data using the underlying dataset + definition. For filesystem level operations it uses `fsspec`: + https://github.com/intake/filesystem_spec. ``IncrementalDataset`` also stores + the information about the last processed partition in so-called `checkpoint` + that is persisted to the location of the data partitions by default, so that + subsequent pipeline run loads only new partitions past the checkpoint. 
+ + Example: + :: + + >>> from kedro_datasets.partitions import IncrementalDataset + >>> + >>> # these credentials will be passed to: + >>> # a) 'fsspec.filesystem()' call, + >>> # b) the dataset initializer, + >>> # c) the checkpoint initializer + >>> credentials = {"key1": "secret1", "key2": "secret2"} + >>> + >>> data_set = IncrementalDataset( + >>> path="s3://bucket-name/path/to/folder", + >>> dataset="pandas.CSVDataset", + >>> credentials=credentials + >>> ) + >>> loaded = data_set.load() # loads all available partitions + >>> # assert isinstance(loaded, dict) + >>> + >>> data_set.confirm() # update checkpoint value to the last processed partition ID + >>> reloaded = data_set.load() # still loads all available partitions + >>> + >>> data_set.release() # clears load cache + >>> # returns an empty dictionary as no new partitions were added + >>> data_set.load() + """ + + DEFAULT_CHECKPOINT_TYPE = "kedro_datasets.text.TextDataset" + DEFAULT_CHECKPOINT_FILENAME = "CHECKPOINT" + + def __init__( # noqa: PLR0913 + self, + path: str, + dataset: str | type[AbstractDataset] | dict[str, Any], + checkpoint: str | dict[str, Any] | None = None, + filepath_arg: str = "filepath", + filename_suffix: str = "", + credentials: dict[str, Any] = None, + load_args: dict[str, Any] = None, + fs_args: dict[str, Any] = None, + metadata: dict[str, Any] = None, + ) -> None: + """Creates a new instance of ``IncrementalDataset``. + + Args: + path: Path to the folder containing partitioned data. + If path starts with the protocol (e.g., ``s3://``) then the + corresponding ``fsspec`` concrete filesystem implementation will + be used. If protocol is not specified, + ``fsspec.implementations.local.LocalFileSystem`` will be used. + **Note:** Some concrete implementations are bundled with ``fsspec``, + while others (like ``s3`` or ``gcs``) must be installed separately + prior to usage of the ``PartitionedDataset``. + dataset: Underlying dataset definition. This is used to instantiate + the dataset for each file located inside the ``path``. + Accepted formats are: + a) object of a class that inherits from ``AbstractDataset`` + b) a string representing a fully qualified class name to such class + c) a dictionary with ``type`` key pointing to a string from b), + other keys are passed to the Dataset initializer. + Credentials for the dataset can be explicitly specified in + this configuration. + checkpoint: Optional checkpoint configuration. Accepts a dictionary + with the corresponding dataset definition including ``filepath`` + (unlike ``dataset`` argument). Checkpoint configuration is + described here: + https://kedro.readthedocs.io/en/stable/data/kedro_io.html#checkpoint-configuration + Credentials for the checkpoint can be explicitly specified + in this configuration. + filepath_arg: Underlying dataset initializer argument that will + contain a path to each corresponding partition file. + If unspecified, defaults to "filepath". + filename_suffix: If specified, only partitions that end with this + string will be processed. + credentials: Protocol-specific options that will be passed to + ``fsspec.filesystem`` + https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.filesystem, + the dataset dataset initializer and the checkpoint. If + the dataset or the checkpoint configuration contains explicit + credentials spec, then such spec will take precedence. 
+ All possible credentials management scenarios are documented here: + https://kedro.readthedocs.io/en/stable/data/kedro_io.html#partitioned-dataset-credentials + load_args: Keyword arguments to be passed into ``find()`` method of + the filesystem implementation. + fs_args: Extra arguments to pass into underlying filesystem class constructor + (e.g. `{"project": "my-project"}` for ``GCSFileSystem``). + metadata: Any arbitrary metadata. + This is ignored by Kedro, but may be consumed by users or external plugins. + + Raises: + DatasetError: If versioning is enabled for the underlying dataset. + """ + + super().__init__( + path=path, + dataset=dataset, + filepath_arg=filepath_arg, + filename_suffix=filename_suffix, + credentials=credentials, + load_args=load_args, + fs_args=fs_args, + ) + + self._checkpoint_config = self._parse_checkpoint_config(checkpoint) + self._force_checkpoint = self._checkpoint_config.pop("force_checkpoint", None) + self.metadata = metadata + + comparison_func = self._checkpoint_config.pop("comparison_func", operator.gt) + if isinstance(comparison_func, str): + comparison_func = load_obj(comparison_func) + self._comparison_func = comparison_func + + def _parse_checkpoint_config( + self, checkpoint_config: str | dict[str, Any] | None + ) -> dict[str, Any]: + checkpoint_config = deepcopy(checkpoint_config) + if isinstance(checkpoint_config, str): + checkpoint_config = {"force_checkpoint": checkpoint_config} + checkpoint_config = checkpoint_config or {} + + for key in {VERSION_KEY, VERSIONED_FLAG_KEY} & checkpoint_config.keys(): + raise DatasetError( + f"'{self.__class__.__name__}' does not support versioning of the " + f"checkpoint. Please remove '{key}' key from the checkpoint definition." + ) + + default_checkpoint_path = self._sep.join( + [self._normalized_path.rstrip(self._sep), self.DEFAULT_CHECKPOINT_FILENAME] + ) + default_config = { + "type": self.DEFAULT_CHECKPOINT_TYPE, + self._filepath_arg: default_checkpoint_path, + } + if self._credentials: + default_config[CREDENTIALS_KEY] = deepcopy(self._credentials) + + if CREDENTIALS_KEY in default_config.keys() & checkpoint_config.keys(): + self._logger.warning( + KEY_PROPAGATION_WARNING, + {"keys": CREDENTIALS_KEY, "target": "checkpoint"}, + ) + + return {**default_config, **checkpoint_config} + + @cachedmethod(cache=operator.attrgetter("_partition_cache")) + def _list_partitions(self) -> list[str]: + checkpoint = self._read_checkpoint() + checkpoint_path = self._filesystem._strip_protocol( + self._checkpoint_config[self._filepath_arg] + ) + + def _is_valid_partition(partition) -> bool: + if not partition.endswith(self._filename_suffix): + return False + if partition == checkpoint_path: + return False + if checkpoint is None: + # nothing was processed yet + return True + partition_id = self._path_to_partition(partition) + return self._comparison_func(partition_id, checkpoint) + + return sorted( + part + for part in self._filesystem.find(self._normalized_path, **self._load_args) + if _is_valid_partition(part) + ) + + @property + def _checkpoint(self) -> AbstractDataset: + type_, kwargs = parse_dataset_definition(self._checkpoint_config) + return type_(**kwargs) # type: ignore + + def _read_checkpoint(self) -> str | None: + if self._force_checkpoint is not None: + return self._force_checkpoint + try: + return self._checkpoint.load() + except DatasetError: + return None + + def _load(self) -> dict[str, Callable[[], Any]]: + partitions: dict[str, Any] = {} + + for partition in self._list_partitions(): + partition_id = 
self._path_to_partition(partition) + kwargs = deepcopy(self._dataset_config) + # join the protocol back since PySpark may rely on it + kwargs[self._filepath_arg] = self._join_protocol(partition) + partitions[partition_id] = self._dataset_type( # type: ignore + **kwargs + ).load() + + return partitions + + def confirm(self) -> None: + """Confirm the dataset by updating the checkpoint value to the latest + processed partition ID""" + partition_ids = [self._path_to_partition(p) for p in self._list_partitions()] + if partition_ids: + self._checkpoint.save(partition_ids[-1]) # checkpoint to last partition diff --git a/kedro-datasets/kedro_datasets/partitions/partitioned_dataset.py b/kedro-datasets/kedro_datasets/partitions/partitioned_dataset.py new file mode 100644 index 000000000..74242b113 --- /dev/null +++ b/kedro-datasets/kedro_datasets/partitions/partitioned_dataset.py @@ -0,0 +1,329 @@ +"""``PartitionedDataset`` loads and saves partitioned file-like data using the +underlying dataset definition. It also uses `fsspec` for filesystem level operations. +""" +from __future__ import annotations + +import operator +from copy import deepcopy +from typing import Any, Callable, Dict +from urllib.parse import urlparse +from warnings import warn + +import fsspec +from cachetools import Cache, cachedmethod +from kedro.io.core import ( + VERSION_KEY, + VERSIONED_FLAG_KEY, + AbstractDataset, + DatasetError, + parse_dataset_definition, +) +from kedro.io.data_catalog import CREDENTIALS_KEY + +KEY_PROPAGATION_WARNING = ( + "Top-level %(keys)s will not propagate into the %(target)s since " + "%(keys)s were explicitly defined in the %(target)s config." +) + +S3_PROTOCOLS = ("s3", "s3a", "s3n") + + +class PartitionedDataset(AbstractDataset[Dict[str, Any], Dict[str, Callable[[], Any]]]): + """``PartitionedDataset`` loads and saves partitioned file-like data using the + underlying dataset definition. For filesystem level operations it uses `fsspec`: + https://github.com/intake/filesystem_spec. + + It also supports advanced features like + `lazy saving `_. + + Example usage for the + `YAML API `_: + + .. code-block:: yaml + + station_data: + type: PartitionedDataset + path: data/03_primary/station_data + dataset: + type: pandas.CSVDataset + load_args: + sep: '\\t' + save_args: + sep: '\\t' + index: true + filename_suffix: '.dat' + + Example usage for the + `Python API `_: + :: + + >>> import pandas as pd + >>> from kedro_datasets.partitions import PartitionedDataset + >>> + >>> # Create a fake pandas dataframe with 10 rows of data + >>> df = pd.DataFrame([{"DAY_OF_MONTH": str(i), "VALUE": i} for i in range(1, 11)]) + >>> + >>> # Convert it to a dict of pd.DataFrame with DAY_OF_MONTH as the dict key + >>> dict_df = { + day_of_month: df[df["DAY_OF_MONTH"] == day_of_month] + for day_of_month in df["DAY_OF_MONTH"] + } + >>> + >>> # Save it as small paritions with DAY_OF_MONTH as the partition key + >>> data_set = PartitionedDataset( + path="df_with_partition", + dataset="pandas.CSVDataset", + filename_suffix=".csv" + ) + >>> # This will create a folder `df_with_partition` and save multiple files + >>> # with the dict key + filename_suffix as filename, i.e. 1.csv, 2.csv etc. + >>> data_set.save(dict_df) + >>> + >>> # This will create lazy load functions instead of loading data into memory immediately. 
+ >>> loaded = data_set.load() + >>> + >>> # Load all the partitions + >>> for partition_id, partition_load_func in loaded.items(): + # The actual function that loads the data + partition_data = partition_load_func() + >>> + >>> # Add the processing logic for individual partition HERE + >>> print(partition_data) + + You can also load multiple partitions from a remote storage and combine them + like this: + :: + + >>> import pandas as pd + >>> from kedro_datasets.partitions import PartitionedDataset + >>> + >>> # these credentials will be passed to both 'fsspec.filesystem()' call + >>> # and the dataset initializer + >>> credentials = {"key1": "secret1", "key2": "secret2"} + >>> + >>> data_set = PartitionedDataset( + path="s3://bucket-name/path/to/folder", + dataset="pandas.CSVDataset", + credentials=credentials + ) + >>> loaded = data_set.load() + >>> # assert isinstance(loaded, dict) + >>> + >>> combine_all = pd.DataFrame() + >>> + >>> for partition_id, partition_load_func in loaded.items(): + partition_data = partition_load_func() + combine_all = pd.concat( + [combine_all, partition_data], ignore_index=True, sort=True + ) + >>> + >>> new_data = pd.DataFrame({"new": [1, 2]}) + >>> # creates "s3://bucket-name/path/to/folder/new/partition.csv" + >>> data_set.save({"new/partition.csv": new_data}) + + """ + + def __init__( # noqa: PLR0913 + self, + path: str, + dataset: str | type[AbstractDataset] | dict[str, Any], + filepath_arg: str = "filepath", + filename_suffix: str = "", + credentials: dict[str, Any] = None, + load_args: dict[str, Any] = None, + fs_args: dict[str, Any] = None, + overwrite: bool = False, + metadata: dict[str, Any] = None, + ) -> None: + """Creates a new instance of ``PartitionedDataset``. + + Args: + path: Path to the folder containing partitioned data. + If path starts with the protocol (e.g., ``s3://``) then the + corresponding ``fsspec`` concrete filesystem implementation will + be used. If protocol is not specified, + ``fsspec.implementations.local.LocalFileSystem`` will be used. + **Note:** Some concrete implementations are bundled with ``fsspec``, + while others (like ``s3`` or ``gcs``) must be installed separately + prior to usage of the ``PartitionedDataset``. + dataset: Underlying dataset definition. This is used to instantiate + the dataset for each file located inside the ``path``. + Accepted formats are: + a) object of a class that inherits from ``AbstractDataset`` + b) a string representing a fully qualified class name to such class + c) a dictionary with ``type`` key pointing to a string from b), + other keys are passed to the Dataset initializer. + Credentials for the dataset can be explicitly specified in + this configuration. + filepath_arg: Underlying dataset initializer argument that will + contain a path to each corresponding partition file. + If unspecified, defaults to "filepath". + filename_suffix: If specified, only partitions that end with this + string will be processed. + credentials: Protocol-specific options that will be passed to + ``fsspec.filesystem`` + https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.filesystem + and the dataset initializer. If the dataset config contains + explicit credentials spec, then such spec will take precedence. + All possible credentials management scenarios are documented here: + https://kedro.readthedocs.io/en/stable/data/kedro_io.html#partitioned-dataset-credentials + load_args: Keyword arguments to be passed into ``find()`` method of + the filesystem implementation. 
+ fs_args: Extra arguments to pass into underlying filesystem class constructor + (e.g. `{"project": "my-project"}` for ``GCSFileSystem``) + overwrite: If True, any existing partitions will be removed. + metadata: Any arbitrary metadata. + This is ignored by Kedro, but may be consumed by users or external plugins. + + Raises: + DatasetError: If versioning is enabled for the underlying dataset. + """ + from fsspec.utils import infer_storage_options # for performance reasons + + super().__init__() + + self._path = path + self._filename_suffix = filename_suffix + self._overwrite = overwrite + self._protocol = infer_storage_options(self._path)["protocol"] + self._partition_cache: Cache = Cache(maxsize=1) + self.metadata = metadata + + dataset = dataset if isinstance(dataset, dict) else {"type": dataset} + self._dataset_type, self._dataset_config = parse_dataset_definition(dataset) + if VERSION_KEY in self._dataset_config: + raise DatasetError( + f"'{self.__class__.__name__}' does not support versioning of the " + f"underlying dataset. Please remove '{VERSIONED_FLAG_KEY}' flag from " + f"the dataset definition." + ) + + if credentials: + if CREDENTIALS_KEY in self._dataset_config: + self._logger.warning( + KEY_PROPAGATION_WARNING, + {"keys": CREDENTIALS_KEY, "target": "underlying dataset"}, + ) + else: + self._dataset_config[CREDENTIALS_KEY] = deepcopy(credentials) + + self._credentials = deepcopy(credentials) or {} + + self._fs_args = deepcopy(fs_args) or {} + if self._fs_args: + if "fs_args" in self._dataset_config: + self._logger.warning( + KEY_PROPAGATION_WARNING, + {"keys": "filesystem arguments", "target": "underlying dataset"}, + ) + else: + self._dataset_config["fs_args"] = deepcopy(self._fs_args) + + self._filepath_arg = filepath_arg + if self._filepath_arg in self._dataset_config: + warn( + f"'{self._filepath_arg}' key must not be specified in the dataset " + f"definition as it will be overwritten by partition path" + ) + + self._load_args = deepcopy(load_args) or {} + self._sep = self._filesystem.sep + # since some filesystem implementations may implement a global cache + self._invalidate_caches() + + @property + def _filesystem(self): + protocol = "s3" if self._protocol in S3_PROTOCOLS else self._protocol + return fsspec.filesystem(protocol, **self._credentials, **self._fs_args) + + @property + def _normalized_path(self) -> str: + if self._protocol in S3_PROTOCOLS: + return urlparse(self._path)._replace(scheme="s3").geturl() + return self._path + + @cachedmethod(cache=operator.attrgetter("_partition_cache")) + def _list_partitions(self) -> list[str]: + return [ + path + for path in self._filesystem.find(self._normalized_path, **self._load_args) + if path.endswith(self._filename_suffix) + ] + + def _join_protocol(self, path: str) -> str: + protocol_prefix = f"{self._protocol}://" + if self._path.startswith(protocol_prefix) and not path.startswith( + protocol_prefix + ): + return f"{protocol_prefix}{path}" + return path + + def _partition_to_path(self, path: str): + dir_path = self._path.rstrip(self._sep) + path = path.lstrip(self._sep) + full_path = self._sep.join([dir_path, path]) + self._filename_suffix + return full_path + + def _path_to_partition(self, path: str) -> str: + dir_path = self._filesystem._strip_protocol(self._normalized_path) + path = path.split(dir_path, 1).pop().lstrip(self._sep) + if self._filename_suffix and path.endswith(self._filename_suffix): + path = path[: -len(self._filename_suffix)] + return path + + def _load(self) -> dict[str, Callable[[], Any]]: + 
partitions = {} + + for partition in self._list_partitions(): + kwargs = deepcopy(self._dataset_config) + # join the protocol back since PySpark may rely on it + kwargs[self._filepath_arg] = self._join_protocol(partition) + dataset = self._dataset_type(**kwargs) # type: ignore + partition_id = self._path_to_partition(partition) + partitions[partition_id] = dataset.load + + if not partitions: + raise DatasetError(f"No partitions found in '{self._path}'") + + return partitions + + def _save(self, data: dict[str, Any]) -> None: + if self._overwrite and self._filesystem.exists(self._normalized_path): + self._filesystem.rm(self._normalized_path, recursive=True) + + for partition_id, partition_data in sorted(data.items()): + kwargs = deepcopy(self._dataset_config) + partition = self._partition_to_path(partition_id) + # join the protocol back since tools like PySpark may rely on it + kwargs[self._filepath_arg] = self._join_protocol(partition) + dataset = self._dataset_type(**kwargs) # type: ignore + if callable(partition_data): + partition_data = partition_data() # noqa: PLW2901 + dataset.save(partition_data) + self._invalidate_caches() + + def _describe(self) -> dict[str, Any]: + clean_dataset_config = ( + {k: v for k, v in self._dataset_config.items() if k != CREDENTIALS_KEY} + if isinstance(self._dataset_config, dict) + else self._dataset_config + ) + return { + "path": self._path, + "dataset_type": self._dataset_type.__name__, + "dataset_config": clean_dataset_config, + } + + def _invalidate_caches(self) -> None: + self._partition_cache.clear() + self._filesystem.invalidate_cache(self._normalized_path) + + def _exists(self) -> bool: + return bool(self._list_partitions()) + + def _release(self) -> None: + super()._release() + self._invalidate_caches() diff --git a/kedro-datasets/tests/partitions/__init__.py b/kedro-datasets/tests/partitions/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/kedro-datasets/tests/partitions/test_incremental_dataset.py b/kedro-datasets/tests/partitions/test_incremental_dataset.py new file mode 100644 index 000000000..539ab0a66 --- /dev/null +++ b/kedro-datasets/tests/partitions/test_incremental_dataset.py @@ -0,0 +1,508 @@ +from __future__ import annotations + +import os +import re +from pathlib import Path +from typing import Any + +import boto3 +import pandas as pd +import pytest +from kedro.io.core import AbstractDataset, DatasetError +from kedro.io.data_catalog import CREDENTIALS_KEY +from moto import mock_s3 +from pandas.util.testing import assert_frame_equal + +from kedro_datasets.partitions import IncrementalDataset +from kedro_datasets.pickle import PickleDataset +from kedro_datasets.text import TextDataset + +DATASET = "kedro_datasets.pandas.csv_dataset.CSVDataset" + + +@pytest.fixture +def partitioned_data_pandas(): + return { + f"p{counter:02d}/data.csv": pd.DataFrame( + {"part": counter, "col": list(range(counter + 1))} + ) + for counter in range(5) + } + + +@pytest.fixture +def local_csvs(tmp_path, partitioned_data_pandas): + local_dir = Path(tmp_path / "csvs") + local_dir.mkdir() + + for k, data in partitioned_data_pandas.items(): + path = local_dir / k + path.parent.mkdir(parents=True) + data.to_csv(str(path), index=False) + return local_dir + + +class DummyDataset(AbstractDataset): # pragma: no cover + def __init__(self, filepath): + pass + + def _describe(self) -> dict[str, Any]: + return {"dummy": True} + + def _load(self) -> Any: + pass + + def _save(self, data: Any) -> None: + pass + + +def dummy_gt_func(value1: str, 
value2: str): + return value1 > value2 + + +def dummy_lt_func(value1: str, value2: str): + return value1 < value2 + + +class TestIncrementalDatasetLocal: + def test_load_and_confirm(self, local_csvs, partitioned_data_pandas): + """Test the standard flow for loading, confirming and reloading + an IncrementalDataset""" + pds = IncrementalDataset(str(local_csvs), DATASET) + loaded = pds.load() + assert loaded.keys() == partitioned_data_pandas.keys() + for partition_id, data in loaded.items(): + assert_frame_equal(data, partitioned_data_pandas[partition_id]) + + checkpoint_path = local_csvs / pds.DEFAULT_CHECKPOINT_FILENAME + assert not checkpoint_path.exists() + pds.confirm() + assert checkpoint_path.is_file() + assert checkpoint_path.read_text() == pds._read_checkpoint() == "p04/data.csv" + + reloaded = pds.load() + assert reloaded.keys() == loaded.keys() + + pds.release() + reloaded_after_release = pds.load() + assert not reloaded_after_release + + def test_save(self, local_csvs): + """Test saving a new partition into an IncrementalDataset""" + df = pd.DataFrame({"dummy": [1, 2, 3]}) + new_partition_key = "p05/data.csv" + new_partition_path = local_csvs / new_partition_key + pds = IncrementalDataset(str(local_csvs), DATASET) + + assert not new_partition_path.exists() + assert new_partition_key not in pds.load() + + pds.save({new_partition_key: df}) + assert new_partition_path.exists() + loaded = pds.load() + assert_frame_equal(loaded[new_partition_key], df) + + @pytest.mark.parametrize( + "filename_suffix,expected_partitions", + [ + ( + "", + { + "p00/data.csv", + "p01/data.csv", + "p02/data.csv", + "p03/data.csv", + "p04/data.csv", + }, + ), + (".csv", {"p00/data", "p01/data", "p02/data", "p03/data", "p04/data"}), + (".fake", set()), + ], + ) + def test_filename_suffix(self, filename_suffix, expected_partitions, local_csvs): + """Test how specifying filename_suffix affects the available + partitions and their names""" + pds = IncrementalDataset( + str(local_csvs), DATASET, filename_suffix=filename_suffix + ) + loaded = pds.load() + assert loaded.keys() == expected_partitions + + @pytest.mark.parametrize( + "forced_checkpoint,expected_partitions", + [ + ( + "", + { + "p00/data.csv", + "p01/data.csv", + "p02/data.csv", + "p03/data.csv", + "p04/data.csv", + }, + ), + ( + "p00/data.csv", + {"p01/data.csv", "p02/data.csv", "p03/data.csv", "p04/data.csv"}, + ), + ("p03/data.csv", {"p04/data.csv"}), + ], + ) + def test_force_checkpoint_no_checkpoint_file( + self, forced_checkpoint, expected_partitions, local_csvs + ): + """Test how forcing checkpoint value affects the available partitions + if the checkpoint file does not exist""" + pds = IncrementalDataset(str(local_csvs), DATASET, checkpoint=forced_checkpoint) + loaded = pds.load() + assert loaded.keys() == expected_partitions + + confirm_path = local_csvs / pds.DEFAULT_CHECKPOINT_FILENAME + assert not confirm_path.exists() + pds.confirm() + assert confirm_path.is_file() + assert confirm_path.read_text() == max(expected_partitions) + + @pytest.mark.parametrize( + "forced_checkpoint,expected_partitions", + [ + ( + "", + { + "p00/data.csv", + "p01/data.csv", + "p02/data.csv", + "p03/data.csv", + "p04/data.csv", + }, + ), + ( + "p00/data.csv", + {"p01/data.csv", "p02/data.csv", "p03/data.csv", "p04/data.csv"}, + ), + ("p03/data.csv", {"p04/data.csv"}), + ], + ) + def test_force_checkpoint_checkpoint_file_exists( + self, forced_checkpoint, expected_partitions, local_csvs + ): + """Test how forcing checkpoint value affects the available partitions + 
if the checkpoint file exists""" + IncrementalDataset(str(local_csvs), DATASET).confirm() + checkpoint = local_csvs / IncrementalDataset.DEFAULT_CHECKPOINT_FILENAME + assert checkpoint.read_text() == "p04/data.csv" + + pds = IncrementalDataset(str(local_csvs), DATASET, checkpoint=forced_checkpoint) + assert pds._checkpoint.exists() + loaded = pds.load() + assert loaded.keys() == expected_partitions + + @pytest.mark.parametrize( + "forced_checkpoint", ["p04/data.csv", "p10/data.csv", "p100/data.csv"] + ) + def test_force_checkpoint_no_partitions(self, forced_checkpoint, local_csvs): + """Test that forcing the checkpoint to certain values results in no + partitions being returned""" + pds = IncrementalDataset(str(local_csvs), DATASET, checkpoint=forced_checkpoint) + loaded = pds.load() + assert not loaded + + confirm_path = local_csvs / pds.DEFAULT_CHECKPOINT_FILENAME + assert not confirm_path.exists() + pds.confirm() + # confirming with no partitions available must have no effect + assert not confirm_path.exists() + + def test_checkpoint_path(self, local_csvs, partitioned_data_pandas): + """Test configuring a different checkpoint path""" + checkpoint_path = local_csvs / "checkpoint_folder" / "checkpoint_file" + assert not checkpoint_path.exists() + + IncrementalDataset( + str(local_csvs), DATASET, checkpoint={"filepath": str(checkpoint_path)} + ).confirm() + assert checkpoint_path.is_file() + assert checkpoint_path.read_text() == max(partitioned_data_pandas) + + @pytest.mark.parametrize( + "checkpoint_config,expected_checkpoint_class", + [ + (None, TextDataset), + ({"type": "kedro_datasets.pickle.PickleDataset"}, PickleDataset), + ( + {"type": "tests.partitions.test_incremental_dataset.DummyDataset"}, + DummyDataset, + ), + ], + ) + def test_checkpoint_type( + self, tmp_path, checkpoint_config, expected_checkpoint_class + ): + """Test configuring a different checkpoint dataset type""" + pds = IncrementalDataset(str(tmp_path), DATASET, checkpoint=checkpoint_config) + assert isinstance(pds._checkpoint, expected_checkpoint_class) + + @pytest.mark.parametrize( + "checkpoint_config,error_pattern", + [ + ( + {"versioned": True}, + "'IncrementalDataset' does not support versioning " + "of the checkpoint. Please remove 'versioned' key from the " + "checkpoint definition.", + ), + ( + {"version": None}, + "'IncrementalDataset' does not support versioning " + "of the checkpoint. 
Please remove 'version' key from the " + "checkpoint definition.", + ), + ], + ) + def test_version_not_allowed(self, tmp_path, checkpoint_config, error_pattern): + """Test that invalid checkpoint configurations raise expected errors""" + with pytest.raises(DatasetError, match=re.escape(error_pattern)): + IncrementalDataset(str(tmp_path), DATASET, checkpoint=checkpoint_config) + + @pytest.mark.parametrize( + "pds_config,fs_creds,dataset_creds,checkpoint_creds", + [ + ( + {"dataset": DATASET, "credentials": {"cred": "common"}}, + {"cred": "common"}, + {"cred": "common"}, + {"cred": "common"}, + ), + ( + { + "dataset": {"type": DATASET, "credentials": {"ds": "only"}}, + "credentials": {"cred": "common"}, + }, + {"cred": "common"}, + {"ds": "only"}, + {"cred": "common"}, + ), + ( + { + "dataset": DATASET, + "credentials": {"cred": "common"}, + "checkpoint": {"credentials": {"cp": "only"}}, + }, + {"cred": "common"}, + {"cred": "common"}, + {"cp": "only"}, + ), + ( + { + "dataset": {"type": DATASET, "credentials": {"ds": "only"}}, + "checkpoint": {"credentials": {"cp": "only"}}, + }, + {}, + {"ds": "only"}, + {"cp": "only"}, + ), + ( + { + "dataset": {"type": DATASET, "credentials": None}, + "credentials": {"cred": "common"}, + "checkpoint": {"credentials": None}, + }, + {"cred": "common"}, + None, + None, + ), + ], + ) + def test_credentials(self, pds_config, fs_creds, dataset_creds, checkpoint_creds): + """Test correctness of credentials propagation into the dataset and + checkpoint constructors""" + pds = IncrementalDataset(str(Path.cwd()), **pds_config) + assert pds._credentials == fs_creds + assert pds._dataset_config[CREDENTIALS_KEY] == dataset_creds + assert pds._checkpoint_config[CREDENTIALS_KEY] == checkpoint_creds + + @pytest.mark.parametrize( + "comparison_func,expected_partitions", + [ + ( + "tests.partitions.test_incremental_dataset.dummy_gt_func", + {"p03/data.csv", "p04/data.csv"}, + ), + (dummy_gt_func, {"p03/data.csv", "p04/data.csv"}), + ( + "tests.partitions.test_incremental_dataset.dummy_lt_func", + {"p00/data.csv", "p01/data.csv"}, + ), + (dummy_lt_func, {"p00/data.csv", "p01/data.csv"}), + ], + ) + def test_comparison_func(self, comparison_func, expected_partitions, local_csvs): + """Test that specifying a custom function for comparing the checkpoint value + to a partition id results in expected partitions being returned on load""" + checkpoint_config = { + "force_checkpoint": "p02/data.csv", + "comparison_func": comparison_func, + } + pds = IncrementalDataset(str(local_csvs), DATASET, checkpoint=checkpoint_config) + assert pds.load().keys() == expected_partitions + + +BUCKET_NAME = "fake_bucket_name" + + +@pytest.fixture +def mocked_s3_bucket(): + """Create a bucket for testing using moto.""" + with mock_s3(): + conn = boto3.client( + "s3", + aws_access_key_id="fake_access_key", + aws_secret_access_key="fake_secret_key", + ) + conn.create_bucket(Bucket=BUCKET_NAME) + yield conn + + +@pytest.fixture +def mocked_csvs_in_s3(mocked_s3_bucket, partitioned_data_pandas): + prefix = "csvs" + for key, data in partitioned_data_pandas.items(): + mocked_s3_bucket.put_object( + Bucket=BUCKET_NAME, + Key=f"{prefix}/{key}", + Body=data.to_csv(index=False), + ) + return f"s3://{BUCKET_NAME}/{prefix}" + + +class TestIncrementalDatasetS3: + os.environ["AWS_ACCESS_KEY_ID"] = "FAKE_ACCESS_KEY" + os.environ["AWS_SECRET_ACCESS_KEY"] = "FAKE_SECRET_KEY" + + def test_load_and_confirm(self, mocked_csvs_in_s3, partitioned_data_pandas): + """Test the standard flow for loading, confirming and 
reloading + a IncrementalDataset in S3""" + pds = IncrementalDataset(mocked_csvs_in_s3, DATASET) + assert pds._checkpoint._protocol == "s3" + loaded = pds.load() + assert loaded.keys() == partitioned_data_pandas.keys() + for partition_id, data in loaded.items(): + assert_frame_equal(data, partitioned_data_pandas[partition_id]) + + assert not pds._checkpoint.exists() + assert pds._read_checkpoint() is None + pds.confirm() + assert pds._checkpoint.exists() + assert pds._read_checkpoint() == max(partitioned_data_pandas) + + def test_load_and_confirm_s3a( + self, mocked_csvs_in_s3, partitioned_data_pandas, mocker + ): + s3a_path = f"s3a://{mocked_csvs_in_s3.split('://', 1)[1]}" + pds = IncrementalDataset(s3a_path, DATASET) + assert pds._protocol == "s3a" + assert pds._checkpoint._protocol == "s3" + + mocked_ds = mocker.patch.object(pds, "_dataset_type") + mocked_ds.__name__ = "mocked" + loaded = pds.load() + + assert loaded.keys() == partitioned_data_pandas.keys() + assert not pds._checkpoint.exists() + assert pds._read_checkpoint() is None + pds.confirm() + assert pds._checkpoint.exists() + assert pds._read_checkpoint() == max(partitioned_data_pandas) + + @pytest.mark.parametrize( + "forced_checkpoint,expected_partitions", + [ + ( + "", + { + "p00/data.csv", + "p01/data.csv", + "p02/data.csv", + "p03/data.csv", + "p04/data.csv", + }, + ), + ( + "p00/data.csv", + {"p01/data.csv", "p02/data.csv", "p03/data.csv", "p04/data.csv"}, + ), + ("p03/data.csv", {"p04/data.csv"}), + ], + ) + def test_force_checkpoint_no_checkpoint_file( + self, forced_checkpoint, expected_partitions, mocked_csvs_in_s3 + ): + """Test how forcing checkpoint value affects the available partitions + in S3 if the checkpoint file does not exist""" + pds = IncrementalDataset( + mocked_csvs_in_s3, DATASET, checkpoint=forced_checkpoint + ) + loaded = pds.load() + assert loaded.keys() == expected_partitions + + assert not pds._checkpoint.exists() + pds.confirm() + assert pds._checkpoint.exists() + assert pds._checkpoint.load() == max(expected_partitions) + + @pytest.mark.parametrize( + "forced_checkpoint,expected_partitions", + [ + ( + "", + { + "p00/data.csv", + "p01/data.csv", + "p02/data.csv", + "p03/data.csv", + "p04/data.csv", + }, + ), + ( + "p00/data.csv", + {"p01/data.csv", "p02/data.csv", "p03/data.csv", "p04/data.csv"}, + ), + ("p03/data.csv", {"p04/data.csv"}), + ], + ) + def test_force_checkpoint_checkpoint_file_exists( + self, forced_checkpoint, expected_partitions, mocked_csvs_in_s3 + ): + """Test how forcing checkpoint value affects the available partitions + in S3 if the checkpoint file exists""" + # create checkpoint and assert that it exists + IncrementalDataset(mocked_csvs_in_s3, DATASET).confirm() + checkpoint_path = ( + f"{mocked_csvs_in_s3}/{IncrementalDataset.DEFAULT_CHECKPOINT_FILENAME}" + ) + checkpoint_value = TextDataset(checkpoint_path).load() + assert checkpoint_value == "p04/data.csv" + + pds = IncrementalDataset( + mocked_csvs_in_s3, DATASET, checkpoint=forced_checkpoint + ) + assert pds._checkpoint.exists() + loaded = pds.load() + assert loaded.keys() == expected_partitions + + @pytest.mark.parametrize( + "forced_checkpoint", ["p04/data.csv", "p10/data.csv", "p100/data.csv"] + ) + def test_force_checkpoint_no_partitions(self, forced_checkpoint, mocked_csvs_in_s3): + """Test that forcing the checkpoint to certain values results in no + partitions returned from S3""" + pds = IncrementalDataset( + mocked_csvs_in_s3, DATASET, checkpoint=forced_checkpoint + ) + loaded = pds.load() + assert not loaded + + 
assert not pds._checkpoint.exists() + pds.confirm() + # confirming with no partitions available must have no effect + assert not pds._checkpoint.exists() diff --git a/kedro-datasets/tests/partitions/test_partitioned_dataset.py b/kedro-datasets/tests/partitions/test_partitioned_dataset.py new file mode 100644 index 000000000..4feb79ac4 --- /dev/null +++ b/kedro-datasets/tests/partitions/test_partitioned_dataset.py @@ -0,0 +1,540 @@ +import logging +import os +import re +from pathlib import Path + +import boto3 +import pandas as pd +import pytest +import s3fs +from kedro.io import DatasetError +from kedro.io.data_catalog import CREDENTIALS_KEY +from moto import mock_s3 +from pandas.util.testing import assert_frame_equal + +from kedro_datasets.pandas import CSVDataset, ParquetDataset +from kedro_datasets.partitions import PartitionedDataset +from kedro_datasets.partitions.partitioned_dataset import KEY_PROPAGATION_WARNING + + +@pytest.fixture +def partitioned_data_pandas(): + keys = ("p1/data1.csv", "p2.csv", "p1/data2.csv", "p3", "_p4") + return { + k: pd.DataFrame({"part": k, "counter": list(range(counter))}) + for counter, k in enumerate(keys, 1) + } + + +@pytest.fixture +def local_csvs(tmp_path, partitioned_data_pandas): + local_dir = Path(str(tmp_path / "csvs")) + local_dir.mkdir() + + for k, data in partitioned_data_pandas.items(): + path = local_dir / k + path.parent.mkdir(parents=True, exist_ok=True) + data.to_csv(str(path), index=False) + return local_dir + + +LOCAL_DATASET_DEFINITION = [ + "pandas.CSVDataset", + "kedro_datasets.pandas.CSVDataset", + CSVDataset, + {"type": "kedro_datasets.pandas.CSVDataset", "save_args": {"index": False}}, + {"type": CSVDataset}, +] + + +class FakeDataset: # pylint: disable=too-few-public-methods + pass + + +class TestPartitionedDatasetLocal: + @pytest.mark.parametrize("dataset", LOCAL_DATASET_DEFINITION) + @pytest.mark.parametrize( + "suffix,expected_num_parts", [("", 5), (".csv", 3), ("p4", 1)] + ) + def test_load( + self, dataset, local_csvs, partitioned_data_pandas, suffix, expected_num_parts + ): + pds = PartitionedDataset(str(local_csvs), dataset, filename_suffix=suffix) + loaded_partitions = pds.load() + + assert len(loaded_partitions.keys()) == expected_num_parts + for partition_id, load_func in loaded_partitions.items(): + df = load_func() + assert_frame_equal(df, partitioned_data_pandas[partition_id + suffix]) + if suffix: + assert not partition_id.endswith(suffix) + + @pytest.mark.parametrize("dataset", LOCAL_DATASET_DEFINITION) + @pytest.mark.parametrize("suffix", ["", ".csv"]) + def test_save(self, dataset, local_csvs, suffix): + pds = PartitionedDataset(str(local_csvs), dataset, filename_suffix=suffix) + original_data = pd.DataFrame({"foo": 42, "bar": ["a", "b", None]}) + part_id = "new/data" + pds.save({part_id: original_data}) + + assert (local_csvs / "new" / ("data" + suffix)).is_file() + loaded_partitions = pds.load() + assert part_id in loaded_partitions + reloaded_data = loaded_partitions[part_id]() + assert_frame_equal(reloaded_data, original_data) + + @pytest.mark.parametrize("dataset", LOCAL_DATASET_DEFINITION) + @pytest.mark.parametrize("suffix", ["", ".csv"]) + def test_lazy_save(self, dataset, local_csvs, suffix): + pds = PartitionedDataset(str(local_csvs), dataset, filename_suffix=suffix) + + def original_data(): + return pd.DataFrame({"foo": 42, "bar": ["a", "b", None]}) + + part_id = "new/data" + pds.save({part_id: original_data}) + + assert (local_csvs / "new" / ("data" + suffix)).is_file() + loaded_partitions = 
pds.load() + assert part_id in loaded_partitions + reloaded_data = loaded_partitions[part_id]() + assert_frame_equal(reloaded_data, original_data()) + + def test_save_invalidates_cache(self, local_csvs, mocker): + """Test that save calls invalidate partition cache""" + pds = PartitionedDataset(str(local_csvs), "pandas.CSVDataset") + mocked_fs_invalidate = mocker.patch.object(pds._filesystem, "invalidate_cache") + first_load = pds.load() + assert pds._partition_cache.currsize == 1 + mocked_fs_invalidate.assert_not_called() + + # save clears cache + data = pd.DataFrame({"foo": 42, "bar": ["a", "b", None]}) + new_partition = "new/data.csv" + pds.save({new_partition: data}) + assert pds._partition_cache.currsize == 0 + # it seems that `_filesystem.invalidate_cache` calls itself inside, + # resulting in not one, but 2 mock calls + # hence using `assert_any_call` instead of `assert_called_once_with` + mocked_fs_invalidate.assert_any_call(pds._normalized_path) + + # new load returns new partition too + second_load = pds.load() + assert new_partition not in first_load + assert new_partition in second_load + + @pytest.mark.parametrize("overwrite,expected_num_parts", [(False, 6), (True, 1)]) + def test_overwrite(self, local_csvs, overwrite, expected_num_parts): + pds = PartitionedDataset( + str(local_csvs), "pandas.CSVDataset", overwrite=overwrite + ) + original_data = pd.DataFrame({"foo": 42, "bar": ["a", "b", None]}) + part_id = "new/data" + pds.save({part_id: original_data}) + loaded_partitions = pds.load() + + assert part_id in loaded_partitions + assert len(loaded_partitions.keys()) == expected_num_parts + + def test_release_instance_cache(self, local_csvs): + """Test that cache invalidation does not affect other instances""" + ds_a = PartitionedDataset(str(local_csvs), "pandas.CSVDataset") + ds_a.load() + ds_b = PartitionedDataset(str(local_csvs), "pandas.CSVDataset") + ds_b.load() + + assert ds_a._partition_cache.currsize == 1 + assert ds_b._partition_cache.currsize == 1 + + # invalidate cache of the dataset A + ds_a.release() + assert ds_a._partition_cache.currsize == 0 + # cache of the dataset B is unaffected + assert ds_b._partition_cache.currsize == 1 + + @pytest.mark.parametrize("dataset", ["pandas.CSVDataset", "pandas.ParquetDataset"]) + def test_exists(self, local_csvs, dataset): + assert PartitionedDataset(str(local_csvs), dataset).exists() + + empty_folder = local_csvs / "empty" / "folder" + assert not PartitionedDataset(str(empty_folder), dataset).exists() + empty_folder.mkdir(parents=True) + assert not PartitionedDataset(str(empty_folder), dataset).exists() + + @pytest.mark.parametrize("dataset", LOCAL_DATASET_DEFINITION) + def test_release(self, dataset, local_csvs): + partition_to_remove = "p2.csv" + pds = PartitionedDataset(str(local_csvs), dataset) + initial_load = pds.load() + assert partition_to_remove in initial_load + + (local_csvs / partition_to_remove).unlink() + cached_load = pds.load() + assert initial_load.keys() == cached_load.keys() + + pds.release() + load_after_release = pds.load() + assert initial_load.keys() ^ load_after_release.keys() == {partition_to_remove} + + @pytest.mark.parametrize("dataset", LOCAL_DATASET_DEFINITION) + def test_describe(self, dataset): + path = str(Path.cwd()) + pds = PartitionedDataset(path, dataset) + + assert f"path={path}" in str(pds) + assert "dataset_type=CSVDataset" in str(pds) + assert "dataset_config" in str(pds) + + def test_load_args(self, mocker): + fake_partition_name = "fake_partition" + mocked_filesystem = 
mocker.patch("fsspec.filesystem") + mocked_find = mocked_filesystem.return_value.find + mocked_find.return_value = [fake_partition_name] + + path = str(Path.cwd()) + load_args = {"maxdepth": 42, "withdirs": True} + pds = PartitionedDataset(path, "pandas.CSVDataset", load_args=load_args) + mocker.patch.object(pds, "_path_to_partition", return_value=fake_partition_name) + + assert pds.load().keys() == {fake_partition_name} + mocked_find.assert_called_once_with(path, **load_args) + + @pytest.mark.parametrize( + "credentials,expected_pds_creds,expected_dataset_creds", + [({"cred": "common"}, {"cred": "common"}, {"cred": "common"}), (None, {}, {})], + ) + def test_credentials( + self, mocker, credentials, expected_pds_creds, expected_dataset_creds + ): + mocked_filesystem = mocker.patch("fsspec.filesystem") + path = str(Path.cwd()) + pds = PartitionedDataset(path, "pandas.CSVDataset", credentials=credentials) + + assert mocked_filesystem.call_count == 2 + mocked_filesystem.assert_called_with("file", **expected_pds_creds) + if expected_dataset_creds: + assert pds._dataset_config[CREDENTIALS_KEY] == expected_dataset_creds + else: + assert CREDENTIALS_KEY not in pds._dataset_config + + str_repr = str(pds) + + def _assert_not_in_repr(value): + if isinstance(value, dict): + for k_, v_ in value.items(): + _assert_not_in_repr(k_) + _assert_not_in_repr(v_) + if value is not None: + assert str(value) not in str_repr + + _assert_not_in_repr(credentials) + + def test_fs_args(self, mocker): + fs_args = {"foo": "bar"} + + mocked_filesystem = mocker.patch("fsspec.filesystem") + path = str(Path.cwd()) + pds = PartitionedDataset(path, "pandas.CSVDataset", fs_args=fs_args) + + assert mocked_filesystem.call_count == 2 + mocked_filesystem.assert_called_with("file", **fs_args) + assert pds._dataset_config["fs_args"] == fs_args + + @pytest.mark.parametrize("dataset", ["pandas.ParquetDataset", ParquetDataset]) + def test_invalid_dataset(self, dataset, local_csvs): + pds = PartitionedDataset(str(local_csvs), dataset) + loaded_partitions = pds.load() + + for partition, df_loader in loaded_partitions.items(): + pattern = r"Failed while loading data from data set ParquetDataset(.*)" + with pytest.raises(DatasetError, match=pattern) as exc_info: + df_loader() + error_message = str(exc_info.value) + assert ( + "Either the file is corrupted or this is not a parquet file" + in error_message + ) + assert str(partition) in error_message + + @pytest.mark.parametrize( + "dataset_config,error_pattern", + [ + ("UndefinedDatasetType", "Class 'UndefinedDatasetType' not found"), + ( + "missing.module.UndefinedDatasetType", + r"Class 'missing\.module\.UndefinedDatasetType' not found", + ), + ( + FakeDataset, + r"Dataset type 'tests\.partitions\.test_partitioned_dataset\.FakeDataset' " + r"is invalid\: all data set types must extend 'AbstractDataset'", + ), + ({}, "'type' is missing from dataset catalog configuration"), + ], + ) + def test_invalid_dataset_config(self, dataset_config, error_pattern): + with pytest.raises(DatasetError, match=error_pattern): + PartitionedDataset(str(Path.cwd()), dataset_config) + + @pytest.mark.parametrize( + "dataset_config", + [ + {"type": CSVDataset, "versioned": True}, + {"type": "pandas.CSVDataset", "versioned": True}, + ], + ) + def test_versioned_dataset_not_allowed(self, dataset_config): + pattern = ( + "'PartitionedDataset' does not support versioning of the underlying " + "dataset. Please remove 'versioned' flag from the dataset definition." 
+ ) + with pytest.raises(DatasetError, match=re.escape(pattern)): + PartitionedDataset(str(Path.cwd()), dataset_config) + + def test_no_partitions(self, tmpdir): + pds = PartitionedDataset(str(tmpdir), "pandas.CSVDataset") + + pattern = re.escape(f"No partitions found in '{tmpdir}'") + with pytest.raises(DatasetError, match=pattern): + pds.load() + + @pytest.mark.parametrize( + "pds_config,filepath_arg", + [ + ( + { + "path": str(Path.cwd()), + "dataset": {"type": CSVDataset, "filepath": "fake_path"}, + }, + "filepath", + ), + ( + { + "path": str(Path.cwd()), + "dataset": {"type": CSVDataset, "other_arg": "fake_path"}, + "filepath_arg": "other_arg", + }, + "other_arg", + ), + ], + ) + def test_filepath_arg_warning(self, pds_config, filepath_arg): + pattern = ( + f"'{filepath_arg}' key must not be specified in the dataset definition as it " + f"will be overwritten by partition path" + ) + with pytest.warns(UserWarning, match=re.escape(pattern)): + PartitionedDataset(**pds_config) + + def test_credentials_log_warning(self, caplog): + """Check that the warning is logged if the dataset credentials will overwrite + the top-level ones""" + pds = PartitionedDataset( + path=str(Path.cwd()), + dataset={"type": CSVDataset, "credentials": {"secret": "dataset"}}, + credentials={"secret": "global"}, + ) + log_message = KEY_PROPAGATION_WARNING % { + "keys": "credentials", + "target": "underlying dataset", + } + assert caplog.record_tuples == [("kedro.io.core", logging.WARNING, log_message)] + assert pds._dataset_config["credentials"] == {"secret": "dataset"} + + def test_fs_args_log_warning(self, caplog): + """Check that the warning is logged if the dataset filesystem + arguments will overwrite the top-level ones""" + pds = PartitionedDataset( + path=str(Path.cwd()), + dataset={"type": CSVDataset, "fs_args": {"args": "dataset"}}, + fs_args={"args": "dataset"}, + ) + log_message = KEY_PROPAGATION_WARNING % { + "keys": "filesystem arguments", + "target": "underlying dataset", + } + assert caplog.record_tuples == [("kedro.io.core", logging.WARNING, log_message)] + assert pds._dataset_config["fs_args"] == {"args": "dataset"} + + @pytest.mark.parametrize( + "pds_config,expected_ds_creds,global_creds", + [ + ( + {"dataset": "pandas.CSVDataset", "credentials": {"secret": "global"}}, + {"secret": "global"}, + {"secret": "global"}, + ), + ( + { + "dataset": { + "type": CSVDataset, + "credentials": {"secret": "expected"}, + }, + }, + {"secret": "expected"}, + {}, + ), + ( + { + "dataset": {"type": CSVDataset, "credentials": None}, + "credentials": {"secret": "global"}, + }, + None, + {"secret": "global"}, + ), + ( + { + "dataset": { + "type": CSVDataset, + "credentials": {"secret": "expected"}, + }, + "credentials": {"secret": "global"}, + }, + {"secret": "expected"}, + {"secret": "global"}, + ), + ], + ) + def test_dataset_creds(self, pds_config, expected_ds_creds, global_creds): + """Check that global credentials do not interfere dataset credentials.""" + pds = PartitionedDataset(path=str(Path.cwd()), **pds_config) + assert pds._dataset_config["credentials"] == expected_ds_creds + assert pds._credentials == global_creds + + +BUCKET_NAME = "fake_bucket_name" +S3_DATASET_DEFINITION = [ + "pandas.CSVDataset", + "kedro_datasets.pandas.CSVDataset", + CSVDataset, + {"type": "kedro_datasets.pandas.CSVDataset", "save_args": {"index": False}}, + {"type": CSVDataset}, +] + + +@pytest.fixture +def mocked_s3_bucket(): + """Create a bucket for testing using moto.""" + with mock_s3(): + conn = boto3.client( + "s3", + 
aws_access_key_id="fake_access_key", + aws_secret_access_key="fake_secret_key", + ) + conn.create_bucket(Bucket=BUCKET_NAME) + yield conn + + +@pytest.fixture +def mocked_csvs_in_s3(mocked_s3_bucket, partitioned_data_pandas): + prefix = "csvs" + for key, data in partitioned_data_pandas.items(): + mocked_s3_bucket.put_object( + Bucket=BUCKET_NAME, + Key=f"{prefix}/{key}", + Body=data.to_csv(index=False), + ) + return f"s3://{BUCKET_NAME}/{prefix}" + + +class TestPartitionedDatasetS3: + os.environ["AWS_ACCESS_KEY_ID"] = "FAKE_ACCESS_KEY" + os.environ["AWS_SECRET_ACCESS_KEY"] = "FAKE_SECRET_KEY" + + @pytest.mark.parametrize("dataset", S3_DATASET_DEFINITION) + def test_load(self, dataset, mocked_csvs_in_s3, partitioned_data_pandas): + pds = PartitionedDataset(mocked_csvs_in_s3, dataset) + loaded_partitions = pds.load() + + assert loaded_partitions.keys() == partitioned_data_pandas.keys() + for partition_id, load_func in loaded_partitions.items(): + df = load_func() + assert_frame_equal(df, partitioned_data_pandas[partition_id]) + + def test_load_s3a(self, mocked_csvs_in_s3, partitioned_data_pandas, mocker): + path = mocked_csvs_in_s3.split("://", 1)[1] + s3a_path = f"s3a://{path}" + # any type is fine as long as it passes isinstance check + # since _dataset_type is mocked later anyways + pds = PartitionedDataset(s3a_path, "pandas.CSVDataset") + assert pds._protocol == "s3a" + + mocked_ds = mocker.patch.object(pds, "_dataset_type") + mocked_ds.__name__ = "mocked" + loaded_partitions = pds.load() + + assert loaded_partitions.keys() == partitioned_data_pandas.keys() + assert mocked_ds.call_count == len(loaded_partitions) + expected = [ + mocker.call(filepath=f"{s3a_path}/{partition_id}") + for partition_id in loaded_partitions + ] + mocked_ds.assert_has_calls(expected, any_order=True) + + @pytest.mark.parametrize("dataset", S3_DATASET_DEFINITION) + def test_save(self, dataset, mocked_csvs_in_s3): + pds = PartitionedDataset(mocked_csvs_in_s3, dataset) + original_data = pd.DataFrame({"foo": 42, "bar": ["a", "b", None]}) + part_id = "new/data.csv" + pds.save({part_id: original_data}) + + s3 = s3fs.S3FileSystem() + assert s3.exists("/".join([mocked_csvs_in_s3, part_id])) + + loaded_partitions = pds.load() + assert part_id in loaded_partitions + reloaded_data = loaded_partitions[part_id]() + assert_frame_equal(reloaded_data, original_data) + + def test_save_s3a(self, mocked_csvs_in_s3, mocker): + """Test that save works in case of s3a protocol""" + path = mocked_csvs_in_s3.split("://", 1)[1] + s3a_path = f"s3a://{path}" + # any type is fine as long as it passes isinstance check + # since _dataset_type is mocked later anyways + pds = PartitionedDataset(s3a_path, "pandas.CSVDataset", filename_suffix=".csv") + assert pds._protocol == "s3a" + + mocked_ds = mocker.patch.object(pds, "_dataset_type") + mocked_ds.__name__ = "mocked" + new_partition = "new/data" + data = "data" + + pds.save({new_partition: data}) + mocked_ds.assert_called_once_with(filepath=f"{s3a_path}/{new_partition}.csv") + mocked_ds.return_value.save.assert_called_once_with(data) + + @pytest.mark.parametrize("dataset", ["pandas.CSVDataset", "pandas.HDFDataset"]) + def test_exists(self, dataset, mocked_csvs_in_s3): + assert PartitionedDataset(mocked_csvs_in_s3, dataset).exists() + + empty_folder = "/".join([mocked_csvs_in_s3, "empty", "folder"]) + assert not PartitionedDataset(empty_folder, dataset).exists() + + s3fs.S3FileSystem().mkdir(empty_folder) + assert not PartitionedDataset(empty_folder, dataset).exists() + + 
@pytest.mark.parametrize("dataset", S3_DATASET_DEFINITION) + def test_release(self, dataset, mocked_csvs_in_s3): + partition_to_remove = "p2.csv" + pds = PartitionedDataset(mocked_csvs_in_s3, dataset) + initial_load = pds.load() + assert partition_to_remove in initial_load + + s3 = s3fs.S3FileSystem() + s3.rm("/".join([mocked_csvs_in_s3, partition_to_remove])) + cached_load = pds.load() + assert initial_load.keys() == cached_load.keys() + + pds.release() + load_after_release = pds.load() + assert initial_load.keys() ^ load_after_release.keys() == {partition_to_remove} + + @pytest.mark.parametrize("dataset", S3_DATASET_DEFINITION) + def test_describe(self, dataset): + path = f"s3://{BUCKET_NAME}/foo/bar" + pds = PartitionedDataset(path, dataset) + + assert f"path={path}" in str(pds) + assert "dataset_type=CSVDataset" in str(pds) + assert "dataset_config" in str(pds) From cc75b4017550467c345a19fbdd797c6e77d28a31 Mon Sep 17 00:00:00 2001 From: Simon Brugman Date: Thu, 12 Oct 2023 17:25:42 +0200 Subject: [PATCH 15/15] fix: backwards compatibility for `kedro-airflow` (#381) Signed-off-by: Simon Brugman --- kedro-airflow/RELEASE.md | 1 + kedro-airflow/kedro_airflow/plugin.py | 13 +++++++++---- kedro-airflow/tests/test_plugin.py | 24 +++++++++++++++++++++++- 3 files changed, 33 insertions(+), 5 deletions(-) diff --git a/kedro-airflow/RELEASE.md b/kedro-airflow/RELEASE.md index 32f705069..e7ab78695 100755 --- a/kedro-airflow/RELEASE.md +++ b/kedro-airflow/RELEASE.md @@ -2,6 +2,7 @@ * Added support for Python 3.11 * Added the `--all` CLI argument to `kedro-airflow` to convert registered all pipelines at once. * Simplify the output of the `kedro airflow create` command. +* Fixed compatibility of `kedro-airflow` with older versions of the config loaders (`kedro<=0.18.2`). 
## Community contributions Many thanks to the following Kedroids for contributing PRs to this release: diff --git a/kedro-airflow/kedro_airflow/plugin.py b/kedro-airflow/kedro_airflow/plugin.py index ba998dabc..cb20a9d38 100644 --- a/kedro-airflow/kedro_airflow/plugin.py +++ b/kedro-airflow/kedro_airflow/plugin.py @@ -36,17 +36,22 @@ def airflow_commands(): def _load_config(context: KedroContext) -> dict[str, Any]: + # Backwards compatibility for ConfigLoader that does not support `config_patterns` + config_loader = context.config_loader + if not hasattr(config_loader, "config_patterns"): + return config_loader.get("airflow*", "airflow/**") + # Set the default pattern for `airflow` if not provided in `settings.py` - if "airflow" not in context.config_loader.config_patterns.keys(): - context.config_loader.config_patterns.update( # pragma: no cover + if "airflow" not in config_loader.config_patterns.keys(): + config_loader.config_patterns.update( # pragma: no cover {"airflow": ["airflow*", "airflow/**"]} ) - assert "airflow" in context.config_loader.config_patterns.keys() + assert "airflow" in config_loader.config_patterns.keys() # Load the config try: - return context.config_loader["airflow"] + return config_loader["airflow"] except MissingConfigException: # File does not exist return {} diff --git a/kedro-airflow/tests/test_plugin.py b/kedro-airflow/tests/test_plugin.py index 1d282f0c3..4c11efd22 100644 --- a/kedro-airflow/tests/test_plugin.py +++ b/kedro-airflow/tests/test_plugin.py @@ -5,8 +5,11 @@ import pytest import yaml +from kedro.config import ConfigLoader +from kedro.framework.context import KedroContext +from pluggy import PluginManager -from kedro_airflow.plugin import commands +from kedro_airflow.plugin import _load_config, commands @pytest.mark.parametrize( @@ -264,3 +267,22 @@ def test_create_airflow_all_and_pipeline(cli_runner, metadata): "Error: Invalid value: The `--all` and `--pipeline` option are mutually exclusive." in result.stdout ) + + +def test_config_loader_backwards_compatibility(cli_runner, metadata): + # Emulate ConfigLoader in kedro <= 0.18.2 + conf_source = Path.cwd() / "conf" + config_loader = ConfigLoader(conf_source=conf_source) + del config_loader.config_patterns + context = KedroContext( + config_loader=config_loader, + hook_manager=PluginManager(project_name=metadata.project_name), + package_name=metadata.package_name, + project_path=metadata.project_path, + ) + + config = _load_config(context) + assert config == { + "default": {"owner": "again someone else"}, + "ds": {"owner": "finally someone else"}, + }
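
The backwards-compatibility fix in this last commit reduces to a capability check on the loader object rather than a Kedro version check: if the loader exposes no `config_patterns` attribute it is the older `ConfigLoader` (`kedro<=0.18.2`), so the legacy `get("airflow*", "airflow/**")` call is used instead of dictionary-style access. Below is a minimal standalone sketch of that pattern, not part of the patch itself; `load_airflow_config` is a hypothetical helper name used only for illustration and does not exist in `kedro_airflow.plugin`.

from __future__ import annotations

from typing import Any

from kedro.config import MissingConfigException


def load_airflow_config(config_loader: Any) -> dict[str, Any]:
    """Load the `airflow` config group from either an old or a new config loader."""
    if not hasattr(config_loader, "config_patterns"):
        # kedro<=0.18.2 ConfigLoader: glob patterns are passed directly to `get`.
        return config_loader.get("airflow*", "airflow/**")

    # Newer loaders: register a default pattern once, then use dict-style access.
    config_loader.config_patterns.setdefault("airflow", ["airflow*", "airflow/**"])
    try:
        return config_loader["airflow"]
    except MissingConfigException:
        # No airflow* config files exist in the environment.
        return {}

Checking `hasattr(..., "config_patterns")` rather than comparing Kedro versions keeps the plugin decoupled from specific loader classes, which is also why the test added above can emulate the old behaviour simply by deleting the attribute from a `ConfigLoader` instance.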