Skip to content

Commit

Permalink
Add the viz add-ons to kedro new (#3228)
Browse files Browse the repository at this point in the history
* adapt pyspark add-ons

Signed-off-by: Nok <nok.lam.chan@quantumblack.com>

* fix prompt

Signed-off-by: Nok <nok.lam.chan@quantumblack.com>

* fix minor stuff in prompt

Signed-off-by: Nok <nok.lam.chan@quantumblack.com>

* refactor utils and fix template switch to starters

Signed-off-by: SajidAlamQB <90610031+SajidAlamQB@users.noreply.github.com>

* lint

Signed-off-by: SajidAlamQB <90610031+SajidAlamQB@users.noreply.github.com>

* Refactor utils.py use toml and unstrip pyproject.toml and requirements.txt

Signed-off-by: SajidAlamQB <90610031+SajidAlamQB@users.noreply.github.com>

* fix broken test missing options 7

Signed-off-by: Nok <nok.lam.chan@quantumblack.com>

* add more test

Signed-off-by: Nok <nok.lam.chan@quantumblack.com>

* fix test

Signed-off-by: Nok <nok.lam.chan@quantumblack.com>

* attempt to fix tests

Signed-off-by: SajidAlamQB <90610031+SajidAlamQB@users.noreply.github.com>

* fix test

Signed-off-by: Nok <nok.lam.chan@quantumblack.com>

* remove reporting when viz selected

Signed-off-by: SajidAlamQB <90610031+SajidAlamQB@users.noreply.github.com>

* update tests for viz add-on

Signed-off-by: SajidAlamQB <90610031+SajidAlamQB@users.noreply.github.com>

* remove reporting.yml and update tests

Signed-off-by: SajidAlamQB <90610031+SajidAlamQB@users.noreply.github.com>

* add comment to _get_expected_files number counts

Signed-off-by: SajidAlamQB <90610031+SajidAlamQB@users.noreply.github.com>

* Update tests to parse pyproject.toml (#3230)

Signed-off-by: Nok <nok.lam.chan@quantumblack.com>
Signed-off-by: Nok Lam Chan <nok.lam.chan@quantumblack.com>

* Add .gitkeep for conf/local/

Signed-off-by: SajidAlamQB <90610031+SajidAlamQB@users.noreply.github.com>

* Fix tests

Signed-off-by: SajidAlamQB <90610031+SajidAlamQB@users.noreply.github.com>

* add comments for more context

Signed-off-by: SajidAlamQB <90610031+SajidAlamQB@users.noreply.github.com>

* More descriptive variable names

Signed-off-by: SajidAlamQB <90610031+SajidAlamQB@users.noreply.github.com>

* changes based on review - use patterns to remove parameter files

Signed-off-by: SajidAlamQB <90610031+SajidAlamQB@users.noreply.github.com>

* changes based on review cleaner template switching with more comments

Signed-off-by: SajidAlamQB <90610031+SajidAlamQB@users.noreply.github.com>

* add full stop to comment (nit)

Signed-off-by: SajidAlamQB <90610031+SajidAlamQB@users.noreply.github.com>

* Replace anyconfig with toml for tests

Signed-off-by: Nok <nok.lam.chan@quantumblack.com>

* add docstrings

Signed-off-by: Nok Chan <nok.lam.chan@quantumblack.com>

* add docstrings to helper methods and make them private

Signed-off-by: SajidAlamQB <90610031+SajidAlamQB@users.noreply.github.com>

* update docstring

Signed-off-by: SajidAlamQB <90610031+SajidAlamQB@users.noreply.github.com>

* changes based on review

Signed-off-by: SajidAlamQB <90610031+SajidAlamQB@users.noreply.github.com>

* lint

Signed-off-by: Nok Chan <nok.lam.chan@quantumblack.com>

---------

Signed-off-by: Nok <nok.lam.chan@quantumblack.com>
Signed-off-by: Nok Lam Chan <nok.lam.chan@quantumblack.com>
Signed-off-by: SajidAlamQB <90610031+SajidAlamQB@users.noreply.github.com>
Signed-off-by: Sajid Alam <90610031+SajidAlamQB@users.noreply.github.com>
Signed-off-by: Nok Chan <nok.lam.chan@quantumblack.com>
Co-authored-by: SajidAlamQB <90610031+SajidAlamQB@users.noreply.github.com>
  • Loading branch information
noklam and SajidAlamQB authored Nov 3, 2023
1 parent 2ed06f5 commit 93dc1a9
Show file tree
Hide file tree
Showing 7 changed files with 308 additions and 214 deletions.
40 changes: 27 additions & 13 deletions kedro/framework/cli/starters.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,8 @@ class KedroStarterSpec: # noqa: too-few-public-methods
3) Custom Logging: Provides more logging options\n
4) Documentation: Basic documentation setup with Sphinx\n
5) Data Structure: Provides a directory structure for storing data\n
6) Pyspark: Provides a basic PySpark set up\n
6) Pyspark: Provides set up configuration for working with PySpark\n
7) Kedro Viz: Provides Kedro's native visualisation tool \n
Example usage:\n
kedro new --addons=lint,test,log,docs,data,pyspark (or any subset of these options)\n
Expand All @@ -123,6 +124,7 @@ class KedroStarterSpec: # noqa: too-few-public-methods
"docs": "4",
"data": "5",
"pyspark": "6",
"viz": "7",
}
NUMBER_TO_ADD_ONS_NAME = {
"1": "Linting",
Expand All @@ -131,6 +133,7 @@ class KedroStarterSpec: # noqa: too-few-public-methods
"4": "Documentation",
"5": "Data Structure",
"6": "Pyspark",
"7": "Kedro Viz",
}


Expand Down Expand Up @@ -223,7 +226,7 @@ def _validate_range(start, end):
def _validate_selection(add_ons: list[str]):
for add_on in add_ons:
if add_on not in NUMBER_TO_ADD_ONS_NAME:
message = f"'{add_on}' is not a valid selection.\nPlease select from the available add-ons: 1, 2, 3, 4, 5, 6." # nosec
message = f"'{add_on}' is not a valid selection.\nPlease select from the available add-ons: 1, 2, 3, 4, 5, 6, 7." # nosec
click.secho(message, fg="red", err=True)
sys.exit(1)

Expand Down Expand Up @@ -475,7 +478,7 @@ def _select_prompts_to_display(
for addon in addons:
if addon not in valid_addons:
click.secho(
"Please select from the available add-ons: lint, test, log, docs, data, pyspark, all, none",
"Please select from the available add-ons: lint, test, log, docs, data, pyspark, viz, all, none",
fg="red",
err=True,
)
Expand Down Expand Up @@ -581,11 +584,24 @@ def _make_cookiecutter_args(
def fetch_template_based_on_add_ons(template_path, cookiecutter_args: dict[str, Any]):
extra_context = cookiecutter_args["extra_context"]
add_ons = extra_context.get("add_ons")
if add_ons and "Pyspark" in add_ons:
cookiecutter_args["directory"] = "spaceflights-pyspark"
pyspark_path = "git+https://github.com/kedro-org/kedro-starters.git"
return pyspark_path
return template_path
starter_path = "git+https://github.com/kedro-org/kedro-starters.git"
if add_ons:
if "Pyspark" in add_ons and "Kedro Viz" in add_ons:
# Use the spaceflights-pyspark-viz starter if both Pyspark and Kedro Viz are chosen.
cookiecutter_args["directory"] = "spaceflights-pyspark-viz"
elif "Pyspark" in add_ons:
# Use the spaceflights-pyspark starter if only Pyspark is chosen.
cookiecutter_args["directory"] = "spaceflights-pyspark"
elif "Kedro Viz" in add_ons:
# Use the spaceflights-pandas-viz starter if only Kedro Viz is chosen.
cookiecutter_args["directory"] = "spaceflights-pandas-viz"
else:
# Use the default template path for any other combinations or if "none" is chosen.
starter_path = template_path
else:
# Use the default template path if add_ons is None, which can occur if there is no prompts.yml or its empty.
starter_path = template_path
return starter_path


def _create_project(template_path: str, cookiecutter_args: dict[str, Any]):
Expand Down Expand Up @@ -619,11 +635,9 @@ def _create_project(template_path: str, cookiecutter_args: dict[str, Any]):
)
add_ons = extra_context.get("add_ons")

# Only core template and spaceflights-pyspark have configurable add-ons
if (
template_path == str(TEMPLATE_PATH)
or add_ons is not None
and "Pyspark" in add_ons
# Only core template and spaceflight starters have configurable add-ons
if template_path == str(TEMPLATE_PATH) or (
add_ons and ("Pyspark" in add_ons or "Kedro Viz" in add_ons)
):
if add_ons == "[]": # TODO: This should be a list
click.secho("\nYou have selected no add-ons")
Expand Down
285 changes: 167 additions & 118 deletions kedro/templates/project/hooks/utils.py
Original file line number Diff line number Diff line change
@@ -1,140 +1,189 @@
from pathlib import Path
import shutil
import sys
import click
import toml

current_dir = Path.cwd()

lint_requirements = "black~=22.12.0\nruff~=0.0.290\n"
lint_pyproject_requirements = """
[tool.ruff]
select = [
"F", # Pyflakes
"E", # Pycodestyle
"W", # Pycodestyle
"UP", # pyupgrade
"I", # isort
"PL", # Pylint
]
ignore = ["E501"] # Black takes care of line-too-long
"""

test_requirements = "pytest-cov~=3.0\npytest-mock>=1.7.1, <2.0\npytest~=7.2"
test_pyproject_requirements = """
[tool.pytest.ini_options]
addopts = \"\"\"
--cov-report term-missing \\
--cov src/{{ cookiecutter.python_package }} -ra
\"\"\"
[tool.coverage.report]
fail_under = 0
show_missing = true
exclude_lines = ["pragma: no cover", "raise NotImplementedError"]
"""

docs_pyproject_requirements = """
[project.optional-dependencies]
docs = [
"docutils<0.18.0",
"sphinx~=3.4.3",
"sphinx_rtd_theme==0.5.1",
"nbsphinx==0.8.1",
"sphinx-autodoc-typehints==1.11.1",
"sphinx_copybutton==0.3.1",
"ipykernel>=5.3, <7.0",
"Jinja2<3.1.0",
"myst-parser~=0.17.2",
]
"""


def setup_template_add_ons(selected_add_ons_list, requirements_file_path, pyproject_file_path, python_package_name):
"""Removes directories and files related to unwanted addons from
a Kedro project template. Adds the necessary requirements for
the addons that were selected.
# Requirements for linting tools
lint_requirements = "black~=22.0\nruff~=0.0.290\n" # For requirements.txt
lint_pyproject_requirements = ["tool.ruff"] # For pyproject.toml

# Requirements and configurations for testing tools and coverage reporting
test_requirements = "pytest-cov~=3.0\npytest-mock>=1.7.1, <2.0\npytest~=7.2" # For requirements.txt
test_pyproject_requirements = ["tool.pytest.ini_options", "tool.coverage.report"] # For pyproject.toml

# Configuration key for documentation dependencies
docs_pyproject_requirements = ["project.optional-dependencies"] # For pyproject.toml


# Helper Functions
def _remove_from_file(file_path: Path, content_to_remove: str) -> None:
"""Remove specified content from the file.
Args:
selected_add_ons_list: a list containing numbers from 1 to 5,
representing specific add-ons.
requirements_file_path: the path to the requirements.txt file.
pyproject_file_path: the path to the pyproject.toml file
located on the the root of the template.
file_path (Path): The path of the file from which to remove content.
content_to_remove (str): The content to be removed from the file.
"""
if "Linting" not in selected_add_ons_list:
pass
with open(file_path, 'r') as file:
lines = file.readlines()

# Split the content to remove into lines and remove trailing whitespaces/newlines
content_to_remove_lines = [line.strip() for line in content_to_remove.split('\n')]

# Keep lines that are not in content_to_remove
lines = [line for line in lines if line.strip() not in content_to_remove_lines]

with open(file_path, 'w') as file:
file.writelines(lines)


def _remove_nested_section(data: dict, nested_key: str) -> None:
"""Remove a nested section from a dictionary representing a TOML file.
Args:
data (dict): The dictionary from which to remove the section.
nested_key (str): The dotted path key representing the nested section to remove.
"""
keys = nested_key.split('.')
current_data = data
# Look for Parent section
for key in keys[:-1]: # Iterate over all but last element
if key in current_data:
current_data = current_data[key]
else:
return # Parent section not found, nothing to remove

# Remove the nested section and any empty parent sections
current_data.pop(keys[-1], None) # Remove last element otherwise return None
for key in reversed(keys[:-1]):
parent_section = data
for key_part in keys[:keys.index(key)]:
parent_section = parent_section[key_part]
if not current_data: # If the section is empty, remove it
parent_section.pop(key, None)
current_data = parent_section
else:
break # If the section is not empty, stop removing


def _remove_from_toml(file_path: Path, sections_to_remove: list) -> None:
    """Remove specified sections from a TOML file.

    The file is parsed, every listed section is dropped from the parsed data
    with ``_remove_nested_section``, and the file is then rewritten from that
    data.

    Args:
        file_path (Path): The path to the TOML file.
        sections_to_remove (list): A list of dotted section keys to remove
            from the TOML file.
    """
    # TOML documents are UTF-8 by specification, so read and write with an
    # explicit encoding instead of the platform's locale default.
    with open(file_path, "r", encoding="utf-8") as file:
        data = toml.load(file)

    # Remove the specified sections
    for section in sections_to_remove:
        _remove_nested_section(data, section)

    with open(file_path, "w", encoding="utf-8") as file:
        toml.dump(data, file)


def _remove_dir(path: Path) -> None:
"""Remove a directory if it exists.
Args:
path (Path): The path of the directory to remove.
"""
if path.exists():
shutil.rmtree(str(path))


def _remove_file(path: Path) -> None:
"""Remove a file if it exists.
Args:
path (Path): The path of the file to remove.
"""
if path.exists():
path.unlink()


def _handle_starter_setup(selected_add_ons_list: str, python_package_name: str) -> None:
"""Clean up the unnecessary files in the starters template.
Args:
selected_add_ons_list (str): A string contains the selected add-ons.
python_package_name (str): The name of the python package.
"""
# Remove all .csv and .xlsx files from data/01_raw/
raw_data_path = current_dir / "data/01_raw/"
for file_path in raw_data_path.glob("*.*"):
if file_path.suffix in [".csv", ".xlsx"]:
file_path.unlink()

# Empty the contents of conf/base/catalog.yml
catalog_yml_path = current_dir / "conf/base/catalog.yml"
if catalog_yml_path.exists():
catalog_yml_path.write_text('')
# Remove parameter files from conf/base
conf_base_path = current_dir / "conf/base/"
parameter_file_patterns = ["parameters_*.yml", "parameters/*.yml"]
for pattern in parameter_file_patterns:
for param_file in conf_base_path.glob(pattern):
_remove_file(param_file)

# Remove the pipelines subdirectories
if "Kedro Viz" in selected_add_ons_list: # Remove reporting if Kedro Viz is selected
pipelines_to_remove = ["data_science", "data_processing", "reporting"]
else:
with open(requirements_file_path, 'a') as file:
file.write(lint_requirements)
with open(pyproject_file_path, 'a') as file:
file.write(lint_pyproject_requirements)
pipelines_to_remove = ["data_science", "data_processing"]

pipelines_path = current_dir / f"src/{python_package_name}/pipelines/"
for pipeline_subdir in pipelines_to_remove:
_remove_dir(pipelines_path / pipeline_subdir)

# Remove all test files from tests/pipelines/
test_pipeline_path = current_dir / "tests/pipelines/test_data_science.py"
_remove_file(test_pipeline_path)


def setup_template_add_ons(selected_add_ons_list: str, requirements_file_path: str, pyproject_file_path: str, python_package_name: str) -> None:
"""Setup the templates according to the choice of add-ons.
Args:
selected_add_ons_list (str): A string contains the selected add-ons.
requirements_file_path (str): The path of the `requiremenets.txt` in the template.
pyproject_file_path (str): The path of the `pyproject.toml` in the template
python_package_name (str): The name of the python package.
"""
if "Linting" not in selected_add_ons_list:
_remove_from_file(requirements_file_path, lint_requirements)
_remove_from_toml(pyproject_file_path, lint_pyproject_requirements)

if "Testing" not in selected_add_ons_list:
tests_path = current_dir / "tests"
if tests_path.exists():
shutil.rmtree(str(tests_path))
else:
with open(requirements_file_path, 'a') as file:
file.write(test_requirements)
with open(pyproject_file_path, 'a') as file:
file.write(test_pyproject_requirements)
_remove_from_file(requirements_file_path, test_requirements)
_remove_from_toml(pyproject_file_path, test_pyproject_requirements)
_remove_dir(current_dir / "tests")

if "Logging" not in selected_add_ons_list:
logging_yml_path = current_dir / "conf/logging.yml"
if logging_yml_path.exists():
logging_yml_path.unlink()
_remove_file(current_dir / "conf/logging.yml")

if "Documentation" not in selected_add_ons_list:
docs_path = current_dir / "docs"
if docs_path.exists():
shutil.rmtree(str(docs_path))
else:
with open(pyproject_file_path, 'a') as file:
file.write(docs_pyproject_requirements)
_remove_from_toml(pyproject_file_path, docs_pyproject_requirements)
_remove_dir(current_dir / "docs")

if "Data Structure" not in selected_add_ons_list:
data_path = current_dir / "data"
if data_path.exists():
shutil.rmtree(str(data_path))

if "Pyspark" not in selected_add_ons_list: # If PySpark not selected
pass
else: # Use spaceflights-pyspark to create pyspark template
# Remove all .csv and .xlsx files from data/01_raw/
raw_data_path = current_dir / "data/01_raw/"
if raw_data_path.exists() and raw_data_path.is_dir():
for file_path in raw_data_path.glob("*.*"):
if file_path.suffix in [".csv", ".xlsx"]:
file_path.unlink()

# Remove parameter files from conf/base/
param_files = [
"parameters_data_processing.yml",
"parameters_data_science.yml",
]
conf_base_path = current_dir / "conf/base/"
if conf_base_path.exists() and conf_base_path.is_dir():
for param_file in param_files:
file_path = conf_base_path / param_file
if file_path.exists():
file_path.unlink()

# Remove specific pipeline subdirectories
pipelines_path = current_dir / f"src/{python_package_name}/pipelines/"
for pipeline_subdir in ["data_science", "data_processing"]:
shutil.rmtree(pipelines_path / pipeline_subdir, ignore_errors=True)

# Remove all test file from tests/pipelines/
test_pipeline_path = current_dir / "tests/pipelines/test_data_science.py"
if test_pipeline_path.exists():
test_pipeline_path.unlink()


def sort_requirements(requirements_file_path):
"""Sort the requirements.txt file in alphabetical order.
_remove_dir(current_dir / "data")

if "Pyspark" in selected_add_ons_list:
_handle_starter_setup(selected_add_ons_list, python_package_name)

if "Kedro Viz" in selected_add_ons_list:
_handle_starter_setup(selected_add_ons_list, python_package_name)


def sort_requirements(requirements_file_path: Path) -> None:
"""Sort the requirements.txt file alphabetically and write it back to the file.
Args:
requirements_file_path: the path to the requirements.txt file.
requirements_file_path (Path): The path to the `requirements.txt` file.
"""
with open(requirements_file_path, 'r') as requirements:
lines = requirements.readlines()
Expand Down
Loading

0 comments on commit 93dc1a9

Please sign in to comment.