Commit e3a514b

Support ipynb files without requiring conversion (#57)

Co-authored-by: Danyaal Masood <Danyaal.Masood@man.com>
Co-authored-by: Jon Bannister <jon.bannister@man.com>
3 people authored Nov 9, 2021
1 parent 21f897c commit e3a514b
Showing 12 changed files with 303 additions and 54 deletions.
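
The change teaches Notebooker to pick up .ipynb templates directly instead of requiring a jupytext .py conversion first. A minimal sketch of the new lookup behaviour, mirroring the _template() helper added in notebooker/utils/conversion.py below (the helper name resolve_template is illustrative, not part of the commit):

import os

def resolve_template(report_path: str, template_dir: str) -> str:
    # Prefer the .py template; fall back to an .ipynb file with the same name.
    py_path = os.path.join(template_dir, report_path + ".py")
    ipynb_path = os.path.join(template_dir, report_path + ".ipynb")
    return py_path if os.path.isfile(py_path) else ipynb_path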
7 changes: 7 additions & 0 deletions CHANGELOG.md
@@ -1,3 +1,9 @@
0.3.2 (2021-11-??)
------------------

* Feature: .ipynb files are now natively supported and can be used as Notebook Templates (#57)


0.3.1 (2021-10-29)
------------------

@@ -6,6 +12,7 @@
* Bugfix: Large notebooks were causing serialisation errors; now safely stored in gridfs.
* **Incompatibility**: Reports run with this version onwards will not be readable by older versions of Notebooker.


0.3.0 (2021-10-05)
------------------

74 changes: 74 additions & 0 deletions notebooker/notebook_templates_example/sample/plot_random_raw.ipynb
@@ -0,0 +1,74 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"lines_to_next_cell": 2
},
"source": [
"#Notebooker Test!"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"lines_to_next_cell": 0,
"tags": [
"parameters"
]
},
"outputs": [],
"source": [
"plots = 5\n",
"days = 100\n",
"start_date = \"2020-01-01\"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%matplotlib inline\n",
"import pandas as pd\n",
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# -\n",
"arr = np.random.rand(days, plots) - 0.5\n",
"dts = np.array(start_date, dtype=np.datetime64) + np.arange(days)\n",
"df = pd.DataFrame(arr, index=dts)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# -\n",
"df.cumsum().plot()"
]
}
],
"metadata": {
"jupytext": {
"cell_metadata_json": true
},
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
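
The cell tagged "parameters" above is the one papermill overrides at run time once the template has been prepared. A minimal sketch of executing this template directly with papermill (output path and parameter values are illustrative):

import papermill as pm

pm.execute_notebook(
    "plot_random_raw.ipynb",           # the template shown above
    "plot_random_raw_output.ipynb",    # hypothetical output path
    parameters={"plots": 3, "days": 30, "start_date": "2021-06-01"},
)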
34 changes: 21 additions & 13 deletions notebooker/utils/conversion.py
@@ -66,25 +66,30 @@ def _git_pull_latest(repo: git.repo.Repo):
repo.git.pull("origin", "master")


def _python_template(report_path: AnyStr, py_template_dir: AnyStr) -> AnyStr:
file_name = "{}.py".format(report_path)
return os.path.join(py_template_dir, file_name)
def _template(report_path: str, py_template_dir: AnyStr) -> AnyStr:
py_path = os.path.join(py_template_dir, "{}.py".format(report_path))
ipynb_path = os.path.join(py_template_dir, "{}.ipynb".format(report_path))

if os.path.isfile(py_path):
return py_path

return ipynb_path


def _ipynb_output_path(template_base_dir: AnyStr, report_path: AnyStr, git_hex: AnyStr) -> AnyStr:
file_name = _output_ipynb_name(report_path)
return os.path.join(template_base_dir, git_hex, file_name)


def _get_python_template_path(report_path: str, warn_on_local: bool, py_template_dir) -> str:
def _get_template_path(report_path: str, warn_on_local: bool, py_template_dir: AnyStr) -> str:
if py_template_dir:
return _python_template(report_path, py_template_dir)
return _template(report_path, py_template_dir)
else:
if warn_on_local:
logger.warning(
"Loading from notebooker default templates. This is only expected if you are running locally."
)
return pkg_resources.resource_filename(__name__, "../notebook_templates_example/{}.py".format(report_path))
return _template(report_path, pkg_resources.resource_filename(__name__, "../notebook_templates_example"))


def _get_output_path_hex(notebooker_disable_git, py_template_dir) -> str:
@@ -126,22 +131,25 @@ def generate_ipynb_from_py(
Pulls the latest version of the notebook templates from git, and regenerates templates if there is a new HEAD
OR: finds the local template from the template repository using a relative path
In both cases, this method converts the .py file into an .ipynb file which can be executed by papermill.
Both .ipynb and .py report templates are handled, where .py templates are converted to .ipynb, which can
be executed by papermill
:param template_base_dir: The directory in which converted notebook templates reside.
:param report_name: The name of the report which we are running.
:param notebooker_disable_git: Whether or not to pull the latest version from git, if a change is available.
:param py_template_dir: The directory which contains raw python templates. This should be a subdir in a git repo.
:param py_template_dir: The directory which contains raw py/ipynb templates. This should be a subdir in a git repo.
:param warn_on_local: Whether to warn when we are searching for notebooks in the notebooker repo itself.
:return: The filepath of the .ipynb which we have just converted.
"""
report_path = convert_report_name_into_path(report_name)
python_template_path = _get_python_template_path(report_path, warn_on_local, py_template_dir)
template_path = _get_template_path(report_path, warn_on_local, py_template_dir)
output_template_path = _ipynb_output_path(
template_base_dir, report_path, _get_output_path_hex(notebooker_disable_git, py_template_dir)
)

mkdir_p(os.path.dirname(output_template_path))

try:
with open(output_template_path, "r") as f:
if f.read():
@@ -151,14 +159,14 @@ def generate_ipynb_from_py(
pass

# "touch" the output file
print("Creating ipynb at: %s", output_template_path)
mkdir_p(os.path.dirname(output_template_path))
with open(output_template_path, "w") as f:
print("Writing ipynb to: %s", output_template_path)
with open(output_template_path, "w"):
os.utime(output_template_path, None)

jupytext_nb = jupytext.read(python_template_path)
jupytext_nb = jupytext.read(template_path)
jupytext_nb["metadata"]["kernelspec"] = kernel_spec() # Override the kernel spec since we want to run it..
jupytext.write(jupytext_nb, output_template_path)

return output_template_path


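Because jupytext.read() accepts both .py and .ipynb sources, generate_ipynb_from_py needs no special casing beyond resolving the template path. A minimal sketch of the round-trip it relies on, assuming jupytext is installed (the file paths and kernelspec dict are illustrative):

import jupytext

nb = jupytext.read("templates/fake/ipynb_report.ipynb")  # works for .py templates too
nb["metadata"]["kernelspec"] = {"name": "python3", "language": "python", "display_name": "Python 3"}
jupytext.write(nb, "converted.ipynb")
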
2 changes: 1 addition & 1 deletion notebooker/utils/templates.py
@@ -18,7 +18,7 @@ def _valid_dirname(d):


def _valid_filename(f):
return f.endswith(".py") and "__init__" not in f and "__pycache__" not in f
return (f.endswith(".py") or f.endswith(".ipynb")) and "__init__" not in f and "__pycache__" not in f


def _get_parameters_cell_idx(notebook: nbformat.NotebookNode) -> Optional[int]:
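With the widened _valid_filename check, .ipynb files pass the template discovery filter while __init__.py and cache artefacts are still rejected. A few illustrative checks, assuming _valid_filename is imported from notebooker.utils.templates:

from notebooker.utils.templates import _valid_filename

assert _valid_filename("report.py")
assert _valid_filename("plot_random_raw.ipynb")
assert not _valid_filename("__init__.py")
assert not _valid_filename("notes.txt")
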
2 changes: 1 addition & 1 deletion notebooker/web/utils.py
@@ -54,7 +54,7 @@ def get_directory_structure(starting_point: Optional[str] = None) -> Dict[str, U
if not _valid_dirname(path):
continue
folders = path[start:].split(os.sep)
subdir = {os.sep.join(folders[1:] + [f.replace(".py", "")]): None for f in files if _valid_filename(f)}
subdir = {os.sep.join(folders[1:] + [f.replace(".ipynb", "").replace(".py", "")]): None for f in files if _valid_filename(f)}
parent = reduce(dict.get, folders[:-1], all_dirs)
parent[folders[-1]] = subdir
return all_dirs[rootdir[start:]]
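The key shown in the directory tree is the template filename with its extension stripped, so .py and .ipynb templates appear identically. A couple of illustrative checks of the doubled replace():

assert "py_report.py".replace(".ipynb", "").replace(".py", "") == "py_report"
assert "ipynb_report.ipynb".replace(".ipynb", "").replace(".py", "") == "ipynb_report"
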
79 changes: 76 additions & 3 deletions tests/integration/conftest.py
@@ -1,7 +1,7 @@
import git
import pytest

DUMMY_REPORT = """
DUMMY_REPORT_PY = """
# ---
# jupyter:
# celltoolbar: Tags
@@ -48,13 +48,86 @@
1/0
"""

DUMMY_REPORT_IPYNB = """
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%matplotlib inline",
"import pandas as pd",
"import numpy as np",
"import random"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"parameters"
]
},
"outputs": [],
"source": [
"n_points = random.choice(range(50, 1000))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"idx = pd.date_range('1/1/2000', periods=n_points)",
"df = pd.DataFrame(np.random.randn(n_points, 4), index=idx, columns=list('ABCD'))",
"df.plot()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"lines_to_next_cell": 2
},
"outputs": [],
"source": [
"cumulative = df.cumsum()",
"cumulative.plot()"
]
}
],
"metadata": {
"celltoolbar": "Tags",
"jupytext": {
"cell_metadata_json": true,
"notebook_metadata_filter": "celltoolbar,jupytext_format_version"
},
"kernelspec": {
"display_name": "spark273",
"language": "python",
"name": "spark273"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
"""

@pytest.fixture
def setup_workspace(workspace):
(workspace.workspace + "/templates").mkdir()
git.Git(workspace.workspace).init()
(workspace.workspace + "/templates/fake").mkdir()
report_to_run = workspace.workspace + "/templates/fake/report.py"
report_to_run.write_lines(DUMMY_REPORT.split("\n"))

py_report_to_run = workspace.workspace + "/templates/fake/py_report.py"
py_report_to_run.write_lines(DUMMY_REPORT_PY.split("\n"))

ipynb_report_to_run = workspace.workspace + "/templates/fake/ipynb_report.ipynb"
ipynb_report_to_run.write_lines(DUMMY_REPORT_IPYNB.split("\n"))

report_to_run_failing = workspace.workspace + "/templates/fake/report_failing.py"
report_to_run_failing.write_lines(DUMMY_FAILING_REPORT.split("\n"))
32 changes: 25 additions & 7 deletions tests/integration/test_e2e.py
@@ -2,6 +2,7 @@
import datetime

import freezegun
import pytest

from notebooker.constants import JobStatus
from notebooker.web.routes.run_report import _rerun_report, run_report
@@ -29,12 +30,18 @@ def _check_report_output(job_id, serialiser, **kwargs):
assert getattr(result, k) == v, "Report output for attribute {} was incorrect!".format(k)


@pytest.mark.parametrize(
"report_name",
[
"fake/py_report",
"fake/ipynb_report"
],
)
@freezegun.freeze_time(datetime.datetime(2018, 1, 12))
def test_run_report(bson_library, flask_app, setup_and_cleanup_notebooker_filesystem, setup_workspace):
def test_run_report(bson_library, flask_app, setup_and_cleanup_notebooker_filesystem, setup_workspace, report_name):
with flask_app.app_context():
serialiser = get_serializer()
overrides = {"n_points": 5}
report_name = "fake/report"
report_title = "my report title"
mailto = ""
job_id = run_report(
@@ -59,7 +66,6 @@ def test_run_report(bson_library, flask_app, setup_and_cleanup_notebooker_filesy
assert job_id == serialiser.get_latest_successful_job_id_for_name_and_params(report_name, overrides)
assert job_id == serialiser.get_latest_successful_job_id_for_name_and_params(report_name, None)


@freezegun.freeze_time(datetime.datetime(2018, 1, 12))
def test_run_failing_report(bson_library, flask_app, setup_and_cleanup_notebooker_filesystem, setup_workspace):
with flask_app.app_context():
@@ -83,12 +89,18 @@ def test_run_failing_report(bson_library, flask_app, setup_and_cleanup_notebooke
assert result.stdout


@pytest.mark.parametrize(
"report_name",
[
"fake/py_report",
"fake/ipynb_report"
],
)
@freezegun.freeze_time(datetime.datetime(2018, 1, 12))
def test_run_report_and_rerun(bson_library, flask_app, setup_and_cleanup_notebooker_filesystem, setup_workspace):
def test_run_report_and_rerun(bson_library, flask_app, setup_and_cleanup_notebooker_filesystem, setup_workspace, report_name):
with flask_app.app_context():
serialiser = get_serializer()
overrides = {"n_points": 5}
report_name = "fake/report"
report_title = "my report title"
mailto = ""
job_id = run_report(
@@ -126,12 +138,18 @@ def test_run_report_and_rerun(bson_library, flask_app, setup_and_cleanup_noteboo
assert job_id != serialiser.get_latest_successful_job_id_for_name_and_params(report_name, overrides)


@pytest.mark.parametrize(
"report_name",
[
"fake/py_report",
"fake/ipynb_report"
],
)
@freezegun.freeze_time(datetime.datetime(2018, 1, 12))
def test_run_report_hide_code(bson_library, flask_app, setup_and_cleanup_notebooker_filesystem, setup_workspace):
def test_run_report_hide_code(bson_library, flask_app, setup_and_cleanup_notebooker_filesystem, setup_workspace, report_name):
with flask_app.app_context():
serialiser = get_serializer()
overrides = {"n_points": 5}
report_name = "fake/report"
report_title = "my report title"
mailto = ""
job_id = run_report(
4 changes: 3 additions & 1 deletion tests/integration/test_templates.py
@@ -4,4 +4,6 @@
def test_get_all_possible_templates(flask_app):
flask_app.config["PY_TEMPLATE_BASE_DIR"] = None
with flask_app.app_context():
assert get_all_possible_templates() == {"sample": {"sample/plot_random": None, "sample/test_plotly": None}}
assert get_all_possible_templates() == {
"sample": {"sample/plot_random": None, "sample/test_plotly": None, "sample/plot_random_raw": None}
}
2 changes: 1 addition & 1 deletion tests/integration/web/test_core_routes.py
@@ -9,7 +9,7 @@ def test_create_schedule(flask_app, setup_workspace):
)
assert rv.status_code == 200
data = json.loads(rv.data)
assert data == {"result": ["fake/report", "fake/report_failing"]}
assert data == {"result": ["fake/py_report", "fake/ipynb_report", "fake/report_failing"]}


def test_version_number(flask_app, setup_workspace):