
Commit

Read-only mode (#138)
Add a new read-only mode
jonbannister authored Feb 21, 2023
1 parent 112734d commit 40fbb97
Showing 30 changed files with 570 additions and 430 deletions.
6 changes: 6 additions & 0 deletions CHANGELOG.md
@@ -1,3 +1,9 @@
0.5.1 (2023-02-??)
------------------

* Feature: A new `--readonly-mode` flag is available for the webapp. It lets you run an instance of Notebooker which only displays the results of externally-run or scheduler-run reports. See [the docs](https://notebooker.readthedocs.io/en/latest/webapp/webapp.html#read-only-mode) for more details.
* Bugfix: Scheduler-executed reports will now correctly record stdout.

0.5.0 (2023-01-19)
------------------

Binary file added docs/images/read_only_result_page.png
21 changes: 20 additions & 1 deletion docs/webapp/webapp.rst
@@ -153,4 +153,23 @@ The webapp itself is configured via the command line notebooker-cli:

.. click:: notebooker._entrypoints:base_notebooker
:prog: notebooker-cli
:nested: full
:nested: full


Read-only mode
--------------
The Notebooker webapp has a read-only mode (add :code:`--readonly-mode` to the command line
arguments) which disables the ability to run new reports and to rerun or delete existing ones.
This mode is useful when you would like Notebooker reports to be executed only by a trusted process
(e.g. the internal scheduler, or an external job scheduling engine) but you don't want users to be
able to execute notebooks directly. It is well suited to production environments, or to deployments
where a misconfigured report could reveal sensitive data.

.. image:: /images/read_only_result_page.png
:width: 600
:alt: A Notebooker report in a read-only instance of the Notebooker webapp.

.. note::
    Read-only mode does not change the behaviour of the scheduler: users can still modify schedules, and
    scheduled reports will still be executed as configured. To disable the scheduler, add
    :code:`--disable-scheduler` to the webapp's command line arguments; likewise, git pulls can be
    disabled with :code:`--disable-git`.
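
As a rough, self-contained illustration of the mechanism described above (a toy Flask app, not Notebooker's actual code; the route path and names are made up): the execution endpoints live on their own blueprint, and a read-only instance simply never registers that blueprint, so the routes do not exist at all rather than being "disabled".

```python
# Toy sketch of the read-only gating pattern: action routes sit on a dedicated
# blueprint which is only registered when the instance is writable.
from flask import Blueprint, Flask

run_report_bp = Blueprint("run_report_bp", __name__)


@run_report_bp.route("/run_report/<report_name>", methods=["POST"])
def run_report(report_name):
    return f"running {report_name}", 202


def create_app(readonly_mode: bool = False) -> Flask:
    app = Flask(__name__)
    if not readonly_mode:
        # In read-only mode the execution blueprint is skipped entirely,
        # so the POST route simply returns 404.
        app.register_blueprint(run_report_bp)
    return app


readonly_app = create_app(readonly_mode=True)
assert "/run_report/<report_name>" not in {rule.rule for rule in readonly_app.url_map.iter_rules()}
```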
10 changes: 10 additions & 0 deletions notebooker/_entrypoints.py
@@ -59,6 +59,7 @@ def filesystem_default_value(dirname):
)
@click.option(
"--notebooker-disable-git",
"--disable-git",
default=False,
is_flag=True,
help="If selected, notebooker will not try to pull the latest version of python templates from git.",
@@ -138,6 +139,13 @@ def base_notebooker(
help="The name of the mongo collection within the scheduler-mongo-database which is used for "
"the scheduling back-end. Defaults to the same as the serializer's mongo collection + '_scheduler'.",
)
@click.option(
"--readonly-mode",
default=False,
is_flag=True,
help="This mode disables the ability to execute notebooks via REST or the webapp front-end. "
"Useful if you only want to display results which were e.g. executed by an external application.",
)
@pass_config
def start_webapp(
config: BaseConfig,
@@ -148,6 +156,7 @@ def start_webapp(
disable_scheduler,
scheduler_mongo_database,
scheduler_mongo_collection,
readonly_mode,
):
web_config = WebappConfig.copy_existing(config)
web_config.PORT = port
@@ -157,6 +166,7 @@ def start_webapp(
web_config.DISABLE_SCHEDULER = disable_scheduler
web_config.SCHEDULER_MONGO_DATABASE = scheduler_mongo_database
web_config.SCHEDULER_MONGO_COLLECTION = scheduler_mongo_collection
web_config.READONLY_MODE = readonly_mode
return main(web_config)


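
The hunk above stacks two option strings on one click option, making `--disable-git` an alias for the original `--notebooker-disable-git` flag, and adds `--readonly-mode` as a plain boolean flag. A minimal standalone sketch of that pattern (a toy command, not Notebooker's actual CLI):

```python
# Toy click command showing an aliased flag plus a simple boolean flag.
import click


@click.command()
@click.option(
    "--notebooker-disable-git",
    "--disable-git",          # second option string acts as an alias
    "disable_git",            # explicit parameter name for the handler
    default=False,
    is_flag=True,
    help="Skip pulling the latest templates from git.",
)
@click.option(
    "--readonly-mode",
    default=False,
    is_flag=True,
    help="Disable the ability to execute notebooks from the webapp.",
)
def start_webapp(disable_git, readonly_mode):
    click.echo(f"disable_git={disable_git} readonly_mode={readonly_mode}")


if __name__ == "__main__":
    start_webapp()
```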
147 changes: 132 additions & 15 deletions notebooker/execute_notebook.py
@@ -1,3 +1,8 @@
from __future__ import unicode_literals

import threading
import time

import copy
import datetime
import json
@@ -18,7 +23,7 @@
NotebookResultError,
python_template_dir,
)
from notebooker.serialization.serialization import get_serializer_from_cls
from notebooker.serialization.serialization import get_serializer_from_cls, initialize_serializer_from_config
from notebooker.settings import BaseConfig
from notebooker.utils.conversion import _output_ipynb_name, generate_ipynb_from_py, ipython_to_html, ipython_to_pdf
from notebooker.utils.filesystem import initialise_base_dirs
@@ -104,11 +109,7 @@ def _run_checks(

logger.info("Executing notebook at {} using parameters {} --> {}".format(ipynb_raw_path, overrides, output_ipynb))
pm.execute_notebook(
ipynb_raw_path,
ipynb_executed_path,
parameters=overrides,
log_output=True,
prepare_only=prepare_only,
ipynb_raw_path, ipynb_executed_path, parameters=overrides, log_output=True, prepare_only=prepare_only
)
with open(ipynb_executed_path, "r") as f:
raw_executed_ipynb = f.read()
@@ -167,11 +168,7 @@ def run_report(
job_id = job_id or str(uuid.uuid4())
stop_execution = os.getenv("NOTEBOOKER_APP_STOPPING")
if stop_execution:
logger.info(
"Aborting attempt to run %s, jobid=%s as app is shutting down.",
report_name,
job_id,
)
logger.info("Aborting attempt to run %s, jobid=%s as app is shutting down.", report_name, job_id)
result_serializer.update_check_status(job_id, JobStatus.CANCELLED, error_info=CANCEL_MESSAGE)
return
try:
@@ -182,10 +179,7 @@
attempts_remaining,
)
result_serializer.update_check_status(
job_id,
report_name=report_name,
job_start_time=job_submit_time,
status=JobStatus.PENDING,
job_id, report_name=report_name, job_start_time=job_submit_time, status=JobStatus.PENDING
)
result = _run_checks(
job_id,
@@ -439,3 +433,126 @@ def docker_compose_entrypoint():
logger.info("Received a request to run a report with the following parameters:")
logger.info(args_to_execute)
return subprocess.Popen(args_to_execute).wait()


def _monitor_stderr(process, job_id, serializer_cls, serializer_args):
stderr = []
# Unsure whether flask app contexts are thread-safe; just reinitialise the serializer here.
result_serializer = get_serializer_from_cls(serializer_cls, **serializer_args)
while True:
line = process.stderr.readline().decode("utf-8")
if line != "":
stderr.append(line)
logger.info(line) # So that we have it in the log, not just in memory.
result_serializer.update_stdout(job_id, new_lines=[line])
elif process.poll() is not None:
result_serializer.update_stdout(job_id, stderr, replace=True)
break
return "".join(stderr)


def run_report_in_subprocess(
base_config,
report_name,
report_title,
mailto,
overrides,
*,
hide_code=False,
generate_pdf_output=False,
prepare_only=False,
scheduler_job_id=None,
run_synchronously=False,
mailfrom=None,
n_retries=3,
is_slideshow=False,
) -> str:
"""
Execute the Notebooker report in a subprocess.
The report is executed asynchronously in a subprocess which invokes the same CLI entrypoint as a non-webapp run, so behaviour is identical in both cases.
:param base_config: `BaseConfig` A set of configuration options which specify serialisation parameters.
:param report_name: `str` The report which we are executing
:param report_title: `str` The user-specified title of the report
:param mailto: `Optional[str]` Who the results will be emailed to
:param overrides: `Optional[Dict[str, Any]]` The parameters to be passed into the report
:param generate_pdf_output: `bool` Whether we're generating a PDF. Defaults to False.
:param prepare_only: `bool` Whether to do everything except execute the notebook. Useful for testing.
:param scheduler_job_id: `Optional[str]` if the job was triggered from the scheduler, this is the scheduler's job id
:param run_synchronously: `bool` If True, then we will join the stderr monitoring thread until the job has completed
:param mailfrom: `str` if passed, then this string will be used in the from field
:param n_retries: The number of retries to attempt.
:param is_slideshow: Whether the notebook is a reveal.js slideshow or not.
:return: The unique job_id.
"""
job_id = str(uuid.uuid4())
job_start_time = datetime.datetime.now()
result_serializer = initialize_serializer_from_config(base_config)
result_serializer.save_check_stub(
job_id,
report_name,
report_title=report_title,
job_start_time=job_start_time,
status=JobStatus.SUBMITTED,
overrides=overrides,
mailto=mailto,
generate_pdf_output=generate_pdf_output,
hide_code=hide_code,
scheduler_job_id=scheduler_job_id,
is_slideshow=is_slideshow,
)

command = (
[
os.path.join(sys.exec_prefix, "bin", "notebooker-cli"),
"--output-base-dir",
base_config.OUTPUT_DIR,
"--template-base-dir",
base_config.TEMPLATE_DIR,
"--py-template-base-dir",
base_config.PY_TEMPLATE_BASE_DIR,
"--py-template-subdir",
base_config.PY_TEMPLATE_SUBDIR,
"--default-mailfrom",
base_config.DEFAULT_MAILFROM,
]
+ (["--notebooker-disable-git"] if base_config.NOTEBOOKER_DISABLE_GIT else [])
+ ["--serializer-cls", result_serializer.__class__.__name__]
+ result_serializer.serializer_args_to_cmdline_args()
+ [
"execute-notebook",
"--job-id",
job_id,
"--report-name",
report_name,
"--report-title",
report_title,
"--mailto",
mailto,
"--overrides-as-json",
json.dumps(overrides),
"--pdf-output" if generate_pdf_output else "--no-pdf-output",
"--hide-code" if hide_code else "--show-code",
"--n-retries",
str(n_retries),
]
+ (["--prepare-notebook-only"] if prepare_only else [])
+ (["--is-slideshow"] if is_slideshow else [])
+ ([f"--scheduler-job-id={scheduler_job_id}"] if scheduler_job_id is not None else [])
+ ([f"--mailfrom={mailfrom}"] if mailfrom is not None else [])
)
p = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

stderr_thread = threading.Thread(
target=_monitor_stderr, args=(p, job_id, base_config.SERIALIZER_CLS, base_config.SERIALIZER_CONFIG)
)
stderr_thread.daemon = True
stderr_thread.start()
if run_synchronously:
p.wait()
else:
time.sleep(1)
p.poll()
if p.returncode:
raise RuntimeError(f"The report execution failed with exit code {p.returncode}")

return job_id
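
`_monitor_stderr` above streams the child process's stderr into the serializer while the report runs, then replaces the stored output with the full capture once the process exits. A stripped-down, standalone sketch of the same pattern (no Notebooker dependencies; the sink is just a list):

```python
# Standalone sketch of the stderr-streaming pattern used by _monitor_stderr:
# a daemon thread reads stderr line by line, forwards each line incrementally,
# and does a final replace once the child process has exited.
import subprocess
import sys
import threading


def stream_stderr(process: subprocess.Popen, sink: list) -> str:
    captured = []
    while True:
        line = process.stderr.readline().decode("utf-8")
        if line != "":
            captured.append(line)
            sink.append(line)            # incremental, like update_stdout(new_lines=[line])
        elif process.poll() is not None:
            sink[:] = captured           # final flush, like update_stdout(..., replace=True)
            break
    return "".join(captured)


child = subprocess.Popen(
    [sys.executable, "-c", "import sys; print('working...', file=sys.stderr)"],
    stdout=subprocess.PIPE,
    stderr=subprocess.PIPE,
)
output_lines: list = []
monitor = threading.Thread(target=stream_stderr, args=(child, output_lines), daemon=True)
monitor.start()
child.wait()
monitor.join()
print("".join(output_lines))  # -> "working...\n"
```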
6 changes: 6 additions & 0 deletions notebooker/settings.py
@@ -39,6 +39,11 @@ class BaseConfig:
def copy_existing(cls, existing: "BaseConfig"):
return cls(**asdict(existing))

@classmethod
def from_superset_kwargs(cls, kwargs: dict):
""" When we have too many kwargs but we want to use a subset containing the fields. """
return cls(**{k: v for (k, v) in kwargs.items() if k in cls.__dataclass_fields__})


@dataclass
class WebappConfig(BaseConfig):
@@ -53,3 +58,4 @@ class WebappConfig(BaseConfig):
SCHEDULER_MONGO_DATABASE: str = ""
SCHEDULER_MONGO_COLLECTION: str = ""
DISABLE_SCHEDULER: bool = False
READONLY_MODE: bool = False
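
`from_superset_kwargs` filters an arbitrary kwargs dict down to the dataclass's own fields before constructing it. A small illustrative example (toy dataclass, values made up):

```python
# Toy example of the from_superset_kwargs pattern: unknown keys are dropped,
# matching keys are forwarded to the dataclass constructor.
from dataclasses import dataclass


@dataclass
class ToyConfig:
    PORT: int = 0
    READONLY_MODE: bool = False

    @classmethod
    def from_superset_kwargs(cls, kwargs: dict):
        return cls(**{k: v for (k, v) in kwargs.items() if k in cls.__dataclass_fields__})


cfg = ToyConfig.from_superset_kwargs({"PORT": 8080, "READONLY_MODE": True, "SOMETHING_ELSE": 42})
assert cfg == ToyConfig(PORT=8080, READONLY_MODE=True)   # SOMETHING_ELSE was ignored
```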
9 changes: 6 additions & 3 deletions notebooker/utils/filesystem.py
@@ -41,15 +41,18 @@ def mkdir_p(path):


def get_cache_dir():
return current_app.config["CACHE_DIR"]
with current_app.app_context():
return current_app.config["CACHE_DIR"]


def get_output_dir():
return current_app.config["OUTPUT_DIR"]
with current_app.app_context():
return current_app.config["OUTPUT_DIR"]


def get_template_dir():
return current_app.config["TEMPLATE_DIR"]
with current_app.app_context():
return current_app.config["TEMPLATE_DIR"]


def _cleanup_dirs(webapp_config):
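
For context (not part of this commit's code): `current_app` is a proxy that only resolves inside a Flask application context, which is why code that may run outside a request pushes a context before reading config. A minimal standalone sketch of that pattern, using a locally created app; the `CACHE_DIR` key mirrors the helper above and the path value is made up:

```python
# Minimal sketch of reading Flask config from inside an explicitly pushed
# application context (e.g. from a background thread or a CLI helper).
from flask import Flask, current_app

app = Flask(__name__)
app.config["CACHE_DIR"] = "/tmp/notebooker-cache"   # illustrative value


def get_cache_dir() -> str:
    with app.app_context():                 # push a context so current_app resolves
        return current_app.config["CACHE_DIR"]


assert get_cache_dir() == "/tmp/notebooker-cache"
```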
35 changes: 21 additions & 14 deletions notebooker/utils/results.py
@@ -110,24 +110,31 @@ def get_all_result_keys(
return all_keys


def get_all_available_results_json(serializer: MongoResultSerializer, limit: int, report_name: str = None) -> List[constants.NotebookResultBase]:
def get_all_available_results_json(
serializer: MongoResultSerializer, limit: int, report_name: str = None, readonly_mode: bool = False
) -> List[constants.NotebookResultBase]:
json_output = []
mongo_filter = {"report_name": report_name} if report_name is not None else {}
for result in serializer.get_all_results(mongo_filter=mongo_filter, limit=limit, load_payload=False):
output = result.saveable_output()
output["result_url"] = url_for(
"serve_results_bp.task_results", job_id=output["job_id"], report_name=output["report_name"]
)
output["ipynb_url"] = url_for(
"serve_results_bp.download_ipynb_result", job_id=output["job_id"], report_name=output["report_name"]
)
output["pdf_url"] = url_for(
"serve_results_bp.download_pdf_result", job_id=output["job_id"], report_name=output["report_name"]
)
output["rerun_url"] = url_for(
"run_report_bp.rerun_report", job_id=output["job_id"], report_name=output["report_name"]
)

job_id = output["job_id"]
report_name = output["report_name"]
urls = {"ipynb_url": "", "pdf_url": "", "result_url": "", "rerun_url": "", "clone_url": "", "delete_url": ""}
if job_id:
new_urls = {
"result_url": url_for("serve_results_bp.task_results", report_name=report_name, job_id=job_id),
"ipynb_url": url_for("serve_results_bp.download_ipynb_result", report_name=report_name, job_id=job_id),
"pdf_url": url_for("serve_results_bp.download_pdf_result", report_name=report_name, job_id=job_id),
}
urls.update(new_urls)
if not readonly_mode:
urls.update(
{
"rerun_url": url_for("run_report_bp.rerun_report", report_name=report_name, job_id=job_id),
"delete_url": url_for("run_report_bp.delete_report", report_name=report_name, job_id=job_id),
}
)
output.update(urls)
json_output.append(output)
return json_output

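
Purely illustrative (the shape is inferred from the hunk above, not captured from a running instance, and the ids and URL placeholders are made up): in read-only mode each JSON entry keeps its result/download URLs but leaves the action URLs empty, because the rerun/delete endpoints are never generated.

```python
# Illustrative entry from get_all_available_results_json with readonly_mode=True.
readonly_entry = {
    "job_id": "d4b2c1e0",
    "report_name": "sample_report",
    "result_url": "<url_for serve_results_bp.task_results>",
    "ipynb_url": "<url_for serve_results_bp.download_ipynb_result>",
    "pdf_url": "<url_for serve_results_bp.download_pdf_result>",
    "rerun_url": "",    # empty: run_report_bp.rerun_report is not linked in read-only mode
    "delete_url": "",   # empty: run_report_bp.delete_report is not linked in read-only mode
    "clone_url": "",
}
```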
7 changes: 5 additions & 2 deletions notebooker/web/app.py
@@ -21,9 +21,10 @@
from notebooker.web.routes.core import core_bp
from notebooker.web.routes.index import index_bp
from notebooker.web.routes.pending_results import pending_results_bp
from notebooker.web.routes.run_report import run_report_bp
from notebooker.web.routes.report_execution import run_report_bp
from notebooker.web.routes.scheduling import scheduling_bp
from notebooker.web.routes.serve_results import serve_results_bp
from notebooker.web.routes.templates import templates_bp

logger = logging.getLogger(__name__)
all_report_refresher: Optional[threading.Thread] = None
@@ -71,8 +72,10 @@ def create_app(webapp_config=None):

flask_app.url_map.converters["date"] = DateConverter
flask_app.register_blueprint(index_bp)
flask_app.register_blueprint(run_report_bp)
if webapp_config and not webapp_config.READONLY_MODE:
flask_app.register_blueprint(run_report_bp)
flask_app.register_blueprint(core_bp)
flask_app.register_blueprint(templates_bp)
flask_app.register_blueprint(serve_results_bp)
flask_app.register_blueprint(pending_results_bp)
if webapp_config and not webapp_config.DISABLE_SCHEDULER: