
Commit

Read-only mode (#138)
Add a new read-only mode
jonbannister authored Feb 21, 2023
1 parent 112734d commit 40fbb97
Showing 30 changed files with 570 additions and 430 deletions.
6 changes: 6 additions & 0 deletions CHANGELOG.md
@@ -1,3 +1,9 @@
0.5.1 (2023-02-??)
------------------

* Feature: A new `--readonly-mode` flag is available for the webapp. It lets you run an instance of Notebooker which only displays the results of externally-run or scheduler-run reports. See [the docs](https://notebooker.readthedocs.io/en/latest/webapp/webapp.html#read-only-mode) for more details.
* Bugfix: Scheduler-executed reports will now correctly record stdout.

0.5.0 (2023-01-19)
------------------

Binary file added docs/images/read_only_result_page.png
21 changes: 20 additions & 1 deletion docs/webapp/webapp.rst
@@ -153,4 +153,23 @@ The webapp itself is configured via the command line notebooker-cli:

.. click:: notebooker._entrypoints:base_notebooker
:prog: notebooker-cli
:nested: full
:nested: full


Read-only mode
--------------
The Notebooker webapp has a read-only mode (add :code:`--readonly-mode` to the command line
arguments) which disables the ability to run new reports and to rerun or delete existing ones.
This mode is useful when you would like Notebooker reports to be executed only by a trusted process
(e.g. the internal scheduler, or an external job scheduling engine) but you don't want users to be
able to execute notebooks directly. It is well suited to production environments, or to deployments
where a misconfigured report could reveal sensitive data.

.. image:: /images/read_only_result_page.png
:width: 600
:alt: A Notebooker report in a read-only instance of the Notebooker webapp.

.. note::
    Read-only mode does not change the behaviour of the scheduler: users can still modify schedules, and
    scheduled reports will still be executed as configured. To disable the scheduler, add
    :code:`--disable-scheduler` to the webapp's command line arguments; likewise, git pulls can be
    disabled with :code:`--disable-git`.
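
As a rough, self-contained illustration of the mechanism described above (a toy Flask app, not Notebooker's actual code; the route path and names are made up): the execution endpoints live on their own blueprint, and a read-only instance simply never registers that blueprint, so the routes do not exist at all rather than being "disabled".

```python
# Toy sketch of the read-only gating pattern: action routes sit on a dedicated
# blueprint which is only registered when the instance is writable.
from flask import Blueprint, Flask

run_report_bp = Blueprint("run_report_bp", __name__)


@run_report_bp.route("/run_report/<report_name>", methods=["POST"])
def run_report(report_name):
    return f"running {report_name}", 202


def create_app(readonly_mode: bool = False) -> Flask:
    app = Flask(__name__)
    if not readonly_mode:
        # In read-only mode the execution blueprint is skipped entirely,
        # so the POST route simply returns 404.
        app.register_blueprint(run_report_bp)
    return app


readonly_app = create_app(readonly_mode=True)
assert "/run_report/<report_name>" not in {rule.rule for rule in readonly_app.url_map.iter_rules()}
```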
10 changes: 10 additions & 0 deletions notebooker/_entrypoints.py
@@ -59,6 +59,7 @@ def filesystem_default_value(dirname):
)
@click.option(
"--notebooker-disable-git",
"--disable-git",
default=False,
is_flag=True,
help="If selected, notebooker will not try to pull the latest version of python templates from git.",
@@ -138,6 +139,13 @@ def base_notebooker(
help="The name of the mongo collection within the scheduler-mongo-database which is used for "
"the scheduling back-end. Defaults to the same as the serializer's mongo collection + '_scheduler'.",
)
@click.option(
"--readonly-mode",
default=False,
is_flag=True,
help="This mode disables the ability to execute notebooks via REST or the webapp front-end. "
"Useful if you only want to display results which were e.g. executed by an external application.",
)
@pass_config
def start_webapp(
config: BaseConfig,
@@ -148,6 +156,7 @@ def start_webapp(
disable_scheduler,
scheduler_mongo_database,
scheduler_mongo_collection,
readonly_mode,
):
web_config = WebappConfig.copy_existing(config)
web_config.PORT = port
@@ -157,6 +166,7 @@ def start_webapp(
web_config.DISABLE_SCHEDULER = disable_scheduler
web_config.SCHEDULER_MONGO_DATABASE = scheduler_mongo_database
web_config.SCHEDULER_MONGO_COLLECTION = scheduler_mongo_collection
web_config.READONLY_MODE = readonly_mode
return main(web_config)


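
The hunk above stacks two option strings on one click option, making `--disable-git` an alias for the original `--notebooker-disable-git` flag, and adds `--readonly-mode` as a plain boolean flag. A minimal standalone sketch of that pattern (a toy command, not Notebooker's actual CLI):

```python
# Toy click command showing an aliased flag plus a simple boolean flag.
import click


@click.command()
@click.option(
    "--notebooker-disable-git",
    "--disable-git",          # second option string acts as an alias
    "disable_git",            # explicit parameter name for the handler
    default=False,
    is_flag=True,
    help="Skip pulling the latest templates from git.",
)
@click.option(
    "--readonly-mode",
    default=False,
    is_flag=True,
    help="Disable the ability to execute notebooks from the webapp.",
)
def start_webapp(disable_git, readonly_mode):
    click.echo(f"disable_git={disable_git} readonly_mode={readonly_mode}")


if __name__ == "__main__":
    start_webapp()
```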
147 changes: 132 additions & 15 deletions notebooker/execute_notebook.py
@@ -1,3 +1,8 @@
from __future__ import unicode_literals

import threading
import time

import copy
import datetime
import json
@@ -18,7 +23,7 @@
NotebookResultError,
python_template_dir,
)
from notebooker.serialization.serialization import get_serializer_from_cls
from notebooker.serialization.serialization import get_serializer_from_cls, initialize_serializer_from_config
from notebooker.settings import BaseConfig
from notebooker.utils.conversion import _output_ipynb_name, generate_ipynb_from_py, ipython_to_html, ipython_to_pdf
from notebooker.utils.filesystem import initialise_base_dirs
@@ -104,11 +109,7 @@ def _run_checks(

logger.info("Executing notebook at {} using parameters {} --> {}".format(ipynb_raw_path, overrides, output_ipynb))
pm.execute_notebook(
ipynb_raw_path,
ipynb_executed_path,
parameters=overrides,
log_output=True,
prepare_only=prepare_only,
ipynb_raw_path, ipynb_executed_path, parameters=overrides, log_output=True, prepare_only=prepare_only
)
with open(ipynb_executed_path, "r") as f:
raw_executed_ipynb = f.read()
@@ -167,11 +168,7 @@ def run_report(
job_id = job_id or str(uuid.uuid4())
stop_execution = os.getenv("NOTEBOOKER_APP_STOPPING")
if stop_execution:
logger.info(
"Aborting attempt to run %s, jobid=%s as app is shutting down.",
report_name,
job_id,
)
logger.info("Aborting attempt to run %s, jobid=%s as app is shutting down.", report_name, job_id)
result_serializer.update_check_status(job_id, JobStatus.CANCELLED, error_info=CANCEL_MESSAGE)
return
try:
@@ -182,10 +179,7 @@
attempts_remaining,
)
result_serializer.update_check_status(
job_id,
report_name=report_name,
job_start_time=job_submit_time,
status=JobStatus.PENDING,
job_id, report_name=report_name, job_start_time=job_submit_time, status=JobStatus.PENDING
)
result = _run_checks(
job_id,
@@ -439,3 +433,126 @@ def docker_compose_entrypoint():
logger.info("Received a request to run a report with the following parameters:")
logger.info(args_to_execute)
return subprocess.Popen(args_to_execute).wait()


def _monitor_stderr(process, job_id, serializer_cls, serializer_args):
stderr = []
# Unsure whether flask app contexts are thread-safe; just reinitialise the serializer here.
result_serializer = get_serializer_from_cls(serializer_cls, **serializer_args)
while True:
line = process.stderr.readline().decode("utf-8")
if line != "":
stderr.append(line)
logger.info(line) # So that we have it in the log, not just in memory.
result_serializer.update_stdout(job_id, new_lines=[line])
elif process.poll() is not None:
result_serializer.update_stdout(job_id, stderr, replace=True)
break
return "".join(stderr)


def run_report_in_subprocess(
base_config,
report_name,
report_title,
mailto,
overrides,
*,
hide_code=False,
generate_pdf_output=False,
prepare_only=False,
scheduler_job_id=None,
run_synchronously=False,
mailfrom=None,
n_retries=3,
is_slideshow=False,
) -> str:
"""
Execute the Notebooker report in a subprocess.
The report is executed asynchronously in a subprocess which invokes the same CLI entrypoint as a non-webapp run, so behaviour is identical in both cases.
:param base_config: `BaseConfig` A set of configuration options which specify serialisation parameters.
:param report_name: `str` The report which we are executing
:param report_title: `str` The user-specified title of the report
:param mailto: `Optional[str]` Who the results will be emailed to
:param overrides: `Optional[Dict[str, Any]]` The parameters to be passed into the report
:param generate_pdf_output: `bool` Whether we're generating a PDF. Defaults to False.
:param prepare_only: `bool` Whether to do everything except execute the notebook. Useful for testing.
:param scheduler_job_id: `Optional[str]` if the job was triggered from the scheduler, this is the scheduler's job id
:param run_synchronously: `bool` If True, then we will join the stderr monitoring thread until the job has completed
:param mailfrom: `str` if passed, then this string will be used in the from field
:param n_retries: The number of retries to attempt.
:param is_slideshow: Whether the notebook is a reveal.js slideshow or not.
:return: The unique job_id.
"""
job_id = str(uuid.uuid4())
job_start_time = datetime.datetime.now()
result_serializer = initialize_serializer_from_config(base_config)
result_serializer.save_check_stub(
job_id,
report_name,
report_title=report_title,
job_start_time=job_start_time,
status=JobStatus.SUBMITTED,
overrides=overrides,
mailto=mailto,
generate_pdf_output=generate_pdf_output,
hide_code=hide_code,
scheduler_job_id=scheduler_job_id,
is_slideshow=is_slideshow,
)

command = (
[
os.path.join(sys.exec_prefix, "bin", "notebooker-cli"),
"--output-base-dir",
base_config.OUTPUT_DIR,
"--template-base-dir",
base_config.TEMPLATE_DIR,
"--py-template-base-dir",
base_config.PY_TEMPLATE_BASE_DIR,
"--py-template-subdir",
base_config.PY_TEMPLATE_SUBDIR,
"--default-mailfrom",
base_config.DEFAULT_MAILFROM,
]
+ (["--notebooker-disable-git"] if base_config.NOTEBOOKER_DISABLE_GIT else [])
+ ["--serializer-cls", result_serializer.__class__.__name__]
+ result_serializer.serializer_args_to_cmdline_args()
+ [
"execute-notebook",
"--job-id",
job_id,
"--report-name",
report_name,
"--report-title",
report_title,
"--mailto",
mailto,
"--overrides-as-json",
json.dumps(overrides),
"--pdf-output" if generate_pdf_output else "--no-pdf-output",
"--hide-code" if hide_code else "--show-code",
"--n-retries",
str(n_retries),
]
+ (["--prepare-notebook-only"] if prepare_only else [])
+ (["--is-slideshow"] if is_slideshow else [])
+ ([f"--scheduler-job-id={scheduler_job_id}"] if scheduler_job_id is not None else [])
+ ([f"--mailfrom={mailfrom}"] if mailfrom is not None else [])
)
p = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

stderr_thread = threading.Thread(
target=_monitor_stderr, args=(p, job_id, base_config.SERIALIZER_CLS, base_config.SERIALIZER_CONFIG)
)
stderr_thread.daemon = True
stderr_thread.start()
if run_synchronously:
p.wait()
else:
time.sleep(1)
p.poll()
if p.returncode:
raise RuntimeError(f"The report execution failed with exit code {p.returncode}")

return job_id
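
`_monitor_stderr` above streams the child process's stderr into the serializer while the report runs, then replaces the stored output with the full capture once the process exits. A stripped-down, standalone sketch of the same pattern (no Notebooker dependencies; the sink is just a list):

```python
# Standalone sketch of the stderr-streaming pattern used by _monitor_stderr:
# a daemon thread reads stderr line by line, forwards each line incrementally,
# and does a final replace once the child process has exited.
import subprocess
import sys
import threading


def stream_stderr(process: subprocess.Popen, sink: list) -> str:
    captured = []
    while True:
        line = process.stderr.readline().decode("utf-8")
        if line != "":
            captured.append(line)
            sink.append(line)            # incremental, like update_stdout(new_lines=[line])
        elif process.poll() is not None:
            sink[:] = captured           # final flush, like update_stdout(..., replace=True)
            break
    return "".join(captured)


child = subprocess.Popen(
    [sys.executable, "-c", "import sys; print('working...', file=sys.stderr)"],
    stdout=subprocess.PIPE,
    stderr=subprocess.PIPE,
)
output_lines: list = []
monitor = threading.Thread(target=stream_stderr, args=(child, output_lines), daemon=True)
monitor.start()
child.wait()
monitor.join()
print("".join(output_lines))  # -> "working...\n"
```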
6 changes: 6 additions & 0 deletions notebooker/settings.py
@@ -39,6 +39,11 @@ class BaseConfig:
def copy_existing(cls, existing: "BaseConfig"):
return cls(**asdict(existing))

@classmethod
def from_superset_kwargs(cls, kwargs: dict):
""" When we have too many kwargs but we want to use a subset containing the fields. """
return cls(**{k: v for (k, v) in kwargs.items() if k in cls.__dataclass_fields__})


@dataclass
class WebappConfig(BaseConfig):
@@ -53,3 +58,4 @@ class WebappConfig(BaseConfig):
SCHEDULER_MONGO_DATABASE: str = ""
SCHEDULER_MONGO_COLLECTION: str = ""
DISABLE_SCHEDULER: bool = False
READONLY_MODE: bool = False
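
`from_superset_kwargs` filters an arbitrary kwargs dict down to the dataclass's own fields before constructing it. A small illustrative example (toy dataclass, values made up):

```python
# Toy example of the from_superset_kwargs pattern: unknown keys are dropped,
# matching keys are forwarded to the dataclass constructor.
from dataclasses import dataclass


@dataclass
class ToyConfig:
    PORT: int = 0
    READONLY_MODE: bool = False

    @classmethod
    def from_superset_kwargs(cls, kwargs: dict):
        return cls(**{k: v for (k, v) in kwargs.items() if k in cls.__dataclass_fields__})


cfg = ToyConfig.from_superset_kwargs({"PORT": 8080, "READONLY_MODE": True, "SOMETHING_ELSE": 42})
assert cfg == ToyConfig(PORT=8080, READONLY_MODE=True)   # SOMETHING_ELSE was ignored
```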
9 changes: 6 additions & 3 deletions notebooker/utils/filesystem.py
@@ -41,15 +41,18 @@ def mkdir_p(path):


def get_cache_dir():
return current_app.config["CACHE_DIR"]
with current_app.app_context():
return current_app.config["CACHE_DIR"]


def get_output_dir():
return current_app.config["OUTPUT_DIR"]
with current_app.app_context():
return current_app.config["OUTPUT_DIR"]


def get_template_dir():
return current_app.config["TEMPLATE_DIR"]
with current_app.app_context():
return current_app.config["TEMPLATE_DIR"]


def _cleanup_dirs(webapp_config):
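
For context (not part of this commit's code): `current_app` is a proxy that only resolves inside a Flask application context, which is why code that may run outside a request pushes a context before reading config. A minimal standalone sketch of that pattern, using a locally created app; the `CACHE_DIR` key mirrors the helper above and the path value is made up:

```python
# Minimal sketch of reading Flask config from inside an explicitly pushed
# application context (e.g. from a background thread or a CLI helper).
from flask import Flask, current_app

app = Flask(__name__)
app.config["CACHE_DIR"] = "/tmp/notebooker-cache"   # illustrative value


def get_cache_dir() -> str:
    with app.app_context():                 # push a context so current_app resolves
        return current_app.config["CACHE_DIR"]


assert get_cache_dir() == "/tmp/notebooker-cache"
```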
35 changes: 21 additions & 14 deletions notebooker/utils/results.py
@@ -110,24 +110,31 @@ def get_all_result_keys(
return all_keys


def get_all_available_results_json(serializer: MongoResultSerializer, limit: int, report_name: str = None) -> List[constants.NotebookResultBase]:
def get_all_available_results_json(
serializer: MongoResultSerializer, limit: int, report_name: str = None, readonly_mode: bool = False
) -> List[constants.NotebookResultBase]:
json_output = []
mongo_filter = {"report_name": report_name} if report_name is not None else {}
for result in serializer.get_all_results(mongo_filter=mongo_filter, limit=limit, load_payload=False):
output = result.saveable_output()
output["result_url"] = url_for(
"serve_results_bp.task_results", job_id=output["job_id"], report_name=output["report_name"]
)
output["ipynb_url"] = url_for(
"serve_results_bp.download_ipynb_result", job_id=output["job_id"], report_name=output["report_name"]
)
output["pdf_url"] = url_for(
"serve_results_bp.download_pdf_result", job_id=output["job_id"], report_name=output["report_name"]
)
output["rerun_url"] = url_for(
"run_report_bp.rerun_report", job_id=output["job_id"], report_name=output["report_name"]
)

job_id = output["job_id"]
report_name = output["report_name"]
urls = {"ipynb_url": "", "pdf_url": "", "result_url": "", "rerun_url": "", "clone_url": "", "delete_url": ""}
if job_id:
new_urls = {
"result_url": url_for("serve_results_bp.task_results", report_name=report_name, job_id=job_id),
"ipynb_url": url_for("serve_results_bp.download_ipynb_result", report_name=report_name, job_id=job_id),
"pdf_url": url_for("serve_results_bp.download_pdf_result", report_name=report_name, job_id=job_id),
}
urls.update(new_urls)
if not readonly_mode:
urls.update(
{
"rerun_url": url_for("run_report_bp.rerun_report", report_name=report_name, job_id=job_id),
"delete_url": url_for("run_report_bp.delete_report", report_name=report_name, job_id=job_id),
}
)
output.update(urls)
json_output.append(output)
return json_output

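
Purely illustrative (the shape is inferred from the hunk above, not captured from a running instance, and the ids and URL placeholders are made up): in read-only mode each JSON entry keeps its result/download URLs but leaves the action URLs empty, because the rerun/delete endpoints are never generated.

```python
# Illustrative entry from get_all_available_results_json with readonly_mode=True.
readonly_entry = {
    "job_id": "d4b2c1e0",
    "report_name": "sample_report",
    "result_url": "<url_for serve_results_bp.task_results>",
    "ipynb_url": "<url_for serve_results_bp.download_ipynb_result>",
    "pdf_url": "<url_for serve_results_bp.download_pdf_result>",
    "rerun_url": "",    # empty: run_report_bp.rerun_report is not linked in read-only mode
    "delete_url": "",   # empty: run_report_bp.delete_report is not linked in read-only mode
    "clone_url": "",
}
```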
7 changes: 5 additions & 2 deletions notebooker/web/app.py
@@ -21,9 +21,10 @@
from notebooker.web.routes.core import core_bp
from notebooker.web.routes.index import index_bp
from notebooker.web.routes.pending_results import pending_results_bp
from notebooker.web.routes.run_report import run_report_bp
from notebooker.web.routes.report_execution import run_report_bp
from notebooker.web.routes.scheduling import scheduling_bp
from notebooker.web.routes.serve_results import serve_results_bp
from notebooker.web.routes.templates import templates_bp

logger = logging.getLogger(__name__)
all_report_refresher: Optional[threading.Thread] = None
@@ -71,8 +72,10 @@ def create_app(webapp_config=None):

flask_app.url_map.converters["date"] = DateConverter
flask_app.register_blueprint(index_bp)
flask_app.register_blueprint(run_report_bp)
if webapp_config and not webapp_config.READONLY_MODE:
flask_app.register_blueprint(run_report_bp)
flask_app.register_blueprint(core_bp)
flask_app.register_blueprint(templates_bp)
flask_app.register_blueprint(serve_results_bp)
flask_app.register_blueprint(pending_results_bp)
if webapp_config and not webapp_config.DISABLE_SCHEDULER: