Merge pull request #199 from man-group/add-cleanup-script
Adding a cleanup entrypoint
jonbannister authored Jan 17, 2025
2 parents e82f538 + b0651d4 commit d829d77
Showing 10 changed files with 128 additions and 12 deletions.
8 changes: 4 additions & 4 deletions .circleci/config.yml
@@ -213,7 +213,7 @@ jobs:
PYTHON_VERSION: "3_6"
CIRCLE_ARTIFACTS: /tmp/circleci-artifacts/3_6
CIRCLE_TEST_REPORTS: /tmp/circleci-test-results/3_6
VERSION: 0.7.1
VERSION: 0.7.2
PANDOC_RELEASES_URL: https://github.com/jgm/pandoc/releases
YARN_STATIC_DIR: notebooker/web/static/
IMAGE_NAME: mangroup/notebooker
@@ -229,7 +229,7 @@ jobs:
environment:
CIRCLE_ARTIFACTS: /tmp/circleci-artifacts/3_7
CIRCLE_TEST_REPORTS: /tmp/circleci-test-results/3_7
VERSION: 0.7.1
VERSION: 0.7.2
PANDOC_RELEASES_URL: https://github.com/jgm/pandoc/releases
YARN_STATIC_DIR: notebooker/web/static/
IMAGE_NAME: mangroup/notebooker
@@ -243,7 +243,7 @@ jobs:
environment:
CIRCLE_ARTIFACTS: /tmp/circleci-artifacts/3_8
CIRCLE_TEST_REPORTS: /tmp/circleci-test-results/3_8
VERSION: 0.7.1
VERSION: 0.7.2
PANDOC_RELEASES_URL: https://github.com/jgm/pandoc/releases
YARN_STATIC_DIR: notebooker/web/static/
IMAGE_NAME: mangroup/notebooker
@@ -257,7 +257,7 @@ jobs:
environment:
CIRCLE_ARTIFACTS: /tmp/circleci-artifacts/3_11
CIRCLE_TEST_REPORTS: /tmp/circleci-test-results/3_11
VERSION: 0.7.1
VERSION: 0.7.2
PANDOC_RELEASES_URL: https://github.com/jgm/pandoc/releases
YARN_STATIC_DIR: notebooker/web/static/
IMAGE_NAME: mangroup/notebooker
5 changes: 5 additions & 0 deletions CHANGELOG.md
@@ -1,3 +1,8 @@
0.7.2 (2025-01-17)
------------------

* feature: added a cleanup script to delete reports older than a given number of days, optionally filterable by report name.

0.7.1 (2025-01-02)
------------------

2 changes: 1 addition & 1 deletion docs/conf.py
@@ -23,7 +23,7 @@
author = "Man Group Quant Tech"

# The full version, including alpha/beta/rc tags
release = "0.7.1"
release = "0.7.2"


# -- General configuration ---------------------------------------------------
13 changes: 13 additions & 0 deletions notebooker/_entrypoints.py
@@ -1,5 +1,6 @@
import os
import uuid
from typing import Optional

import click

@@ -10,6 +11,7 @@
from notebooker.serialization import SERIALIZER_TO_CLI_OPTIONS
from notebooker.settings import BaseConfig, WebappConfig
from notebooker.snapshot import snap_latest_successful_notebooks
from notebooker.utils.cleanup import delete_old_reports
from notebooker.web.app import main


@@ -267,6 +269,17 @@ def execute_notebook(
)


@base_notebooker.command()
@click.option("--days", "--days-cutoff", "-d", type=int, required=True, help="Delete reports older than this many days")
@click.option(
    "--report-name", required=False, help="The name of the template to retrieve, relative to the template directory."
)
@click.option("--dry-run", is_flag=True, default=False, help="Show what would be deleted without actually deleting")
@pass_config
def cleanup_old_reports(config: BaseConfig, days: int, report_name: Optional[str], dry_run: bool):
    delete_old_reports(config, days_cutoff=days, report_name=report_name, dry_run=dry_run)


@base_notebooker.command()
@click.option(
    "--report-name", required=True, help="The name of the template to retrieve, relative to the template directory."
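For orientation, here is a minimal sketch of exercising the new command in-process with Click's test runner, rather than through an installed console script (whose exact name depends on how Notebooker is packaged). It assumes the base_notebooker group's default serializer options can reach a MongoDB instance, and the report name shown is a placeholder.

# Sketch only: invoke the new cleanup command via Click's CliRunner.
# Assumes the group's default serializer options point at a reachable MongoDB;
# otherwise pass your deployment's serializer options before the subcommand.
from click.testing import CliRunner

from notebooker._entrypoints import base_notebooker

runner = CliRunner()
result = runner.invoke(
    base_notebooker,
    ["cleanup-old-reports", "--days", "30", "--report-name", "sample/report_name", "--dry-run"],
)
print(result.exit_code, result.output)

Because --dry-run is set, this only logs what would be removed without deleting anything.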
25 changes: 20 additions & 5 deletions notebooker/serialization/mongo.py
@@ -521,18 +521,33 @@ def get_latest_successful_job_ids_for_name_all_params(self, report_name: str) ->
    def n_all_results_for_report_name(self, report_name: str) -> int:
        return self._get_result_count({"report_name": report_name})

-    def delete_result(self, job_id: AnyStr) -> Dict[str, Any]:
+    def delete_result(self, job_id: AnyStr, dry_run: bool = False) -> Dict[str, Any]:
        result = self._get_raw_check_result(job_id)
        status = JobStatus.from_string(result["status"])
        gridfs_filenames = load_files_from_gridfs(self.result_data_store, result, do_read=False)
        if status in (JobStatus.ERROR, JobStatus.TIMEOUT, JobStatus.CANCELLED):
            gridfs_filenames.append(_error_info_filename(job_id))
-        self.update_check_status(job_id, JobStatus.DELETED)
+        if not dry_run:
+            self.update_check_status(job_id, JobStatus.DELETED)
+        deleted_gridfs_files = []
        for filename in gridfs_filenames:
-            logger.info(f"Deleting {filename}")
+            logger.debug(f"Deleting {filename}")
+            existed = False
            for grid_out in self.result_data_store.find({"filename": filename}):
-                self.result_data_store.delete(grid_out._id)
-        return {"deleted_result_document": result, "gridfs_filenames": gridfs_filenames}
+                existed = True
+                if not dry_run:
+                    self.result_data_store.delete(grid_out._id)
+            if existed:
+                deleted_gridfs_files.append(filename)
+        return {"deleted_result_document": result, "gridfs_filenames": deleted_gridfs_files}
+
+    def get_job_ids_older_than(self, cutoff: datetime.datetime, report_name: Optional[str] = None) -> List[str]:
+        query = {"job_start_time": {"$lte": cutoff}}
+        query = _add_deleted_status_to_filter(query)
+        if report_name:
+            query["report_name"] = report_name
+        to_delete = [d["job_id"] for d in self.library.find(query, {"_id": 0, "job_id": 1})]
+        return to_delete


def _pdf_filename(job_id: str) -> str:
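For context, get_job_ids_older_than reduces to a single query against the results collection. The sketch below shows a rough pymongo equivalent; the connection details and collection name are placeholders, and the clause attributed to _add_deleted_status_to_filter (skipping results already marked as deleted) is an assumption about that helper, not something visible in this diff.

# Illustrative only: a hand-rolled equivalent of get_job_ids_older_than.
import datetime

from pymongo import MongoClient

# Placeholder connection, database and collection names.
library = MongoClient("localhost", 27017)["notebooker"]["NOTEBOOK_OUTPUT"]

cutoff = datetime.datetime.now() - datetime.timedelta(days=90)
query = {
    "job_start_time": {"$lte": cutoff},
    # Assumed effect of _add_deleted_status_to_filter: ignore results that are
    # already marked as deleted (the exact stored status value may differ).
    "status": {"$ne": "DELETED"},
    "report_name": "sample/report_name",  # optional narrowing, as in the new method
}

job_ids = [doc["job_id"] for doc in library.find(query, {"_id": 0, "job_id": 1})]
print(f"{len(job_ids)} results older than the cutoff")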
50 changes: 50 additions & 0 deletions notebooker/utils/cleanup.py
@@ -0,0 +1,50 @@
import datetime
from typing import Optional

from tqdm import tqdm
import logging

from notebooker.serialization.serialization import get_serializer_from_cls
from notebooker.settings import BaseConfig

logger = logging.getLogger(__name__)


def delete_old_reports(config: BaseConfig, days_cutoff: int, report_name: Optional[str], dry_run: bool = True) -> None:
    """
    Delete Notebooker reports older than the given number of days.

    Args:
        config: The configuration, which points at the serializer class and its settings.
        days_cutoff: Delete reports older than this many days.
        report_name: Optionally restrict deletion to reports with this report name.
        dry_run: If True, only show what would be deleted without actually deleting.
    """
    serializer = get_serializer_from_cls(config.SERIALIZER_CLS, **config.SERIALIZER_CONFIG)
    cutoff_date = datetime.datetime.now() - datetime.timedelta(days=days_cutoff)

    # Find reports to delete
    to_delete = serializer.get_job_ids_older_than(cutoff_date, report_name=report_name)

    num_reports = len(to_delete)

    if num_reports == 0:
        logger.info(f"No reports found older than {days_cutoff} days")
        return

    logger.info(f"Found {num_reports} reports older than {days_cutoff} days")

    # Delete reports
    logger.info("Starting deletion process...")
    for report in tqdm(to_delete, desc="Deleting reports"):
        try:
            removed = serializer.delete_result(report, dry_run=dry_run)
            logger.info(
                f"{'Would have deleted' if dry_run else 'Deleted'}: "
                f"Title={removed['deleted_result_document']['report_title']}, "
                f"GridFS files={removed['gridfs_filenames']}"
            )
        except Exception as e:
            logger.error(f"Failed to delete report {report}: {str(e)}")

    logger.info(f"{'Would have' if dry_run else 'Successfully'} removed {num_reports} reports")
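
The helper can also be driven directly from Python, for example from a scheduled job, instead of going through the CLI. A minimal sketch follows; the serializer class name and the keys inside SERIALIZER_CONFIG are placeholders, and it assumes BaseConfig accepts these fields as keyword arguments.

# Sketch under assumptions: the serializer name and config keys below are
# placeholders for whatever your deployment already passes to Notebooker.
import logging

from notebooker.settings import BaseConfig
from notebooker.utils.cleanup import delete_old_reports

logging.basicConfig(level=logging.INFO)

config = BaseConfig(
    SERIALIZER_CLS="PyMongoResultSerializer",  # placeholder
    SERIALIZER_CONFIG={"mongo_host": "localhost", "database_name": "notebooker"},  # placeholder
)

# Preview first: dry_run=True only logs what would be removed.
delete_old_reports(config, days_cutoff=90, report_name=None, dry_run=True)

# Then delete for real, optionally narrowed to a single template.
delete_old_reports(config, days_cutoff=90, report_name="sample/report_name", dry_run=False)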
2 changes: 1 addition & 1 deletion notebooker/version.py
@@ -1 +1 @@
__version__ = "0.7.1"
__version__ = "0.7.2"
2 changes: 1 addition & 1 deletion notebooker/web/static/package.json
@@ -1,6 +1,6 @@
{
"name": "notebooker",
"version": "0.7.1",
"version": "0.7.2",
"description": "Notebooker - Turn notebooks into reports",
"dependencies": {
"bootstrap-table": "1.20.2",
1 change: 1 addition & 0 deletions tests/integration/test_mongo.py
@@ -77,6 +77,7 @@ def test_delete(bson_library, webapp_config):
raw_html_resources={"inlining": {"big_thing": "a" * 32 * (2**20)}, "other_stuff": "Yep"},
)
)
assert serializer.get_job_ids_older_than(datetime.datetime(2020, 1, 1), report_name=report_name) == [job_id]
assert bson_library.find_one({"job_id": job_id}) is not None
result = serializer.get_check_result(job_id)
assert result is not None
@@ -1,3 +1,5 @@
from unittest.mock import Mock, MagicMock

from mock import patch

from notebooker.serialization.mongo import JobStatus, MongoResultSerializer
@@ -42,3 +44,33 @@ def test__get_all_job_ids(conn, db, gridfs)
{"$project": {"report_name": 1, "job_id": 1}},
]
)


@patch("notebooker.serialization.mongo.gridfs")
@patch("notebooker.serialization.mongo.MongoResultSerializer.get_mongo_database")
@patch("notebooker.serialization.mongo.MongoResultSerializer.get_mongo_connection")
def test_delete_result_dry_run(mock_conn, mock_db, mock_gridfs):
    # Setup
    serializer = MongoResultSerializer()
    mock_result = {
        "job_id": "test_job",
        "status": JobStatus.DONE.value,
        "raw_html_resources": {"outputs": ["file1.html"]},
        "generate_pdf_output": True,
    }

    serializer._get_raw_check_result = Mock(return_value=mock_result)
    mock_gridfs_instance = MagicMock()
    serializer.result_data_store = mock_gridfs_instance
    mock_gridfs_instance.find.return_value = [Mock(_id="id1")]

    # Execute with dry_run=True
    result = serializer.delete_result("test_job", dry_run=True)

    # Verify no actual deletions occurred
    assert not serializer.library.find_one_and_update.called
    assert not mock_gridfs_instance.delete.called

    # But verify the result contains what would be deleted
    assert result["deleted_result_document"] == mock_result
    assert len(result["gridfs_filenames"]) > 0
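
A natural follow-up (not part of this PR) would be the non-dry-run counterpart under the same mocks; a sketch is below, assuming update_check_status tolerates operating on the mocked library object.

# Sketch only, not in this PR: the dry_run=False counterpart of the test above.
@patch("notebooker.serialization.mongo.gridfs")
@patch("notebooker.serialization.mongo.MongoResultSerializer.get_mongo_database")
@patch("notebooker.serialization.mongo.MongoResultSerializer.get_mongo_connection")
def test_delete_result_real_run(mock_conn, mock_db, mock_gridfs):
    serializer = MongoResultSerializer()
    serializer._get_raw_check_result = Mock(
        return_value={
            "job_id": "test_job",
            "status": JobStatus.DONE.value,
            "raw_html_resources": {"outputs": ["file1.html"]},
            "generate_pdf_output": True,
        }
    )
    mock_gridfs_instance = MagicMock()
    serializer.result_data_store = mock_gridfs_instance
    mock_gridfs_instance.find.return_value = [Mock(_id="id1")]

    result = serializer.delete_result("test_job", dry_run=False)

    # With dry_run=False the GridFS entries should actually be deleted,
    # and the returned filenames reflect what was removed.
    assert mock_gridfs_instance.delete.called
    assert result["gridfs_filenames"]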
