From bada2d6d68f3e61eb4ab9a67ed0998e05c55a0b5 Mon Sep 17 00:00:00 2001 From: "Richard Kuo (Danswer)" Date: Wed, 9 Oct 2024 13:11:07 -0700 Subject: [PATCH 1/5] check last_pruned instead of is_pruning --- backend/danswer/server/documents/cc_pair.py | 12 ++++----- .../common_utils/managers/cc_pair.py | 26 ++++++++++++++----- .../connector_job_tests/slack/test_prune.py | 3 ++- .../integration/tests/pruning/test_pruning.py | 3 ++- 4 files changed, 29 insertions(+), 15 deletions(-) diff --git a/backend/danswer/server/documents/cc_pair.py b/backend/danswer/server/documents/cc_pair.py index ce3131d050f..fadd97d02c0 100644 --- a/backend/danswer/server/documents/cc_pair.py +++ b/backend/danswer/server/documents/cc_pair.py @@ -1,4 +1,5 @@ import math +from datetime import datetime from http import HTTPStatus from fastapi import APIRouter @@ -201,12 +202,12 @@ def update_cc_pair_name( raise HTTPException(status_code=400, detail="Name must be unique") -@router.get("/admin/cc-pair/{cc_pair_id}/prune") -def get_cc_pair_latest_prune( +@router.get("/admin/cc-pair/{cc_pair_id}/last_pruned") +def get_cc_pair_last_pruned( cc_pair_id: int, user: User = Depends(current_curator_or_admin_user), db_session: Session = Depends(get_session), -) -> bool: +) -> datetime | None: cc_pair = get_connector_credential_pair_from_id( cc_pair_id=cc_pair_id, db_session=db_session, @@ -216,11 +217,10 @@ def get_cc_pair_latest_prune( if not cc_pair: raise HTTPException( status_code=400, - detail="Connection not found for current user's permissions", + detail="cc_pair not found for current user's permissions", ) - rcp = RedisConnectorPruning(cc_pair.id) - return rcp.is_pruning(db_session, get_redis_client()) + return cc_pair.last_pruned @router.post("/admin/cc-pair/{cc_pair_id}/prune") diff --git a/backend/tests/integration/common_utils/managers/cc_pair.py b/backend/tests/integration/common_utils/managers/cc_pair.py index 48ae805cbb9..46780acd52d 100644 --- a/backend/tests/integration/common_utils/managers/cc_pair.py +++ b/backend/tests/integration/common_utils/managers/cc_pair.py @@ -274,31 +274,43 @@ def prune( result.raise_for_status() @staticmethod - def is_pruning( + def last_pruned( cc_pair: DATestCCPair, user_performing_action: DATestUser | None = None, - ) -> bool: + ) -> datetime | None: response = requests.get( - url=f"{API_SERVER_URL}/manage/admin/cc-pair/{cc_pair.id}/prune", + url=f"{API_SERVER_URL}/manage/admin/cc-pair/{cc_pair.id}/last_pruned", headers=user_performing_action.headers if user_performing_action else GENERAL_HEADERS, ) response.raise_for_status() - response_bool = response.json() - return response_bool + response_str = response.json() + + # If the response itself is a datetime string, parse it + if not isinstance(response_str, str): + return None + + try: + return datetime.fromisoformat(response_str) + except ValueError: + # Handle invalid datetime format + pass + + return None @staticmethod def wait_for_prune( cc_pair: DATestCCPair, + after: datetime, timeout: float = MAX_DELAY, user_performing_action: DATestUser | None = None, ) -> None: """after: The task register time must be after this time.""" start = time.monotonic() while True: - result = CCPairManager.is_pruning(cc_pair, user_performing_action) - if not result: + last_pruned = CCPairManager.last_pruned(cc_pair, user_performing_action) + if last_pruned and last_pruned > after: break elapsed = time.monotonic() - start diff --git a/backend/tests/integration/connector_job_tests/slack/test_prune.py b/backend/tests/integration/connector_job_tests/slack/test_prune.py index 96638417c4d..12cf7da7a97 100644 --- a/backend/tests/integration/connector_job_tests/slack/test_prune.py +++ b/backend/tests/integration/connector_job_tests/slack/test_prune.py @@ -195,8 +195,9 @@ def test_slack_prune( ) # Prune the cc_pair + now = datetime.now(timezone.utc) CCPairManager.prune(cc_pair, user_performing_action=admin_user) - CCPairManager.wait_for_prune(cc_pair, user_performing_action=admin_user) + CCPairManager.wait_for_prune(cc_pair, now, user_performing_action=admin_user) # ----------------------------VERIFY THE CHANGES--------------------------- # Ensure admin user can't see deleted messages diff --git a/backend/tests/integration/tests/pruning/test_pruning.py b/backend/tests/integration/tests/pruning/test_pruning.py index 4c4e82cdfb0..f74b1764a4b 100644 --- a/backend/tests/integration/tests/pruning/test_pruning.py +++ b/backend/tests/integration/tests/pruning/test_pruning.py @@ -105,9 +105,10 @@ def test_web_pruning(reset: None, vespa_client: vespa_fixture) -> None: logger.info("Removing courses.html.") os.remove(os.path.join(website_tgt, "courses.html")) + now = datetime.now(timezone.utc) CCPairManager.prune(cc_pair_1, user_performing_action=admin_user) CCPairManager.wait_for_prune( - cc_pair_1, timeout=60, user_performing_action=admin_user + cc_pair_1, now, timeout=60, user_performing_action=admin_user ) selected_cc_pair = CCPairManager.get_one( From d358dd1718522c90f5646072946713181a9bc55d Mon Sep 17 00:00:00 2001 From: "Richard Kuo (Danswer)" Date: Wed, 9 Oct 2024 13:45:46 -0700 Subject: [PATCH 2/5] try using the ThreadingHTTPServer class for stability and avoiding blocking single-threaded behavior --- backend/tests/integration/tests/pruning/test_pruning.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/backend/tests/integration/tests/pruning/test_pruning.py b/backend/tests/integration/tests/pruning/test_pruning.py index f74b1764a4b..5f985199bf1 100644 --- a/backend/tests/integration/tests/pruning/test_pruning.py +++ b/backend/tests/integration/tests/pruning/test_pruning.py @@ -24,7 +24,7 @@ @contextmanager def http_server_context( directory: str, port: int = 8000 -) -> Generator[http.server.HTTPServer, None, None]: +) -> Generator[http.server.ThreadingHTTPServer, None, None]: # Create a handler that serves files from the specified directory def handler_class( *args: Any, **kwargs: Any @@ -34,7 +34,7 @@ def handler_class( ) # Create an HTTPServer instance - httpd = http.server.HTTPServer(("0.0.0.0", port), handler_class) + httpd = http.server.ThreadingHTTPServer(("0.0.0.0", port), handler_class) # Define a thread that runs the server in the background server_thread = threading.Thread(target=httpd.serve_forever) From 9fae8f5774f673406bd703257066636bad7f5903 Mon Sep 17 00:00:00 2001 From: "Richard Kuo (Danswer)" Date: Thu, 10 Oct 2024 12:43:29 -0700 Subject: [PATCH 3/5] add startup delay to web server in test --- backend/tests/integration/tests/pruning/test_pruning.py | 1 + 1 file changed, 1 insertion(+) diff --git a/backend/tests/integration/tests/pruning/test_pruning.py b/backend/tests/integration/tests/pruning/test_pruning.py index 5f985199bf1..d7d6b8ae692 100644 --- a/backend/tests/integration/tests/pruning/test_pruning.py +++ b/backend/tests/integration/tests/pruning/test_pruning.py @@ -45,6 +45,7 @@ def handler_class( try: # Start the server in the background server_thread.start() + sleep(5) # give it a few seconds to start yield httpd finally: # Shutdown the server and wait for the thread to finish From cfcc1b1d9b039a85c1bc50772571d54de988ad00 Mon Sep 17 00:00:00 2001 From: "Richard Kuo (Danswer)" Date: Wed, 16 Oct 2024 09:58:40 -0700 Subject: [PATCH 4/5] just explicitly return None if we can't parse the datetime --- backend/tests/integration/common_utils/managers/cc_pair.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/backend/tests/integration/common_utils/managers/cc_pair.py b/backend/tests/integration/common_utils/managers/cc_pair.py index 46780acd52d..64961b22e3c 100644 --- a/backend/tests/integration/common_utils/managers/cc_pair.py +++ b/backend/tests/integration/common_utils/managers/cc_pair.py @@ -294,10 +294,7 @@ def last_pruned( try: return datetime.fromisoformat(response_str) except ValueError: - # Handle invalid datetime format - pass - - return None + return None @staticmethod def wait_for_prune( From 09f70a7d39586590f8f9503e58789a444ee0905a Mon Sep 17 00:00:00 2001 From: "Richard Kuo (Danswer)" Date: Wed, 16 Oct 2024 11:33:42 -0700 Subject: [PATCH 5/5] switch to uvicorn for test stability --- .../integration/tests/pruning/test_pruning.py | 46 ++++++++++++++++++- 1 file changed, 45 insertions(+), 1 deletion(-) diff --git a/backend/tests/integration/tests/pruning/test_pruning.py b/backend/tests/integration/tests/pruning/test_pruning.py index d7d6b8ae692..7498f760cef 100644 --- a/backend/tests/integration/tests/pruning/test_pruning.py +++ b/backend/tests/integration/tests/pruning/test_pruning.py @@ -10,6 +10,10 @@ from time import sleep from typing import Any +import uvicorn +from fastapi import FastAPI +from fastapi.staticfiles import StaticFiles + from danswer.server.documents.models import DocumentSource from danswer.utils.logger import setup_logger from tests.integration.common_utils.managers.api_key import APIKeyManager @@ -21,6 +25,46 @@ logger = setup_logger() +# FastAPI server for serving files +def create_fastapi_app(directory: str) -> FastAPI: + app = FastAPI() + + # Mount the directory to serve static files + app.mount("/", StaticFiles(directory=directory, html=True), name="static") + + return app + + +# as far as we know, this doesn't hang when crawled. This is good. +@contextmanager +def fastapi_server_context( + directory: str, port: int = 8000 +) -> Generator[None, None, None]: + app = create_fastapi_app(directory) + + config = uvicorn.Config(app=app, host="0.0.0.0", port=port, log_level="info") + server = uvicorn.Server(config) + + # Create a thread to run the FastAPI server + server_thread = threading.Thread(target=server.run) + server_thread.daemon = ( + True # Ensures the thread will exit when the main program exits + ) + + try: + # Start the server in the background + server_thread.start() + sleep(5) # Give it a few seconds to start + yield # Yield control back to the calling function (context manager in use) + finally: + # Shutdown the server + server.should_exit = True + server_thread.join() + + +# Leaving this here for posterity and experimentation, but the reason we're +# not using this is python's web servers hang frequently when crawled +# this is obviously not good for a unit test @contextmanager def http_server_context( directory: str, port: int = 8000 @@ -71,7 +115,7 @@ def test_web_pruning(reset: None, vespa_client: vespa_fixture) -> None: website_src = os.path.join(test_directory, "website") website_tgt = os.path.join(temp_dir, "website") shutil.copytree(website_src, website_tgt) - with http_server_context(os.path.join(temp_dir, "website"), port): + with fastapi_server_context(os.path.join(temp_dir, "website"), port): sleep(1) # sleep a tiny bit before starting everything hostname = os.getenv("TEST_WEB_HOSTNAME", "localhost")