From 585729e667fce63a8bc46eb662b784588680cc16 Mon Sep 17 00:00:00 2001
From: Daniel Kantor
Date: Fri, 6 Oct 2023 23:17:41 +0200
Subject: [PATCH] perf: improve performance for larger repositories

* chore: decrease queue get timeout

* perf: avoid checking for new files too often

* perf: cache regex-based sorting
---
 seagoat/queue/base_queue.py |  2 +-
 seagoat/queue/task_queue.py | 13 +++++++++++++
 seagoat/result.py           | 26 +++++++++++++++++---------
 3 files changed, 31 insertions(+), 10 deletions(-)

diff --git a/seagoat/queue/base_queue.py b/seagoat/queue/base_queue.py
index 55856c80..3d631921 100644
--- a/seagoat/queue/base_queue.py
+++ b/seagoat/queue/base_queue.py
@@ -73,7 +73,7 @@ def _worker_function(self):
 
         while True:
             try:
-                task = self._task_queue.get(timeout=1)
+                task = self._task_queue.get(timeout=0.1)
                 if task.name == "shutdown":
                     break
                 self._handle_task(context, task)
diff --git a/seagoat/queue/task_queue.py b/seagoat/queue/task_queue.py
index cb7f6654..d5976ddc 100644
--- a/seagoat/queue/task_queue.py
+++ b/seagoat/queue/task_queue.py
@@ -1,6 +1,7 @@
 # pylint: disable=import-outside-toplevel
 import logging
 import math
+import time
 
 import orjson
 
@@ -9,6 +10,9 @@
 from seagoat.queue.base_queue import LOW_PRIORITY
 
 
+SECONDS_BETWEEN_MAINTENANCE = 10
+
+
 def calculate_accuracy(chunks_analyzed: int, total_chunks: int) -> int:
     if total_chunks == 0 or total_chunks - chunks_analyzed == 0:
         return 100
@@ -40,9 +44,18 @@ def _get_context(self):
 
         seagoat_engine = Engine(self.kwargs["repo_path"])
         context["seagoat_engine"] = seagoat_engine
+        context["last_maintenance"] = None
         return context
 
     def handle_maintenance(self, context):
+        if (
+            context["last_maintenance"] is not None
+            and time.time() - context["last_maintenance"] < SECONDS_BETWEEN_MAINTENANCE
+        ):
+            return
+
+        context["last_maintenance"] = time.time()
+
         if self._task_queue.qsize() > 0:
             return
 
diff --git a/seagoat/result.py b/seagoat/result.py
index b74efa8b..4932b88a 100644
--- a/seagoat/result.py
+++ b/seagoat/result.py
@@ -1,4 +1,5 @@
 # pylint: disable=too-few-public-methods
+import functools
 import re
 from collections import Counter
 from dataclasses import dataclass
@@ -12,6 +13,19 @@
 from seagoat.utils.file_types import get_file_penalty_factor
 
 
+SPLITTER_PATTERN = re.compile(r"\s+")
+
+
+@functools.cache
+def get_number_of_exact_matches(line: str, query: str):
+    terms = re.split(SPLITTER_PATTERN, query)
+    pattern = ".*".join(map(re.escape, terms))
+
+    if re.search(pattern, line, re.IGNORECASE):
+        return 1
+    return 0
+
+
 class ResultLineType(Enum):
     RESULT = "result"
     CONTEXT = "context"
@@ -27,16 +41,10 @@ class ResultLine:
     line_text: str
     types: Set[ResultLineType]
 
-    def _get_number_of_exact_matches(self, query: str) -> int:
-        terms = re.split(r"\s+", query)
-        pattern = ".*".join(map(re.escape, terms))
-
-        if re.search(pattern, self.line_text, re.IGNORECASE):
-            return 1
-        return 0
-
     def get_score(self, query: str) -> float:
-        return self.vector_distance / (1 + self._get_number_of_exact_matches(query))
+        return self.vector_distance / (
+            1 + get_number_of_exact_matches(self.line_text, query)
+        )
 
     def add_type(self, type_: ResultLineType) -> None:
         self.types.add(type_)