Skip to content

Commit

Permalink
perf: improve performance for larger repositories
Browse files Browse the repository at this point in the history
* chore: decrease queue get timeout

* perf: avoid checking for new files too often

* perf: cache regex-based sorting
  • Loading branch information
kantord committed Oct 6, 2023
1 parent 511f01f commit 585729e
Show file tree
Hide file tree
Showing 3 changed files with 31 additions and 10 deletions.
2 changes: 1 addition & 1 deletion seagoat/queue/base_queue.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ def _worker_function(self):

while True:
try:
task = self._task_queue.get(timeout=1)
task = self._task_queue.get(timeout=0.1)
if task.name == "shutdown":
break
self._handle_task(context, task)
Expand Down
13 changes: 13 additions & 0 deletions seagoat/queue/task_queue.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# pylint: disable=import-outside-toplevel
import logging
import math
import time

import orjson

Expand All @@ -9,6 +10,9 @@
from seagoat.queue.base_queue import LOW_PRIORITY


SECONDS_BETWEEN_MAINTENANCE = 10


def calculate_accuracy(chunks_analyzed: int, total_chunks: int) -> int:
if total_chunks == 0 or total_chunks - chunks_analyzed == 0:
return 100
Expand Down Expand Up @@ -40,9 +44,18 @@ def _get_context(self):

seagoat_engine = Engine(self.kwargs["repo_path"])
context["seagoat_engine"] = seagoat_engine
context["last_maintenance"] = None
return context

def handle_maintenance(self, context):
if (
context["last_maintenance"] is not None
and time.time() - context["last_maintenance"] < SECONDS_BETWEEN_MAINTENANCE
):
return

context["last_maintenance"] = time.time()

if self._task_queue.qsize() > 0:
return

Expand Down
26 changes: 17 additions & 9 deletions seagoat/result.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
# pylint: disable=too-few-public-methods
import functools
import re
from collections import Counter
from dataclasses import dataclass
Expand All @@ -12,6 +13,19 @@
from seagoat.utils.file_types import get_file_penalty_factor


# Whitespace run used to break a query into its individual search terms.
SPLITTER_PATTERN = re.compile(r"\s+")


# The cache is bounded so it cannot grow without limit: the key space is
# every distinct (line, query) pair ever scored. lru_cache evicts the least
# recently used entries once maxsize is reached.
@functools.lru_cache(maxsize=65536)
def get_number_of_exact_matches(line: str, query: str) -> int:
    """Return 1 if every term of *query* occurs in *line*, in order, else 0.

    The query is split on whitespace, each term is escaped so that it is
    matched literally, and the terms are joined with ``.*`` so that any text
    may appear between consecutive terms. Matching is case-insensitive.

    Args:
        line: Text of the line being scored.
        query: Raw search query; terms are separated by whitespace.

    Returns:
        1 when the ordered terms are found in the line, otherwise 0.
    """
    terms = SPLITTER_PATTERN.split(query)
    pattern = ".*".join(map(re.escape, terms))

    return 1 if re.search(pattern, line, re.IGNORECASE) else 0


class ResultLineType(Enum):
RESULT = "result"
CONTEXT = "context"
Expand All @@ -27,16 +41,10 @@ class ResultLine:
line_text: str
types: Set[ResultLineType]

def _get_number_of_exact_matches(self, query: str) -> int:
terms = re.split(r"\s+", query)
pattern = ".*".join(map(re.escape, terms))

if re.search(pattern, self.line_text, re.IGNORECASE):
return 1
return 0

def get_score(self, query: str) -> float:
return self.vector_distance / (1 + self._get_number_of_exact_matches(query))
return self.vector_distance / (
1 + get_number_of_exact_matches(self.line_text, query)
)

def add_type(self, type_: ResultLineType) -> None:
self.types.add(type_)
Expand Down

0 comments on commit 585729e

Please sign in to comment.