Performance optimizations (#150)
* Refactor OperationIndex to reduce number of database transactions

* Some docs

* Do the same for BigMapIndex

* Docs

* Changelog

* Fix index state not being updated

* Refactor index module

* Fix fire_handler calls

* Fix changelog

* Lint

* `LazyOperationFetcher` helper 🤔

* Test utils index matcher benchmark
droserasprout authored Oct 12, 2021
1 parent 5b2ba98 commit 17f8699
Showing 7 changed files with 199 additions and 42 deletions.
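
The core change in `src/dipdup/index.py`: instead of wrapping the whole matching loop in a database transaction, each index first matches incoming data against handler patterns outside of any transaction, then commits all matched handler calls and the index-state update in a single transaction per level. A minimal sketch of that flow, with hypothetical stand-ins rather than the actual dipdup internals:

```python
from contextlib import asynccontextmanager
from typing import Any, Deque, Tuple


@asynccontextmanager
async def in_global_transaction():
    """Placeholder for dipdup's single-transaction wrapper; the real import is an assumption here."""
    yield


async def process_level(index: Any, operations: Tuple[Any, ...]) -> None:
    """Sketch of the refactored flow: match first, commit once per level."""
    level = operations[0].level

    # Matching is pure computation; no database writes happen at this point.
    matched: Deque[Any] = await index._match_operations(operations)

    # Nothing matched: just bump the index level, no transaction needed.
    if not matched:
        await index.state.update_status(level=level)
        return

    # A single transaction per level covers all handler calls and the state bump.
    async with in_global_transaction():
        for operation_subgroup, handler_config, args in matched:
            await index._call_matched_handler(handler_config, operation_subgroup, args)
        await index.state.update_status(level=level)
```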
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -19,6 +19,7 @@
### Improved

* Raise `DatabaseConfigurationError` exception when project models are not compatible with GraphQL.
* Another bunch of performance optimizations. Reduced DB pressure, sped up parallel processing of multiple indexes.

## 3.0.4 - 2021-10-04

6 changes: 3 additions & 3 deletions poetry.lock

Some generated files are not rendered by default.

3 changes: 2 additions & 1 deletion scripts/commit_benchmarks.sh
@@ -1 +1,2 @@
cp tests/benchmarks/config.latest.json tests/benchmarks/config.json
cp tests/benchmarks/config.latest.json tests/benchmarks/config.json
cp tests/benchmarks/index.latest.json tests/benchmarks/index.json
8 changes: 5 additions & 3 deletions scripts/run_benchmarks.sh
@@ -1,3 +1,5 @@
rm tests/benchmarks/config.latest.json
python tests/benchmarks/config.py -o tests/benchmarks/config.latest.json
python -m pyperf compare_to --table tests/benchmarks/config.json tests/benchmarks/config.latest.json
for i in config index; do
rm tests/benchmarks/$i.latest.json;
python tests/benchmarks/$i.py -o tests/benchmarks/$i.latest.json;
python -m pyperf compare_to --table tests/benchmarks/$i.json tests/benchmarks/$i.latest.json;
done;
127 changes: 92 additions & 35 deletions src/dipdup/index.py
@@ -1,6 +1,6 @@
from abc import abstractmethod
from collections import defaultdict, deque, namedtuple
from typing import Deque, Dict, Iterable, List, Optional, Set, Tuple, Union, cast
from typing import Deque, Dict, Iterable, Optional, Sequence, Set, Tuple, Union, cast

from pydantic.error_wrappers import ValidationError

@@ -9,6 +9,7 @@
BigMapHandlerConfig,
BigMapIndexConfig,
ContractConfig,
HeadHandlerConfig,
HeadIndexConfig,
OperationHandlerConfig,
OperationHandlerOriginationPatternConfig,
@@ -32,12 +33,20 @@
SingleLevelRollback = namedtuple('SingleLevelRollback', ('level'))
Operations = Tuple[OperationData, ...]
OperationQueueItemT = Union[Operations, SingleLevelRollback]
OperationHandlerArgumentT = Optional[Union[Transaction, Origination, OperationData]]
MatchedOperationsT = Tuple[OperationSubgroup, OperationHandlerConfig, Deque[OperationHandlerArgumentT]]
MatchedBigMapsT = Tuple[BigMapHandlerConfig, BigMapDiff]

# NOTE: For initializing the index state on startup
block_cache: Dict[int, BlockData] = {}


class Index:
"""Base class for index implementations
Provides common interface for managing index state and switching between sync and realtime modes.
"""

_queue: Deque

def __init__(self, ctx: DipDupContext, config: ResolvedIndexConfigT, datasource: TzktDatasource) -> None:
@@ -256,10 +265,18 @@ async def _process_level_operations(self, operations: Tuple[OperationData, ...])
elif level < self.state.level:
raise RuntimeError(f'Level of operation batch must be higher than index state level: {level} < {self.state.level}')

self._logger.info('Processing %s operations of level %s', len(operations), level)
matched_subgroups = await self._match_operations(operations)

# NOTE: We still need to bump index level but don't care if it will be done in existing transaction
if not matched_subgroups:
await self.state.update_status(level=level)
return

async with in_global_transaction():
self._logger.info('Processing %s operations of level %s', len(operations), level)
await self._process_operations(operations)
await self.state.update_status(self.state.status, level)
for operation_subgroup, handler_config, args in matched_subgroups:
await self._call_matched_handler(handler_config, operation_subgroup, args)
await self.state.update_status(level=level)

async def _match_operation(self, pattern_config: OperationHandlerPatternConfigT, operation: OperationData) -> bool:
"""Match single operation with pattern"""
@@ -294,9 +311,10 @@ async def _match_operation(self, pattern_config: OperationHandlerPatternConfigT,
else:
raise NotImplementedError

async def _process_operations(self, operations: Iterable[OperationData]) -> None:
async def _match_operations(self, operations: Iterable[OperationData]) -> Deque[MatchedOperationsT]:
"""Try to match operations in cache with all patterns from indexes. Must be wrapped in transaction."""
self._head_hashes = set()
self._head_hashes.clear()
matched_subgroups: Deque[MatchedOperationsT] = deque()
operation_subgroups: Dict[OperationSubgroup, Deque[OperationData]] = defaultdict(deque)
for operation in operations:
key = OperationSubgroup(operation.hash, operation.counter)
@@ -311,7 +329,7 @@ async def _process_operations(self, operations: Iterable[OperationData]) -> None
pattern_idx = 0
matched_operations: Deque[Optional[OperationData]] = deque()

# TODO: Ensure complex cases work, for ex. required argument after optional one
# TODO: Ensure complex cases work, e.g. when optional argument is followed by required one
# TODO: Add None to matched_operations where applicable (pattern is optional and operation not found)
while operation_idx < len(operations):
operation, pattern_config = operations[operation_idx], handler_config.pattern[pattern_idx]
@@ -335,26 +353,29 @@ async def _process_operations(self, operations: Iterable[OperationData]) -> None
operation_idx += 1

if pattern_idx == len(handler_config.pattern):
await self._on_match(operation_subgroup, handler_config, matched_operations)
self._logger.info('%s: `%s` handler matched!', operation_subgroup.hash, handler_config.callback)

args = await self._prepare_handler_args(handler_config, matched_operations)
matched_subgroups.append((operation_subgroup, handler_config, args))

matched_operations.clear()
pattern_idx = 0

if len(matched_operations) >= sum(map(lambda x: 0 if x.optional else 1, handler_config.pattern)):
await self._on_match(operation_subgroup, handler_config, matched_operations)
self._logger.info('%s: `%s` handler matched!', operation_subgroup.hash, handler_config.callback)

args = await self._prepare_handler_args(handler_config, matched_operations)
matched_subgroups.append((operation_subgroup, handler_config, args))

return matched_subgroups

async def _on_match(
async def _prepare_handler_args(
self,
operation_subgroup: OperationSubgroup,
handler_config: OperationHandlerConfig,
matched_operations: Deque[Optional[OperationData]],
):
"""Prepare handler arguments, parse parameter and storage. Schedule callback in executor."""
self._logger.info('%s: `%s` handler matched!', operation_subgroup.hash, handler_config.callback)
if not handler_config.parent:
raise ConfigInitializationException

args: List[Optional[Union[Transaction, Origination, OperationData]]] = []
) -> Deque[OperationHandlerArgumentT]:
"""Prepare handler arguments, parse parameter and storage."""
args: Deque[OperationHandlerArgumentT] = deque()
for pattern_config, operation in zip(handler_config.pattern, matched_operations):
if operation is None:
args.append(None)
@@ -393,6 +414,14 @@ async def _on_match(
else:
raise NotImplementedError

return args

async def _call_matched_handler(
self, handler_config: OperationHandlerConfig, operation_subgroup: OperationSubgroup, args: Sequence[OperationHandlerArgumentT]
) -> None:
if not handler_config.parent:
raise ConfigInitializationException

await self._ctx.fire_handler(
handler_config.callback,
handler_config.parent.name,
@@ -477,15 +506,25 @@ async def _synchronize(self, last_level: int, cache: bool = False) -> None:
await self._exit_sync_state(last_level)

async def _process_level_big_maps(self, big_maps: Tuple[BigMapData, ...]):
if not big_maps:
return
level = self._extract_level(big_maps)

# NOTE: le operator because single level rollbacks are not supported
if level <= self.state.level:
raise RuntimeError(f'Level of big map batch must be higher than index state level: {level} <= {self.state.level}')

self._logger.info('Processing %s big map diffs of level %s', len(big_maps), level)
matched_big_maps = await self._match_big_maps(big_maps)

# NOTE: We still need to bump index level but don't care if it will be done in existing transaction
if not matched_big_maps:
await self.state.update_status(level=level)
return

async with in_global_transaction():
self._logger.info('Processing %s big map diffs of level %s', len(big_maps), level)
await self._process_big_maps(big_maps)
for handler_config, big_map_diff in matched_big_maps:
await self._call_matched_handler(handler_config, big_map_diff)
await self.state.update_status(level=level)

async def _match_big_map(self, handler_config: BigMapHandlerConfig, big_map: BigMapData) -> bool:
@@ -496,11 +535,11 @@ async def _match_big_map(self, handler_config: BigMapHandlerConfig, big_map: Big
return False
return True

async def _on_match(
async def _prepare_handler_args(
self,
handler_config: BigMapHandlerConfig,
matched_big_map: BigMapData,
) -> None:
) -> BigMapDiff:
"""Prepare handler arguments, parse key and value. Schedule callback in executor."""
self._logger.info('%s: `%s` handler matched!', matched_big_map.operation_id, handler_config.callback)
if not handler_config.parent:
@@ -524,13 +563,30 @@ async def _on_match(
else:
value = None

big_map_diff = BigMapDiff( # type: ignore
return BigMapDiff(
data=matched_big_map,
action=matched_big_map.action,
key=key,
value=value,
)

async def _match_big_maps(self, big_maps: Iterable[BigMapData]) -> Deque[MatchedBigMapsT]:
"""Try to match big map diffs in cache with all patterns from indexes."""
matched_big_maps: Deque[MatchedBigMapsT] = deque()

for big_map in big_maps:
for handler_config in self._config.handlers:
big_map_matched = await self._match_big_map(handler_config, big_map)
if big_map_matched:
arg = await self._prepare_handler_args(handler_config, big_map)
matched_big_maps.append((handler_config, arg))

return matched_big_maps

async def _call_matched_handler(self, handler_config: BigMapHandlerConfig, big_map_diff: BigMapDiff) -> None:
if not handler_config.parent:
raise ConfigInitializationException

await self._ctx.fire_handler(
handler_config.callback,
handler_config.parent.name,
Expand All @@ -540,15 +596,6 @@ async def _on_match(
big_map_diff,
)

async def _process_big_maps(self, big_maps: Iterable[BigMapData]) -> None:
"""Try to match big map diffs in cache with all patterns from indexes."""

for big_map in big_maps:
for handler_config in self._config.handlers:
big_map_matched = await self._match_big_map(handler_config, big_map)
if big_map_matched:
await self._on_match(handler_config, big_map)

async def _get_big_map_addresses(self) -> Set[str]:
"""Get addresses to fetch big map diffs from during initial synchronization"""
addresses = set()
@@ -587,10 +634,20 @@ async def _process_queue(self) -> None:
async with in_global_transaction():
self._logger.info('Processing head info of level %s', level)
for handler_config in self._config.handlers:
if not handler_config.parent:
raise ConfigInitializationException
await self._ctx.fire_handler(handler_config.callback, handler_config.parent.name, self.datasource, head.hash, head)
await self._call_matched_handler(handler_config, head)
await self.state.update_status(level=level)

async def _call_matched_handler(self, handler_config: HeadHandlerConfig, head: HeadBlockData) -> None:
if not handler_config.parent:
raise ConfigInitializationException

await self._ctx.fire_handler(
handler_config.callback,
handler_config.parent.name,
self.datasource,
head.hash,
(head,),
)

def push_head(self, head: HeadBlockData) -> None:
self._queue.append(head)
56 changes: 56 additions & 0 deletions src/dipdup/test/__init__.py
@@ -0,0 +1,56 @@
import logging
from collections import deque
from contextlib import contextmanager
from typing import AsyncGenerator, Deque, Iterator, Tuple
from unittest.mock import patch

from dipdup.datasources.tzkt.datasource import OperationFetcher
from dipdup.index import OperationIndex
from dipdup.models import OperationData

logging.basicConfig(level=logging.ERROR)


# NOTE: Not an actual fuzzer :)
class OperationFetcherFuzzer(OperationFetcher):
"""This thing is lazy, so instead of fetching all operations it returns the same data over and over again."""

levels: int
repeats: int

def __new__(cls, *a, **kw):
super().__new__(cls, *a, **kw)
cls.levels = 100
cls.repeats = 100

async def fetch_operations_by_level(self) -> AsyncGenerator[Tuple[int, Tuple[OperationData, ...]], None]:
self._datasource._http._config.batch_size = 1000
level_operations: Deque[Tuple[int, Tuple[OperationData, ...]]] = deque()
async for level, operations in super().fetch_operations_by_level():
level_operations.append((level, operations))
if len(level_operations) >= self.levels:
break

for _ in range(self.repeats):
for level, operations in level_operations:
yield level, operations


class OperationIndexFuzzer(OperationIndex):
async def _process_level_operations(self, operations: Tuple[OperationData, ...]) -> None:
await self._match_operations(operations)


@contextmanager
def with_operation_fetcher_fuzzer(levels=100, repeats=100) -> Iterator[None]:
OperationFetcherFuzzer.levels = levels
OperationFetcherFuzzer.repeats = repeats
with patch('dipdup.datasources.tzkt.datasource.OperationFetcher', OperationFetcherFuzzer):
yield


@contextmanager
def with_operation_index_fuzzer(levels=100, repeats=100) -> Iterator[None]:
with with_operation_fetcher_fuzzer(levels=levels, repeats=repeats):
with patch('dipdup.index.OperationIndex', OperationIndexFuzzer):
yield
40 changes: 40 additions & 0 deletions tests/benchmarks/index.py
@@ -0,0 +1,40 @@
import asyncio
from contextlib import suppress
from os.path import dirname, join

import pyperf # type: ignore

from dipdup.config import DipDupConfig
from dipdup.dipdup import DipDup
from dipdup.exceptions import ReindexingRequiredError
from dipdup.test import with_operation_index_fuzzer


def add_cmdline_args(cmd, args):
cmd += ['--quiet']


runner = pyperf.Runner(add_cmdline_args=add_cmdline_args)


paths = [
join(dirname(__file__), '..', 'integration_tests', name)
for name in [
'hic_et_nunc.yml',
]
]


async def _match():
for path in paths:
config = DipDupConfig.load([path])
config.database.path = ':memory:'
config.initialize()

with with_operation_index_fuzzer(10, 3):
dipdup = DipDup(config)
with suppress(ReindexingRequiredError):
await dipdup.run(True, True)


runner.bench_func('index_match_operations', lambda: asyncio.run(_match()))
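
For a quick local replay outside pyperf, the new `dipdup.test` helpers can patch the fetcher directly. A rough sketch, assuming an existing project config (the `dipdup.yml` path below is hypothetical):

```python
import asyncio
from contextlib import suppress

from dipdup.config import DipDupConfig
from dipdup.dipdup import DipDup
from dipdup.exceptions import ReindexingRequiredError
from dipdup.test import with_operation_fetcher_fuzzer


async def replay() -> None:
    # Hypothetical config path; any config with an operation index should do.
    config = DipDupConfig.load(['dipdup.yml'])
    config.database.path = ':memory:'
    config.initialize()

    # Fetch 10 levels once, then replay them 3 times through the real index code.
    with with_operation_fetcher_fuzzer(levels=10, repeats=3):
        dipdup = DipDup(config)
        with suppress(ReindexingRequiredError):
            await dipdup.run(True, True)


asyncio.run(replay())
```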
