diff --git a/scratch/binary_search.py b/scratch/binary_search.py
deleted file mode 100644
index 0108d66..0000000
--- a/scratch/binary_search.py
+++ /dev/null
@@ -1,62 +0,0 @@
-import os
-import random
-import sys
-import time
-
-from orso.tools import random_string
-
-from draken.compiled import StringBinaryIndex
-
-sys.path.insert(1, os.path.join(sys.path[0], ".."))
-
-
-# Performance test harness
-def performance_test():
-    index = StringBinaryIndex()
-
-    # Generate 10 million items for about 50,000 keys
-    num_keys = 50000
-    num_items = 10000000
-
-    keys = [random_string(10) for _ in range(num_keys)]
-    filenames = [random_string(10) for _ in range(num_items)]
-
-    start_time = time.time()
-    for i in range(num_items):
-        key = random.choice(keys)
-        filename = filenames[i]
-        offset = random.randint(0, 1000000)
-        index.add_entry(key, filename, offset)
-
-    print(f"Time to add entries: {time.time() - start_time:.4f} seconds")
-
-    start_time = time.time()
-    index.finalize_index()
-    print(f"Time to finalize index: {time.time() - start_time:.4f} seconds")
-
-    # Perform lookup_eq
-    lookup_key = random.choice(keys)
-    start_time = time.monotonic_ns()
-    result_eq = index.lookup_eq(lookup_key)
-    print(f"Time for lookup_eq: {(time.monotonic_ns() - start_time)/1e6:.4f} milliseconds")
-    print(f"Results for lookup_eq: {len(result_eq)} items")
-
-    # Perform lookup_in_list
-    lookup_keys = random.sample(keys, 100)
-    start_time = time.monotonic_ns()
-    result_in_list = index.lookup_in_list(lookup_keys)
-    print(f"Time for lookup_in_list: {(time.monotonic_ns() - start_time)/1e6:.4f} milliseconds")
-    print(f"Results for lookup_in_list: {sum(len(v) for v in result_in_list.values())} items")
-
-    # Perform lookup_range
-    start_key = random.choice(keys)
-    end_key = random.choice(keys)
-    start_key, end_key = min(start_key, end_key), max(start_key, end_key)
-    start_time = time.time()
-    result_range = index.lookup_range(start_key, end_key)
-    print(f"Time for lookup_range: {time.time() - start_time:.4f} seconds")
-    print(f"Results for lookup_range: {sum(len(v) for v in result_range.values())} items")
-
-
-if __name__ == "__main__":
-    performance_test()
diff --git a/scratch/bloom.py b/scratch/bloom.py
deleted file mode 100644
index 453de54..0000000
--- a/scratch/bloom.py
+++ /dev/null
@@ -1,27 +0,0 @@
-import os
-import sys
-
-from draken.compiled.bloom_filter import BloomFilter
-from draken.compiled.bloom_filter import deserialize
-
-sys.path.insert(1, os.path.join(sys.path[0], ".."))
-
-
-data = {
-    "term1": {"doc_id": 1, "frequency": 3, "positions": [1, 5, 9]},
-    "term2": {"doc_id": 2, "frequency": 2, "positions": [3, 7]},
-    "term3": {"doc_id": 1, "frequency": 1, "positions": [4]},
-    "term4": {"doc_id": 3, "frequency": 4, "positions": [2, 6, 8, 10]},
-    "term5": {"doc_id": 2, "frequency": 1, "positions": [11]},
-}
-
-bf = BloomFilter()
-
-for rec in data.keys():
-    bf.add(hash(rec))
-
-ser = bf.serialize()
-de = deserialize(bytes(ser))
-
-print(de.possibly_contains(hash("term2")))
-print(de.possibly_contains(hash("term9")))
diff --git a/scratch/had.py b/scratch/had.py
deleted file mode 100644
index c26fda2..0000000
--- a/scratch/had.py
+++ /dev/null
@@ -1,35 +0,0 @@
-import os
-import sys
-
-import opteryx
-import ormsgpack
-import zstd
-
-from draken.compiled import create_sstable
-from draken.compiled import match_equals
-from draken.compiled import murmurhash3
-
-sys.path.insert(1, os.path.join(sys.path[0], ".."))
-
-
-def generate_sample_data():
-    data = {}
-    df = opteryx.query("SELECT name FROM $astronauts")
-    for i, r in enumerate(df):
-        data[r[0]] = [("$astronauts", i)]
-
-    return data
-
-
-sample_data = generate_sample_data()
-
-
-sstable_bytes = create_sstable(sample_data, {}, 0)
-print(f"SSTable created with size: {len(sstable_bytes)} bytes")
-
-term_to_lookup = b"Anthony W. England"
-result = match_equals(sstable_bytes, term_to_lookup)
-if result:
-    print(f"Lookup result for '{term_to_lookup}': {result}")
-else:
-    print(f"'{term_to_lookup}' not found in SSTable")
diff --git a/scratch/hasher.py b/scratch/hasher.py
deleted file mode 100644
index d8304c8..0000000
--- a/scratch/hasher.py
+++ /dev/null
@@ -1,29 +0,0 @@
-import os
-import sys
-
-from orso.cityhash import CityHash32
-from orso.tools import monitor
-from orso.tools import random_string
-
-from draken.compiled import murmurhash3
-
-sys.path.insert(1, os.path.join(sys.path[0], ".."))
-
-
-strings = [random_string(2048) for i in range(1_000_000)]
-
-
-@monitor()
-def do_city():
-    [CityHash32(s) for s in strings]
-
-
-@monitor()
-def do_mur():
-    [murmurhash3(s) for s in strings]
-
-
-print("city")
-do_city()
-print("mur")
-do_mur()
diff --git a/scratch/skip_list.py b/scratch/skip_list.py
deleted file mode 100644
index f4dad8f..0000000
--- a/scratch/skip_list.py
+++ /dev/null
@@ -1,89 +0,0 @@
-import random
-from typing import Any
-from typing import Optional
-from typing import Tuple
-
-
-class SkipNode:
-    def __init__(self, key: Any, value: Any, level: int):
-        self.key = key
-        self.value = value
-        self.forward = [None] * (level + 1)
-
-
-class SkipList:
-    def __init__(self, max_level: int):
-        self.max_level = max_level
-        self.head = SkipNode(None, None, max_level)
-        self.level = 0
-
-    def random_level(self) -> int:
-        level = 0
-        while random.random() < 0.5 and level < self.max_level:
-            level += 1
-        return level
-
-    def search(self, key: Any) -> Optional[Tuple[Any, Any]]:
-        current = self.head
-        for i in range(self.level, -1, -1):
-            while current.forward[i] and current.forward[i].key < key:
-                current = current.forward[i]
-        current = current.forward[0]
-        if current and current.key == key:
-            return current.key, current.value
-        return None
-
-    def insert(self, key: Any, value: Any) -> None:
-        update = [None] * (self.max_level + 1)
-        current = self.head
-
-        for i in range(self.level, -1, -1):
-            while current.forward[i] and current.forward[i].key < key:
-                current = current.forward[i]
-            update[i] = current
-
-        level = self.random_level()
-        if level > self.level:
-            for i in range(self.level + 1, level + 1):
-                update[i] = self.head
-            self.level = level
-
-        new_node = SkipNode(key, value, level)
-        for i in range(level + 1):
-            new_node.forward[i] = update[i].forward[i]
-            update[i].forward[i] = new_node
-
-    def delete(self, key: Any) -> None:
-        update = [None] * (self.max_level + 1)
-        current = self.head
-
-        for i in range(self.level, -1, -1):
-            while current.forward[i] and current.forward[i].key < key:
-                current = current.forward[i]
-            update[i] = current
-
-        current = current.forward[0]
-        if current and current.key == key:
-            for i in range(self.level + 1):
-                if update[i].forward[i] != current:
-                    break
-                update[i].forward[i] = current.forward[i]
-
-            while self.level > 0 and not self.head.forward[self.level]:
-                self.level -= 1
-
-
-# Example usage
-skip_list = SkipList(3)
-skip_list.insert(3, "three")
-skip_list.insert(6, "six")
-skip_list.insert(7, "seven")
-skip_list.insert(9, "nine")
-skip_list.insert(12, "twelve")
-skip_list.insert(19, "nineteen")
-
-print(skip_list.search(6))  # Output: (6, 'six')
-print(skip_list.search(15))  # Output: None
-
-skip_list.delete(6)
-print(skip_list.search(6))  # Output: None
diff --git a/scratch/uwheel.py b/scratch/uwheel.py
deleted file mode 100644
index b3cfc3d..0000000
--- a/scratch/uwheel.py
+++ /dev/null
@@ -1,228 +0,0 @@
-from collections import defaultdict
-from datetime import datetime
-from datetime import timedelta
-from typing import Dict
-from typing import List
-from typing import Tuple
-
-import opteryx
-
-
-class MutablePartialAggregate:
-    def __init__(
-        self,
-        count: int = 0,
-        sum: float = 0.0,
-        min_: float = float("inf"),
-        max_: float = float("-inf"),
-        missing: int = 0,
-    ):
-        self.count = count
-        self.sum = sum
-        self.min = min_
-        self.max = max_
-        self.missing = missing
-
-
-class PartialAggregate:
-    def __init__(self, count: int, sum: float, min_: float, max_: float, missing: int):
-        self.count = count
-        self.sum = sum
-        self.min = min_
-        self.max = max_
-        self.missing = missing
-
-
-class Aggregate:
-    def __init__(self, count: int, sum: float, avg: float, min_: float, max_: float, missing: int):
-        self.count = count
-        self.sum = sum
-        self.avg = avg
-        self.min = min_
-        self.max = max_
-        self.missing = missing
-
-
-class AggregationFramework:
-    def lift(self, input_data: float) -> MutablePartialAggregate:
-        return MutablePartialAggregate(
-            count=1, sum=input_data, min_=input_data, max_=input_data, missing=input_data
-        )
-
-    def combine_mutable(self, mutable: MutablePartialAggregate, input_data: float) -> None:
-        mutable.count += 1
-        if input_data is None:
-            mutable.missing += 1
-        else:
-            mutable.sum += input_data
-            mutable.min = min(mutable.min, input_data)
-            mutable.max = max(mutable.max, input_data)
-
-    def freeze(self, mutable: MutablePartialAggregate) -> PartialAggregate:
-        return PartialAggregate(
-            count=mutable.count,
-            sum=mutable.sum,
-            min_=mutable.min,
-            max_=mutable.max,
-            missing=mutable.missing,
-        )
-
-    def combine(self, a: PartialAggregate, b: PartialAggregate) -> PartialAggregate:
-        return PartialAggregate(
-            count=a.count + b.count,
-            sum=a.sum + b.sum,
-            min_=min(a.min, b.min),
-            max_=max(a.max, b.max),
-            missing=a.missing + b.missing,
-        )
-
-    def lower(self, partial: PartialAggregate) -> Aggregate:
-        avg = partial.sum / partial.count if partial.count != 0 else 0
-        return Aggregate(
-            count=partial.count,
-            sum=partial.sum,
-            avg=avg,
-            min_=partial.min,
-            max_=partial.max,
-            missing=partial.missing,
-        )
-
-
-class HierarchicalAggregateWheel:
-    def __init__(self):
-        self.seconds = defaultdict(MutablePartialAggregate)
-        self.minutes = defaultdict(MutablePartialAggregate)
-        self.hours = defaultdict(MutablePartialAggregate)
-        self.days = defaultdict(MutablePartialAggregate)
-        self.weeks = defaultdict(MutablePartialAggregate)
-        self.years = defaultdict(MutablePartialAggregate)
-
-    def aggregate(self, timestamp: datetime, value: float, framework: AggregationFramework):
-        # Aggregate by second
-        second_key = timestamp.replace(microsecond=0)
-        if second_key not in self.seconds:
-            self.seconds[second_key] = MutablePartialAggregate()
-        framework.combine_mutable(self.seconds[second_key], value)
-
-        # Aggregate by minute
-        minute_key = second_key.replace(second=0)
-        if minute_key not in self.minutes:
-            self.minutes[minute_key] = MutablePartialAggregate()
-        framework.combine_mutable(self.minutes[minute_key], value)
-
-        # Aggregate by hour
-        hour_key = minute_key.replace(minute=0)
-        if hour_key not in self.hours:
-            self.hours[hour_key] = MutablePartialAggregate()
-        framework.combine_mutable(self.hours[hour_key], value)
-
-        # Aggregate by day
-        day_key = hour_key.replace(hour=0)
-        if day_key not in self.days:
-            self.days[day_key] = MutablePartialAggregate()
-        framework.combine_mutable(self.days[day_key], value)
-
-        # Aggregate by week
-        week_key = day_key - timedelta(days=day_key.weekday())
-        if week_key not in self.weeks:
-            self.weeks[week_key] = MutablePartialAggregate()
-        framework.combine_mutable(self.weeks[week_key], value)
-
-        # Aggregate by year
-        year_key = day_key.replace(month=1, day=1)
-        if year_key not in self.years:
-            self.years[year_key] = MutablePartialAggregate()
-        framework.combine_mutable(self.years[year_key], value)
-
-    def freeze(self, framework: AggregationFramework):
-        self.seconds = {k: framework.freeze(v) for k, v in self.seconds.items()}
-        self.minutes = {k: framework.freeze(v) for k, v in self.minutes.items()}
-        self.hours = {k: framework.freeze(v) for k, v in self.hours.items()}
-        self.days = {k: framework.freeze(v) for k, v in self.days.items()}
-        self.weeks = {k: framework.freeze(v) for k, v in self.weeks.items()}
-        self.years = {k: framework.freeze(v) for k, v in self.years.items()}
-
-    def combine_wheels(self, other, framework: AggregationFramework):
-        self.seconds = self._combine_dicts(self.seconds, other.seconds, framework)
-        self.minutes = self._combine_dicts(self.minutes, other.minutes, framework)
-        self.hours = self._combine_dicts(self.hours, other.hours, framework)
-        self.days = self._combine_dicts(self.days, other.days, framework)
-        self.weeks = self._combine_dicts(self.weeks, other.weeks, framework)
-        self.years = self._combine_dicts(self.years, other.years, framework)
-
-    def _combine_dicts(self, dict_a, dict_b, framework: AggregationFramework):
-        combined = defaultdict(lambda: PartialAggregate(0, 0.0))
-        for k, v in dict_a.items():
-            combined[k] = v
-        for k, v in dict_b.items():
-            if k in combined:
-                combined[k] = framework.combine(combined[k], v)
-            else:
-                combined[k] = v
-        return combined
-
-
-class µWheelIndex:
-    def __init__(self, framework: AggregationFramework):
-        self.framework = framework
-        self.haw = HierarchicalAggregateWheel()
-
-    def build_index_for_file(self, data: List[Tuple[datetime, float]]):
-        for timestamp, value in data:
-            self.haw.aggregate(timestamp, value, self.framework)
-        self.haw.freeze(self.framework)
-        return self.haw
-
-    def combine_indices(self, haw_a: HierarchicalAggregateWheel, haw_b: HierarchicalAggregateWheel):
-        haw_a.combine_wheels(haw_b, self.framework)
-        return haw_a
-
-    def query(self, granularity: str, start: datetime, end: datetime):
-        aggregates = []
-        if granularity == "second":
-            aggregates = self._query_range(self.haw.seconds, start, end)
-        elif granularity == "minute":
-            aggregates = self._query_range(self.haw.minutes, start, end)
-        elif granularity == "hour":
-            aggregates = self._query_range(self.haw.hours, start, end)
-        elif granularity == "day":
-            aggregates = self._query_range(self.haw.days, start, end)
-        elif granularity == "week":
-            aggregates = self._query_range(self.haw.weeks, start, end)
-        elif granularity == "year":
-            aggregates = self._query_range(self.haw.years, start, end)
-        return aggregates
-
-    def _query_range(self, wheel: Dict[datetime, PartialAggregate], start: datetime, end: datetime):
-        results = []
-        for timestamp, aggregate in wheel.items():
-            if start <= timestamp <= end:
-                results.append((timestamp, self.framework.lower(aggregate)))
-        return results
-
-
-data_file1 = zip(
-    *opteryx.query(
-        "select Launched_at, Price from $missions WHERE Launched_at is not null"
-    ).collect([0, 1])
-)
-
-
-# Example usage
-framework = AggregationFramework()
-index_service = µWheelIndex(framework)
-
-
-haw1 = index_service.build_index_for_file(data_file1)
-
-# combined_haw = index_service.combine_indices(haw1, haw2)
-
-# Querying the combined HAW
-start_time = datetime(1960, 5, 15, 12, 0, 0)
-end_time = datetime(2024, 5, 15, 12, 59, 59)
-results = index_service.query("hour", start_time, end_time)
-
-for timestamp, aggregate in results:
-    print(
-        f"Timestamp: {timestamp}, Count: {aggregate.count}, Sum: {aggregate.sum}, Avg: {aggregate.avg}, Min: {aggregate.min}, Max: {aggregate.max}, missing: {aggregate.missing}"
-    )