-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #4 from mabel-dev/initial
Initial
- Loading branch information
Showing
28 changed files
with
1,329 additions
and
34 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
from .binary_search import StringBinaryIndex | ||
from .hadro import create_sstable |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,78 @@ | ||
# cython: language_level=3 | ||
|
||
from typing import List, Tuple, Dict | ||
from libc.stdint cimport int32_t, int64_t | ||
|
||
cdef class StringBinaryIndex:
    """
    In-memory index from string keys to lists of (filename, offset) locations.

    Each key is tracked in two places: ``key_to_pointer`` (a dict used for
    exact-match lookups) and ``key_store`` (a list that ``finalize_index``
    sorts by key bytes so ``lookup_range`` can binary-search it).

    ``finalize_index`` must be called after the last ``add_entry`` and before
    ``lookup_range``; ``lookup_eq`` and ``lookup_in_list`` only use the dict
    and work at any time.
    """
    # Entries are (key byte length, UTF-8 key bytes, index into value_store).
    cdef list key_store
    # value_store[pointer] is a list of
    # (filename byte length, UTF-8 filename bytes, offset) tuples.
    cdef list value_store
    # Maps each key (str) to the index of its value list in value_store.
    cdef dict key_to_pointer

    def __init__(self):
        self.key_store = []
        self.value_store = []
        self.key_to_pointer = {}

    def add_entry(self, key: str, filename: str, offset: int):
        """
        Record one (filename, offset) location for ``key``.

        The first time a key is seen it is registered in ``key_to_pointer`` and
        ``key_store`` and given an empty value list; every call then appends
        the location to that key's value list.
        """
        pointer = self.key_to_pointer.get(key)
        if pointer is None:
            # Encode once and reuse the bytes for both the length and the payload.
            key_bytes = key.encode('utf-8')
            pointer = len(self.value_store)
            self.key_to_pointer[key] = pointer
            self.key_store.append((len(key_bytes), key_bytes, pointer))
            self.value_store.append([])  # Initialize empty list for new key

        # Add value to the corresponding value list
        filename_bytes = filename.encode('utf-8')
        self.value_store[pointer].append((len(filename_bytes), filename_bytes, offset))

    def finalize_index(self):
        """
        Sort ``key_store`` so that ``_binary_search`` can be used on it.

        The sort key is the UTF-8 key bytes alone. Sorting the raw tuples would
        order by byte length first, which disagrees with the bytes-only
        comparison in ``_binary_search`` and makes range lookups miss keys.
        """
        self.key_store.sort(key=lambda entry: entry[1])

    def lookup_eq(self, key: str) -> List[Tuple[str, int]]:
        """
        Return the (filename, offset) pairs recorded for ``key``.

        Uses the ``key_to_pointer`` dict directly (no binary search); an
        unknown key yields an empty list.
        """
        pointer = self.key_to_pointer.get(key)
        if pointer is None:
            return []

        value_data = self.value_store[pointer]
        return [(filename.decode('utf-8'), offset) for _, filename, offset in value_data]

    def lookup_in_list(self, keys: List[str]) -> Dict[str, List[Tuple[str, int]]]:
        """Return ``{key: lookup_eq(key)}`` for every key in ``keys``."""
        return {key: self.lookup_eq(key) for key in keys}

    def lookup_range(self, start_key: str, end_key: str) -> Dict[str, List[Tuple[str, int]]]:
        """
        Return the locations of every indexed key between ``start_key`` and
        ``end_key``, both ends inclusive, compared as UTF-8 bytes.

        Requires ``finalize_index`` to have been called since the last
        ``add_entry``. An empty or inverted range yields an empty dict.
        """
        result = {}
        start_index = self._binary_search(start_key, find_start=True)
        end_index = self._binary_search(end_key, find_start=False)

        # When no key falls in the range, end_index < start_index and the loop
        # body never runs.
        for index in range(start_index, end_index + 1):
            key = self.key_store[index][1].decode('utf-8')
            result[key] = self.lookup_eq(key)

        return result

    def _binary_search(self, key: str, find_start: bool) -> int:
        """
        Binary-search the sorted ``key_store`` for ``key`` (compared as bytes).

        If the key is present, return the index of its first occurrence when
        ``find_start`` is true, or of its last occurrence otherwise. If the key
        is absent, return the index of the first larger key when ``find_start``
        is true (possibly ``len(key_store)``), or the index of the last smaller
        key otherwise (possibly ``-1``).
        """
        key_bytes = key.encode('utf-8')
        last_index = len(self.key_store) - 1
        low, high = 0, last_index
        while low <= high:
            mid = (low + high) // 2
            mid_key_bytes = self.key_store[mid][1]
            if mid_key_bytes < key_bytes:
                low = mid + 1
            elif mid_key_bytes > key_bytes:
                high = mid - 1
            elif find_start:
                # Keep moving left while the previous entry is the same key.
                if mid == 0 or self.key_store[mid - 1][1] != key_bytes:
                    return mid
                high = mid - 1
            else:
                # Keep moving right while the next entry is the same key.
                if mid == last_index or self.key_store[mid + 1][1] != key_bytes:
                    return mid
                low = mid + 1
        return low if find_start else high
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,56 @@ | ||
# cython: language_level=3 | ||
# cython: boundscheck=False | ||
# cython: wraparound=False | ||
""" | ||
This is not a general-purpose Bloom filter; if used outside of Draken, it may not | ||
perform entirely as expected. | ||
""" | ||
|
||
from libc.stdlib cimport malloc, free | ||
from libc.string cimport memset, memcpy | ||
|
||
# Define constants for the fixed size: BIT_ARRAY_SIZE is a count of bits, BYTE_ARRAY_SIZE the matching count of bytes | ||
BIT_ARRAY_SIZE = 64 * 1024 # 64 Kbits = 8 KB | ||
BYTE_ARRAY_SIZE = BIT_ARRAY_SIZE // 8 | ||
|
||
cdef class BloomFilter:
    """
    Fixed-size Bloom filter over integer items, backed by a C byte buffer.

    Every item maps to two bit positions in the range [0, BIT_ARRAY_SIZE):
    the item itself modulo BIT_ARRAY_SIZE, and the item scaled by the golden
    ratio, truncated to an integer, modulo BIT_ARRAY_SIZE.
    """
    # Buffer of BYTE_ARRAY_SIZE bytes; allocated in __cinit__, freed in __dealloc__.
    cdef unsigned char* bit_array

    def __cinit__(self):
        # Obtain the backing buffer, then clear it so every bit starts unset.
        self.bit_array = <unsigned char*>malloc(BYTE_ARRAY_SIZE)
        if self.bit_array == NULL:
            raise MemoryError("Failed to allocate memory for the bit array.")
        memset(self.bit_array, 0, BYTE_ARRAY_SIZE)

    def __dealloc__(self):
        # The pointer is only non-NULL if the allocation in __cinit__ succeeded.
        if self.bit_array != NULL:
            free(self.bit_array)

    cpdef void add(self, long item):
        """Set the two bits that correspond to ``item``."""
        first_bit = item % BIT_ARRAY_SIZE
        # NOTE(review): the double -> long cast presumably misbehaves once
        # item * 1.618... no longer fits in a C long - confirm the expected
        # input range with callers.
        second_bit = <long>(item * 1.618033988749895) % BIT_ARRAY_SIZE
        # Split each bit position into (byte index, bit within that byte).
        first_byte, first_offset = divmod(first_bit, 8)
        second_byte, second_offset = divmod(second_bit, 8)
        self.bit_array[first_byte] |= 1 << first_offset
        self.bit_array[second_byte] |= 1 << second_offset

    cpdef int possibly_contains(self, long item):
        """
        Report whether ``item`` may have been added.

        Returns 0 when either of the item's two bits is unset (the item was
        definitely not added) and a non-zero value when both are set.
        """
        first_bit = item % BIT_ARRAY_SIZE
        # Same golden-ratio derivation as in ``add`` so both methods agree.
        second_bit = <long>(item * 1.618033988749895) % BIT_ARRAY_SIZE
        first_byte, first_offset = divmod(first_bit, 8)
        if not (self.bit_array[first_byte] & (1 << first_offset)):
            return 0
        second_byte, second_offset = divmod(second_bit, 8)
        return self.bit_array[second_byte] & (1 << second_offset)

    cpdef memoryview serialize(self):
        """Return a memoryview over the first BYTE_ARRAY_SIZE bytes of the bit array."""
        snapshot = self.bit_array[:BYTE_ARRAY_SIZE]
        return memoryview(snapshot)
|
||
cpdef BloomFilter deserialize(const unsigned char* data):
    """
    Build a new BloomFilter whose bit array is copied from ``data``.

    Exactly BYTE_ARRAY_SIZE bytes are read from ``data``. No length is passed
    in, so nothing here can check it: the caller must supply a buffer at
    least that long.
    """
    cdef BloomFilter restored = BloomFilter()
    memcpy(restored.bit_array, data, BYTE_ARRAY_SIZE)
    return restored
Oops, something went wrong.