Skip to content

Commit

Permalink
Merge pull request #4 from mabel-dev/initial
Browse files Browse the repository at this point in the history
Initial
  • Loading branch information
joocer authored May 20, 2024
2 parents ab55948 + 4ea3ee0 commit cfd78f5
Show file tree
Hide file tree
Showing 28 changed files with 1,329 additions and 34 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -158,3 +158,5 @@ cython_debug/
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
*.c
*.cpp
2 changes: 1 addition & 1 deletion LICENSE
Original file line number Diff line number Diff line change
Expand Up @@ -186,7 +186,7 @@
same "printed page" as the copyright notice for easier
identification within third-party archives.

Copyright [yyyy] [name of copyright owner]
Copyright 2024 Justin Joyce

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
Expand Down
2 changes: 1 addition & 1 deletion draken/__version__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
__build__ = 3
__build__ = 4

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand Down
2 changes: 2 additions & 0 deletions draken/compiled/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
from .binary_search import StringBinaryIndex
from .hadro import create_sstable
78 changes: 78 additions & 0 deletions draken/compiled/binary_search.pyx
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
# cython: language_level=3

from typing import List, Tuple, Dict
from libc.stdint cimport int32_t, int64_t

cdef class StringBinaryIndex:
    """
    In-memory index from string keys to lists of (filename, offset) entries.

    Exact-match lookups use a dict. Range lookups binary-search ``key_store``,
    which must first be ordered by calling ``finalize_index`` after the last
    ``add_entry``.
    """
    # Entries of (encoded key length, UTF-8 key bytes, pointer into value_store).
    cdef list key_store
    # value_store[pointer] holds (encoded filename length, filename bytes, offset) tuples.
    cdef list value_store
    # Maps each key string to its position in value_store.
    cdef dict key_to_pointer

    def __init__(self):
        self.key_store = []
        self.value_store = []
        self.key_to_pointer = {}

    def add_entry(self, key: str, filename: str, offset: int):
        """
        Record one (filename, offset) location for ``key``.

        The first time a key is seen it is given a new slot in ``value_store``
        and an entry in ``key_store``; later calls append to the same slot.
        """
        pointer = self.key_to_pointer.get(key)
        if pointer is None:
            pointer = len(self.value_store)
            self.key_to_pointer[key] = pointer
            key_bytes = key.encode('utf-8')
            self.key_store.append((len(key_bytes), key_bytes, pointer))
            self.value_store.append([])  # Initialize empty list for new key

        filename_bytes = filename.encode('utf-8')
        self.value_store[pointer].append((len(filename_bytes), filename_bytes, offset))

    def finalize_index(self):
        """
        Order ``key_store`` by key bytes so ``lookup_range`` can binary-search it.

        The sort key must be the key bytes alone: a plain tuple sort would order
        by the leading length field first, which disagrees with the byte-wise
        comparisons made in ``_binary_search``.
        """
        self.key_store.sort(key=lambda entry: entry[1])

    def lookup_eq(self, key: str) -> List[Tuple[str, int]]:
        """
        Return the (filename, offset) entries stored for exactly ``key``.

        Returns an empty list when the key has never been added.
        """
        # Exact matches are resolved through the dict, not the sorted key list.
        pointer = self.key_to_pointer.get(key)
        if pointer is None:
            return []

        value_data = self.value_store[pointer]
        return [(filename.decode('utf-8'), offset) for _, filename, offset in value_data]

    def lookup_in_list(self, keys: List[str]) -> Dict[str, List[Tuple[str, int]]]:
        """Return ``{key: lookup_eq(key)}`` for every key in ``keys``."""
        return {key: self.lookup_eq(key) for key in keys}

    def lookup_range(self, start_key: str, end_key: str) -> Dict[str, List[Tuple[str, int]]]:
        """
        Return entries for every stored key between ``start_key`` and ``end_key``.

        Both bounds are inclusive and keys are compared as UTF-8 bytes. The
        result is empty when no stored key falls inside the range.
        ``finalize_index`` must have been called after the last ``add_entry``.
        """
        result = {}
        start_index = self._binary_search(start_key, find_start=True)
        end_index = self._binary_search(end_key, find_start=False)

        # start_index > end_index yields an empty range, i.e. no matching keys.
        for index in range(start_index, end_index + 1):
            key_bytes = self.key_store[index][1]
            key = key_bytes.decode('utf-8')
            result[key] = self.lookup_eq(key)

        return result

    def _binary_search(self, key: str, find_start: bool) -> int:
        """
        Locate ``key`` in the sorted ``key_store``.

        When the key is present its index is returned. Otherwise the result is
        the index of the first larger key if ``find_start`` is true, or the
        index of the last smaller key if it is false (this may be -1).
        """
        key_bytes = key.encode('utf-8')
        last = len(self.key_store) - 1
        low, high = 0, last
        while low <= high:
            mid = (low + high) // 2
            mid_key_bytes = self.key_store[mid][1]
            if mid_key_bytes < key_bytes:
                low = mid + 1
            elif mid_key_bytes > key_bytes:
                high = mid - 1
            elif find_start:
                # Step left while the previous entry still equals the key.
                if mid == 0 or self.key_store[mid - 1][1] != key_bytes:
                    return mid
                high = mid - 1
            else:
                # Step right while the next entry still equals the key.
                if mid == last or self.key_store[mid + 1][1] != key_bytes:
                    return mid
                low = mid + 1
        return low if find_start else high
56 changes: 56 additions & 0 deletions draken/compiled/bloom_filter.pyx
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
# cython: language_level=3
# cython: boundscheck=False
# cython: wraparound=False
"""
This is not a general purpose Bloom Filter; if used outside of Draken, it may not
perform entirely as expected.
"""

from libc.stdlib cimport malloc, free
from libc.string cimport memset, memcpy

# Define constants for the fixed size
BIT_ARRAY_SIZE = 64 * 1024 # number of bits: 64 Kbit (65,536 bits), stored in 8 KB
BYTE_ARRAY_SIZE = BIT_ARRAY_SIZE // 8

cdef class BloomFilter:
    """
    Fixed-size Bloom filter over integer items, with two bit positions per item.

    The bits live in a malloc'd buffer of BYTE_ARRAY_SIZE bytes that is
    allocated in ``__cinit__`` and released in ``__dealloc__``.
    """
    cdef unsigned char* bit_array

    def __cinit__(self):
        # Obtain the backing buffer, then clear it so every bit starts unset.
        self.bit_array = <unsigned char*>malloc(BYTE_ARRAY_SIZE)
        if self.bit_array == NULL:
            raise MemoryError("Failed to allocate memory for the bit array.")
        memset(self.bit_array, 0, BYTE_ARRAY_SIZE)

    def __dealloc__(self):
        # Only free a buffer that was actually allocated.
        if self.bit_array != NULL:
            free(self.bit_array)

    cpdef void add(self, long item):
        """Record ``item`` by setting both of its bit positions."""
        # First position: the item itself, wrapped to the size of the bit array.
        first_bit = item % BIT_ARRAY_SIZE
        # Second position: the item scaled by the golden ratio, truncated and wrapped.
        second_bit = <long>(item * 1.618033988749895) % BIT_ARRAY_SIZE
        # Byte index is position // 8; the bit within that byte is position % 8.
        self.bit_array[first_bit // 8] |= 1 << (first_bit % 8)
        self.bit_array[second_bit // 8] |= 1 << (second_bit % 8)

    cpdef int possibly_contains(self, long item):
        """
        Report whether ``item`` may have been added.

        Returns 0 when either of the item's bits is unset, otherwise a
        non-zero value.
        """
        first_bit = item % BIT_ARRAY_SIZE
        second_bit = <long>(item * 1.618033988749895) % BIT_ARRAY_SIZE
        # A clear first bit settles the answer without reading the second.
        if not (self.bit_array[first_bit // 8] & (1 << (first_bit % 8))):
            return 0
        return self.bit_array[second_bit // 8] & (1 << (second_bit % 8))

    cpdef memoryview serialize(self):
        """Return a memoryview over a bytes copy of the bit array."""
        # Slicing the pointer produces a bytes object BYTE_ARRAY_SIZE long.
        snapshot = self.bit_array[:BYTE_ARRAY_SIZE]
        return memoryview(snapshot)

cpdef BloomFilter deserialize(const unsigned char* data):
    """
    Build a BloomFilter whose bit array is copied from ``data``.

    Exactly BYTE_ARRAY_SIZE bytes are read from ``data``. No length check is
    made here, so the caller must supply a buffer at least that large.
    """
    cdef BloomFilter restored = BloomFilter()
    memcpy(restored.bit_array, data, BYTE_ARRAY_SIZE)
    return restored
Loading

0 comments on commit cfd78f5

Please sign in to comment.