Merge pull request #4 from mabel-dev/initial

Initial
mabel-dev · May 20, 2024 · cfd78f5 · cfd78f5
2 parents ab55948 + 4ea3ee0
commit cfd78f5
Show file tree

Hide file tree

Showing 28 changed files with 1,329 additions and 34 deletions.
diff --git a/.gitignore b/.gitignore
@@ -158,3 +158,5 @@ cython_debug/
 #  and can be added to the global gitignore or merged into this file.  For a more nuclear
 #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
 #.idea/
+*.c
+*.cpp
diff --git a/LICENSE b/LICENSE
@@ -186,7 +186,7 @@
       same "printed page" as the copyright notice for easier
       identification within third-party archives.
 
-   Copyright [yyyy] [name of copyright owner]
+   Copyright 2024 Justin Joyce
 
    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.

diff --git a/draken/__version__.py b/draken/__version__.py
@@ -1,4 +1,4 @@
-__build__ = 3
+__build__ = 4
 
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

diff --git a/draken/compiled/__init__.py b/draken/compiled/__init__.py
@@ -0,0 +1,2 @@
+from .binary_search import StringBinaryIndex
+from .hadro import create_sstable
diff --git a/draken/compiled/binary_search.pyx b/draken/compiled/binary_search.pyx
@@ -0,0 +1,78 @@
+# cython: language_level=3
+
+from typing import List, Tuple, Dict
+from libc.stdint cimport int32_t, int64_t
+
+cdef class StringBinaryIndex:
+    cdef list key_store
+    cdef list value_store
+    cdef dict key_to_pointer
+
+    def __init__(self):
+        self.key_store = []
+        self.value_store = []
+        self.key_to_pointer = {}
+
+    def add_entry(self, key: str, filename: str, offset: int):
+        # Add individual (filename, offset) tuples for the given key
+        if key not in self.key_to_pointer:
+            self.key_to_pointer[key] = len(self.value_store)
+            self.key_store.append((len(key.encode('utf-8')), key.encode('utf-8'), self.key_to_pointer[key]))
+            self.value_store.append([])  # Initialize empty list for new key
+
+        # Add value to the corresponding value list
+        pointer = self.key_to_pointer[key]
+        self.value_store[pointer].append((len(filename.encode('utf-8')), filename.encode('utf-8'), offset))
+
+    def finalize_index(self):
+        # Sort key_store by keys for binary search
+        self.key_store.sort()
+
+    def lookup_eq(self, key: str) -> List[Tuple[str, int]]:
+        # Look up the key's pointer in the dictionary (binary search is only used for range lookups)
+        pointer = self.key_to_pointer.get(key)
+        if pointer is None:
+            return []
+
+        value_data = self.value_store[pointer]
+        return [(filename.decode('utf-8'), offset) for _, filename, offset in value_data]
+
+    def lookup_in_list(self, keys: List[str]) -> Dict[str, List[Tuple[str, int]]]:
+        result = {}
+        for key in keys:
+            result[key] = self.lookup_eq(key)
+        return result
+
+    def lookup_range(self, start_key: str, end_key: str) -> Dict[str, List[Tuple[str, int]]]:
+        result = {}
+        start_index = self._binary_search(start_key, find_start=True)
+        end_index = self._binary_search(end_key, find_start=False)
+
+        for index in range(start_index, end_index + 1):
+            key_len, key_bytes, pointer = self.key_store[index]
+            key = key_bytes.decode('utf-8')
+            result[key] = self.lookup_eq(key)
+
+        return result
+
+    def _binary_search(self, key: str, find_start: bool) -> int:
+        # Implement binary search on the sorted key_store
+        key_bytes = key.encode('utf-8')
+        low, high = 0, len(self.key_store) - 1
+        while low <= high:
+            mid = (low + high) // 2
+            mid_key_bytes = self.key_store[mid][1]
+            if mid_key_bytes < key_bytes:
+                low = mid + 1
+            elif mid_key_bytes > key_bytes:
+                high = mid - 1
+            else:
+                if find_start:
+                    if mid == 0 or self.key_store[mid - 1][1] != key_bytes:
+                        return mid
+                    high = mid - 1
+                else:
+                    if mid == len(self.key_store) - 1 or self.key_store[mid + 1][1] != key_bytes:
+                        return mid
+                    low = mid + 1
+        return low if find_start else high
diff --git a/draken/compiled/bloom_filter.pyx b/draken/compiled/bloom_filter.pyx
@@ -0,0 +1,56 @@
+# cython: language_level=3
+# cython: boundscheck=False
+# cython: wraparound=False
+"""
+This is not a general-purpose Bloom filter; if used outside of Draken, it may not
+perform entirely as expected.
+"""
+
+from libc.stdlib cimport malloc, free
+from libc.string cimport memset, memcpy
+
+# Define constants for the fixed size
+BIT_ARRAY_SIZE = 64 * 1024  # 64 Kbits, stored in an 8 KB byte array
+BYTE_ARRAY_SIZE = BIT_ARRAY_SIZE // 8
+
+cdef class BloomFilter:
+    cdef unsigned char* bit_array
+
+    def __cinit__(self):
+        # Allocate memory for the bit array and initialize to 0
+        self.bit_array = <unsigned char*>malloc(BYTE_ARRAY_SIZE)
+        if not self.bit_array:
+            raise MemoryError("Failed to allocate memory for the bit array.")
+        memset(self.bit_array, 0, BYTE_ARRAY_SIZE)
+
+    def __dealloc__(self):
+        if self.bit_array:
+            free(self.bit_array)
+
+    cpdef void add(self, long item):
+        """Add an item to the Bloom filter"""
+        h1 = item % BIT_ARRAY_SIZE
+        # Apply the golden ratio to the item and use modulo to wrap within the size of the bit array
+        h2 = <long>(item * 1.618033988749895) % BIT_ARRAY_SIZE
+        # Set bits using bitwise OR
+        self.bit_array[h1 // 8] |= 1 << (h1 % 8)
+        self.bit_array[h2 // 8] |= 1 << (h2 % 8)
+
+    cpdef int possibly_contains(self, long item):
+        """Check if the item might be in the set"""
+        h1 = item % BIT_ARRAY_SIZE
+        # Apply the golden ratio to the item and use modulo to wrap within the size of the bit array
+        h2 = <long>(item * 1.618033988749895) % BIT_ARRAY_SIZE
+        # Check bits using bitwise AND
+        return (self.bit_array[h1 // 8] & (1 << (h1 % 8))) and \
+               (self.bit_array[h2 // 8] & (1 << (h2 % 8)))
+
+    cpdef memoryview serialize(self):
+        """Serialize the Bloom filter to a memory view"""
+        return memoryview(self.bit_array[:BYTE_ARRAY_SIZE])
+
+cpdef BloomFilter deserialize(const unsigned char* data):
+    """Deserialize a memory view to a Bloom filter"""
+    bf = BloomFilter()
+    memcpy(bf.bit_array, data, BYTE_ARRAY_SIZE)
+    return bf
-Original file line number
+Diff line change
@@ Expand Up / @@ -158,3 +158,5 @@ cython_debug/ @@
     #  and can be added to the global gitignore or merged into this file.  For a more nuclear
     #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
     #.idea/
+    *.c
+    *.cpp
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		from .binary_search import StringBinaryIndex
		from .hadro import create_sstable