From 414e0459a5941afc3050bba2e2c41273f99b2ac8 Mon Sep 17 00:00:00 2001
From: Praneeth Bedapudi
Date: Mon, 11 Dec 2023 10:49:27 +0530
Subject: [PATCH] cleanup defined index

Signed-off-by: Praneeth Bedapudi
---
 README.md             |   9 +++-
 liteindex/kv_index.py | 118 ++++++++++++++++++++++++++++++------------
 setup.py              |   2 +-
 3 files changed, 92 insertions(+), 37 deletions(-)

diff --git a/README.md b/README.md
index 8d98191..9c30342 100644
--- a/README.md
+++ b/README.md
@@ -14,8 +14,13 @@ pip install --upgrade liteindex
 - works across threads, processes seamlessly
 - compression is supported natively and optional custom compression dictionaries can be built
 
+### KVStore
+- [Documentation]() | [Detailed example]() | [Benchmarks]()
+- screaming fast reads, full multi-thread and multi-process support, based on lmdb
+- can store any python object, no querying supported
+- Eviction policies supported: `LRU`, `LFU`, `any` and age-based invalidation
 
 ### function_cache
 - [Documentation](https://github.com/notAI-tech/LiteIndex/blob/main/function_cache.md) | [Detailed example](https://github.com/notAI-tech/LiteIndex/blob/main/examples/function_cache_example.py) | [Benchmarks](https://github.com/notAI-tech/LiteIndex/tree/main/benchmarks/function_cache)
-- based on DefinedIndex, easy to use decorator for caching heavy function calls with arguments
-- works across threads, processes seamlessly
+- ultra fast, works across threads and processes seamlessly
+- Eviction policies supported: `LRU`, `LFU`, `any` and age-based invalidation
diff --git a/liteindex/kv_index.py b/liteindex/kv_index.py
index 696a5d7..1339fb0 100644
--- a/liteindex/kv_index.py
+++ b/liteindex/kv_index.py
@@ -5,28 +5,39 @@ import hashlib
 import tempfile
 
+
 class KVIndex:
     EvictNone = "none"
     EvictAny = "any"
 
-    def __init__(self, dir=tempfile.mkdtemp(), fast_mode=False, store_key=True, eviction_policy=EvictNone, max_size_mb=0, max_no_of_elements=0):
+    def __init__(
+        self,
+        dir=tempfile.mkdtemp(),
+        fast_mode=False,
+        store_key=True,
+        eviction_policy=EvictNone,
+        max_size_mb=0,
+        max_no_of_elements=0,
+    ):
         self.fast_mode = fast_mode
         self.store_key = store_key
         self.eviction_policy = eviction_policy
-
+
         if self.eviction_policy == KVIndex.EvictNone:
             self.max_no_of_elements = 0
             self.max_size_mb = 0
         elif self.eviction_policy == KVIndex.EvictAny:
             if max_size_mb == 0 and max_no_of_elements == 0:
-                raise ValueError("At least one of max_size_mb or max_no_of_elements must be set when eviction_policy is EvictAny")
+                raise ValueError(
+                    "At least one of max_size_mb or max_no_of_elements must be set when eviction_policy is EvictAny"
+                )
         else:
             raise ValueError(f"Unknown eviction_policy: {eviction_policy}")
 
         self.max_size_mb = max_size_mb
         self.max_no_of_elements = max_no_of_elements
-
+
         self.__env = lmdb.open(
             path=dir,
             subdir=True,
@@ -37,20 +48,24 @@ def __init__(self, dir=tempfile.mkdtemp(), fast_mode=False, store_key=True, evic
             writemap=fast_mode,
             max_readers=2048,
             meminit=False,
-            max_dbs=4
+            max_dbs=4,
         )
 
-        self.__key_hash_to_value_db = self.__env.open_db(b"key_hash_to_value", create=True)
+        self.__key_hash_to_value_db = self.__env.open_db(
+            b"key_hash_to_value", create=True
+        )
         if self.store_key:
-            self.__key_hash_to_key_db = self.__env.open_db(b"key_hash_to_key", create=True)
-
-    def __evict(self, txn, min_to_delete=1):
+            self.__key_hash_to_key_db = self.__env.open_db(
+                b"key_hash_to_key", create=True
+            )
+
+    def __evict(self, txn, min_to_delete=1):
         if self.max_size_mb:
             stat = txn.stat(db=self.__key_hash_to_value_db)
-
-            current_size_mb = stat['psize'] * stat['leaf_pages'] / 1024**2
-            current_record_count = stat['entries']
+
+            current_size_mb = stat["psize"] * stat["leaf_pages"] / 1024**2
+            current_record_count = stat["entries"]
 
             if current_size_mb <= self.max_size_mb * 0.8:
                 return
@@ -65,16 +80,16 @@ def __evict(self, txn, min_to_delete=1):
                     cursor.delete()
                 else:
                     break
-
+
                 stat = txn.stat(db=self.__key_hash_to_value_db)
-                current_size_mb = stat['psize'] * stat['leaf_pages'] / 1024**2
-                current_record_count = stat['entries']
+                current_size_mb = stat["psize"] * stat["leaf_pages"] / 1024**2
+                current_record_count = stat["entries"]
                 if current_size_mb <= self.max_size_mb * 0.8:
                     cursor.close()
                     return
         elif self.max_no_of_elements:
             stat = txn.stat(db=self.__key_hash_to_value_db)
-            current_record_count = stat['entries']
+            current_record_count = stat["entries"]
 
             if current_record_count <= self.max_no_of_elements:
                 return
@@ -89,25 +104,37 @@ def __evict(self, txn, min_to_delete=1):
                     cursor.delete()
                 else:
                     break
-
 
     def __setitem__(self, key, value):
         with self.__env.begin(write=True) as txn:
             self.__evict(txn)
 
-            key = pickle.dumps(key, protocol=pickle.HIGHEST_PROTOCOL) if isinstance(key, str) else key.encode()
-            pickled_value = pickle.dumps(value, protocol=pickle.HIGHEST_PROTOCOL) if isinstance(value, str) else value.encode()
+            key = (
+                pickle.dumps(key, protocol=pickle.HIGHEST_PROTOCOL)
+                if isinstance(key, str)
+                else key.encode()
+            )
+            pickled_value = (
+                pickle.dumps(value, protocol=pickle.HIGHEST_PROTOCOL)
+                if isinstance(value, str)
+                else value.encode()
+            )
 
             key_hash = hashlib.sha256(key).digest()
 
-            txn.put(key_hash, pickled_value, db=self.__key_hash_to_value_db, overwrite=True)
+            txn.put(
+                key_hash, pickled_value, db=self.__key_hash_to_value_db, overwrite=True
+            )
             if self.store_key:
                 txn.put(key_hash, key, db=self.__key_hash_to_key_db, overwrite=False)
-
-
+
     def __getitem__(self, key):
         with self.__env.begin(write=False, buffers=True) as txn:
-            key = pickle.dumps(key, protocol=pickle.HIGHEST_PROTOCOL) if isinstance(key, str) else key.encode()
+            key = (
+                pickle.dumps(key, protocol=pickle.HIGHEST_PROTOCOL)
+                if isinstance(key, str)
+                else key.encode()
+            )
 
             key_hash = hashlib.sha256(key).digest()
 
             result = txn.get(key_hash, db=self.__key_hash_to_value_db)
@@ -116,32 +143,55 @@ def __getitem__(self, key):
                 raise KeyError(key)
 
             return pickle.loads(result) if result[0] == 128 else result.decode()
-
+
     def update(self, items):
         with self.__env.begin(write=True) as txn:
             self.__evict(txn, min_to_delete=len(items))
 
             for key, value in items:
-                key = pickle.dumps(key, protocol=pickle.HIGHEST_PROTOCOL) if isinstance(key, str) else key.encode()
-                pickled_value = pickle.dumps(value, protocol=pickle.HIGHEST_PROTOCOL) if isinstance(value, str) else value.encode()
+                key = (
+                    pickle.dumps(key, protocol=pickle.HIGHEST_PROTOCOL)
+                    if isinstance(key, str)
+                    else key.encode()
+                )
+                pickled_value = (
+                    pickle.dumps(value, protocol=pickle.HIGHEST_PROTOCOL)
+                    if isinstance(value, str)
+                    else value.encode()
+                )
 
                 key_hash = hashlib.sha256(key).digest()
 
-                txn.put(key_hash, pickled_value, db=self.__key_hash_to_value_db, overwrite=True)
+                txn.put(
+                    key_hash,
+                    pickled_value,
+                    db=self.__key_hash_to_value_db,
+                    overwrite=True,
+                )
                 if self.store_key:
-                    txn.put(key_hash, key, db=self.__key_hash_to_key_db, overwrite=False)
-
+                    txn.put(
+                        key_hash, key, db=self.__key_hash_to_key_db, overwrite=False
+                    )
+
     def getmulti(self, keys):
         with self.__env.begin(write=False, buffers=True) as txn:
            results = []
            for key in keys:
-                key = pickle.dumps(key, protocol=pickle.HIGHEST_PROTOCOL) if isinstance(key, str) else key.encode()
+                key = (
+                    pickle.dumps(key, protocol=pickle.HIGHEST_PROTOCOL)
+                    if isinstance(key, str)
+                    else key.encode()
+                )
 
                 key_hash = hashlib.sha256(key).digest()
 
                 result = txn.get(key_hash, db=self.__key_hash_to_value_db)
 
-                results.append(pickle.loads(result) if result[0] == 128 else result.decode() if result is not None else None)
-
-            return results
-
+                results.append(
+                    pickle.loads(result)
+                    if result[0] == 128
+                    else result.decode()
+                    if result is not None
+                    else None
+                )
+            return results
diff --git a/setup.py b/setup.py
index 69f03b7..856c84f 100644
--- a/setup.py
+++ b/setup.py
@@ -18,7 +18,7 @@
 EMAIL = "praneeth@bpraneeth.com"
 AUTHOR = "BEDAPUDI PRANEETH"
 REQUIRES_PYTHON = ">=3.6.0"
-VERSION = "0.0.2.dev22"
+VERSION = "0.0.2.dev23"
 
 # What packages are required for this module to be executed?
 REQUIRED = [
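
Below is a minimal usage sketch for `KVIndex` as it stands after this patch; it is an illustration, not code from the repository. The constructor arguments and the `update`/`getmulti` methods come from `liteindex/kv_index.py` above, the import path is inferred from the file location, and the sample keys and values are invented. Only `EvictNone` and `EvictAny` exist in this version of the class, string keys and values take the pickle path while other types fall through to `.encode()`, and `getmulti` indexes `result[0]` before its `None` check, so the sketch sticks to string data and to keys that are known to be present.

```python
# Hypothetical usage of the KVIndex defined in this patch.
import tempfile

from liteindex.kv_index import KVIndex

index = KVIndex(
    dir=tempfile.mkdtemp(),            # directory for the LMDB environment
    eviction_policy=KVIndex.EvictAny,  # evict arbitrary entries once a cap is hit
    max_no_of_elements=10_000,         # EvictAny requires a size or element cap
)

# Single writes/reads: keys are sha256-hashed, string values are pickled.
index["model_name"] = "bert-base"
assert index["model_name"] == "bert-base"

# Batched writes take (key, value) pairs; batched reads return one value per key.
index.update([("a", "1"), ("b", "2")])
print(index.getmulti(["a", "b"]))      # ['1', '2']
```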
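The size-based branch of `__evict` estimates how much space the value database occupies from LMDB's `txn.stat()` counters: page size multiplied by the number of leaf pages, converted to MB and compared against 80% of the configured `max_size_mb`. A worked example of that arithmetic with invented numbers:

```python
# Invented figures to illustrate the estimate used in __evict(); a real run
# would read these from txn.stat() on the key_hash_to_value database.
psize = 4096          # stat["psize"]: bytes per LMDB page
leaf_pages = 25_600   # stat["leaf_pages"]: pages holding stored records

current_size_mb = psize * leaf_pages / 1024**2   # 104_857_600 bytes -> 100.0 MB
max_size_mb = 110                                # the configured cap

# __evict() returns early when the estimate is at or below 80% of the cap;
# otherwise it keeps deleting entries through its cursor and re-checks.
print(current_size_mb)                        # 100.0
print(current_size_mb <= max_size_mb * 0.8)   # 100.0 <= 88.0 -> False, keep evicting
```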