Skip to content

Commit

Permalink
feat(cache): replace HDF cache with Pickle (#904)
Browse files Browse the repository at this point in the history
* chore(deps): bump deps to versions compatible with Python 3.12

Signed-off-by: Luka Peschke <luka.peschke@toucantoco.com>

* feat(cache): replace HDF cache with Pickle

Signed-off-by: Luka Peschke <luka.peschke@toucantoco.com>

---------

Signed-off-by: Luka Peschke <luka.peschke@toucantoco.com>
  • Loading branch information
lukapeschke authored Nov 21, 2024
1 parent c924de1 commit 9f92582
Show file tree
Hide file tree
Showing 7 changed files with 396 additions and 616 deletions.
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,10 @@
# Changelog

### Changed

- Drop support for the HDF cache and use a Pickle cache instead. `CacheEnum.HDF` is now a deprecated alias for `CacheEnum.PICKLE`
  and will be removed in v0.15.0.

## [0.13.0] - 2024-07-17

### Changed
Expand Down
22 changes: 0 additions & 22 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -88,25 +88,3 @@ If you just want to download a file, without converting it to a pandas dataframe
...
Image size: 60284 bytes
```

## Installation on macOS M1 chipset

### install everything
```console
brew install hdf5 snappy
HDF5_DIR="/opt/homebrew/Cellar/hdf5/1.12.1/" CPPFLAGS="-I/opt/homebrew/Cellar/snappy/1.1.9/include -L/opt/homebrew/Cellar/snappy/1.1.9/lib" poetry install
```

For more details, here is what is needed:

### install pytables
```console
brew install hdf5
HDF5_DIR="/opt/homebrew/Cellar/hdf5/1.12.1/" poetry run pip install tables
```

### install python-snappy
```console
brew install snappy
CPPFLAGS="-I/opt/homebrew/Cellar/snappy/1.1.9/include -L/opt/homebrew/Cellar/snappy/1.1.9/lib" poetry run pip install python-snappy
```
22 changes: 0 additions & 22 deletions docs/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -86,25 +86,3 @@ If you just want to download a file, without converting it to a pandas dataframe
...
Image size: 60284 bytes
```

## Installation on macOS M1 chipset

### install everything
```console
brew install hdf5 snappy
HDF5_DIR="/opt/homebrew/Cellar/hdf5/1.12.1/" CPPFLAGS="-I/opt/homebrew/Cellar/snappy/1.1.9/include -L/opt/homebrew/Cellar/snappy/1.1.9/lib" poetry install
```

For more details, here is what is needed:

### install pytables
```console
brew install hdf5
HDF5_DIR="/opt/homebrew/Cellar/hdf5/1.12.1/" poetry run pip install tables
```

### install python-snappy
```console
brew install snappy
CPPFLAGS="-I/opt/homebrew/Cellar/snappy/1.1.9/include -L/opt/homebrew/Cellar/snappy/1.1.9/lib" poetry run pip install python-snappy
```
42 changes: 24 additions & 18 deletions peakina/cache.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
import warnings
from abc import ABCMeta, abstractmethod
from collections.abc import Callable
from contextlib import suppress
from datetime import timedelta
from enum import Enum
from functools import lru_cache, wraps
from pathlib import Path
from time import monotonic_ns, time
from typing import Any, Callable, TypedDict
from typing import Any, TypedDict

import pandas as pd

Expand All @@ -18,17 +20,25 @@ class InMemoryCached(TypedDict):

class CacheEnum(str, Enum):
MEMORY = "memory"
# FIXME: to be removed in v0.15.0
HDF = "hdf"
PICKLE = "pickle"


class Cache(metaclass=ABCMeta):
@staticmethod
def get_cache(kind: CacheEnum, *args: Any, **kwargs: Any) -> "Cache":
ALL_CACHES = {
CacheEnum.MEMORY: InMemoryCache,
CacheEnum.HDF: HDFCache,
}
return ALL_CACHES[kind](*args, **kwargs) # type: ignore[no-any-return]
if kind == CacheEnum.HDF:
warnings.warn(
"HDF Cache has been removed in v0.14.0, PickleCache will be used instead. "
"This will be an error in v0.15.0, please use CacheEnum.PICKLE instead",
DeprecationWarning,
)
kind = CacheEnum.PICKLE
if kind == CacheEnum.PICKLE:
return PickleCache(*args, **kwargs)
else:
return InMemoryCache(*args, **kwargs)

@staticmethod
def should_invalidate(
Expand Down Expand Up @@ -88,11 +98,13 @@ def delete(self, key: str) -> None:
del self._cache[key]


class HDFCache(Cache):
META_DF_KEY = "__meta__"
META_DF_KEY = "__meta__"


class PickleCache(Cache):
def __init__(self, cache_dir: str | Path) -> None:
self.cache_dir = Path(cache_dir).resolve()
self._meta_df_key = self.cache_dir / META_DF_KEY

def get_metadata(self) -> pd.DataFrame:
"""
Expand All @@ -101,20 +113,14 @@ def get_metadata(self) -> pd.DataFrame:
If metadata file is not found or is corrupted, an empty one is recreated.
"""
try:
# We manually instantiate the HDFStore to be able to close it no matter what
# See https://github.com/pandas-dev/pandas/pull/28429 for more infos
store = pd.HDFStore(self.cache_dir / self.META_DF_KEY, mode="r")
try:
metadata = pd.read_hdf(store)
finally:
store.close()
metadata = pd.read_pickle(self._meta_df_key)
except Exception: # catch all, on purpose
metadata = pd.DataFrame(columns=["key", "mtime", "created_at"])
self.set_metadata(metadata)
return metadata

def set_metadata(self, df: pd.DataFrame) -> None:
df.to_hdf(self.cache_dir / self.META_DF_KEY, self.META_DF_KEY, mode="w")
df.to_pickle(self._meta_df_key)

def get(
self, key: str, mtime: float | None = None, expire: timedelta | None = None
Expand All @@ -135,7 +141,7 @@ def get(
self.delete(key)

try:
return pd.read_hdf(self.cache_dir / key)
return pd.read_pickle(self.cache_dir / key)
except FileNotFoundError:
raise KeyError(key)

Expand All @@ -148,7 +154,7 @@ def set(self, key: str, value: pd.DataFrame, mtime: float | None = None) -> None
metadata = metadata[metadata.key != key] # drop duplicates
metadata = pd.concat([metadata, pd.Series(infos).to_frame().T], ignore_index=True)
self.set_metadata(metadata)
value.to_hdf(self.cache_dir / key, key, mode="w")
value.to_pickle(self.cache_dir / key)
except OSError:
self.delete(key)
raise
Expand Down
Loading

0 comments on commit 9f92582

Please sign in to comment.