From 9bdaf4056930780bfa3b95288d9847e6ef204880 Mon Sep 17 00:00:00 2001 From: diehlbw Date: Thu, 27 Jun 2024 13:45:55 +0000 Subject: [PATCH] expose inmemory to startup --- changelog/34.feature.rst | 1 + src/seismometer/__init__.py | 23 +++++++++-- src/seismometer/configuration/config.py | 7 ++++ src/seismometer/seismogram.py | 51 ++++++++++++++++++++++--- 4 files changed, 74 insertions(+), 8 deletions(-) create mode 100644 changelog/34.feature.rst diff --git a/changelog/34.feature.rst b/changelog/34.feature.rst new file mode 100644 index 0000000..902b9e0 --- /dev/null +++ b/changelog/34.feature.rst @@ -0,0 +1 @@ +* seismometer.run_startup() can now accept preloaded prediction and event dataframes that take precedence over loading from configuration diff --git a/src/seismometer/__init__.py b/src/seismometer/__init__.py index 15e6aa8..d143f2f 100644 --- a/src/seismometer/__init__.py +++ b/src/seismometer/__init__.py @@ -1,11 +1,22 @@ import logging from pathlib import Path +from typing import Optional + +import pandas as pd from seismometer._version import __version__ from seismometer.core.logger import add_log_formatter, set_default_logger_config -def run_startup(*, config_path: str | Path = None, output_path: str | Path = None, log_level: int = logging.WARN): +def run_startup( + *, + config_path: str | Path = None, + output_path: str | Path = None, + predictions_frame: Optional[pd.DataFrame] = None, + events_frame: Optional[pd.DataFrame] = None, + definitions: Optional[dict] = None, + log_level: int = logging.WARN, +): """ Runs the required startup for instantiating seismometer. @@ -16,6 +27,12 @@ def run_startup(*, config_path: str | Path = None, output_path: str | Path = Non output_path : Optional[str | Path], optional An output path to write data to, overwriting the default path specified by info_dir in config.yml, by default None. 
+ predictions_frame : Optional[pd.DataFrame], optional + An optional DataFrame containing the fully loaded predictions data, by default None. + By default, when not specified here, these data will be loaded based on configuration. + events_frame : Optional[pd.DataFrame], optional + An optional DataFrame containing the fully loaded events data, by default None. + By default, when not specified here, these data will be loaded based on configuration. log_level : logging._Level, optional The log level to set. by default, logging.WARN. """ @@ -31,8 +48,8 @@ def run_startup(*, config_path: str | Path = None, output_path: str | Path = Non logger.setLevel(log_level) logger.info(f"seismometer version {__version__} starting") - sg = Seismogram(config_path, output_path) - sg.load_data() + sg = Seismogram(config_path, output_path, definitions=definitions) + sg.load_data(predictions=predictions_frame, events=events_frame) # Surface api into namespace s_module = importlib.import_module("seismometer._api") diff --git a/src/seismometer/configuration/config.py b/src/seismometer/configuration/config.py index e115811..0427660 100644 --- a/src/seismometer/configuration/config.py +++ b/src/seismometer/configuration/config.py @@ -37,6 +37,8 @@ class ConfigProvider: Specifies the template notebook name to use during building, by default None; it uses "template" from the primary config file. This is the template that will be used as a base for building the final notebook. + definitions : Optional[dict], optional + A dictionary of definitions to use instead of loading those specified by configuration, by default None. 
""" @@ -47,6 +49,7 @@ def __init__( info_dir: str | Path = None, data_dir: str | Path = None, template_notebook: Option = None, + definitions: dict = None, ): self._config: OtherInfo = None self._usage: DataUsage = None @@ -55,6 +58,10 @@ def __init__( self._output_dir: Path = None self._output_notebook: str = "" + if definitions is not None: + self._prediction_defs = PredictionDictionary(predictions=definitions.pop("predictions", [])) + self._event_defs = EventDictionary(events=definitions.pop("events", None)) + self._load_config_config(config_config) self._resolve_other_paths(usage_config, info_dir, data_dir) self._override_template(template_notebook) diff --git a/src/seismometer/seismogram.py b/src/seismometer/seismogram.py index 7ee08f4..69efd5f 100644 --- a/src/seismometer/seismogram.py +++ b/src/seismometer/seismogram.py @@ -50,7 +50,12 @@ class Seismogram(object, metaclass=Singleton): output_list: list[str] """ The list of columns representing model outputs.""" - def __init__(self, config_path: Optional[str | Path] = None, output_path: Optional[str | Path] = None): + def __init__( + self, + config_path: Optional[str | Path] = None, + output_path: Optional[str | Path] = None, + definitions: Optional[dict] = None, + ): """ Constructor for Seismogram, which can only be instantiated once. @@ -63,6 +68,8 @@ def __init__(self, config_path: Optional[str | Path] = None, output_path: Option output_path : str or Path, optional Override location to place resulting data and report files. Defaults to the config.yml info_dir, and then the notebook's output directory. + definitions : dict, optional + Additional definitions to be used instead of loading based on configuration, by default None. 
""" if config_path is None: @@ -73,13 +80,36 @@ def __init__(self, config_path: Optional[str | Path] = None, output_path: Option self.cohort_cols: list[str] = [] self.config_path = config_path - self.load_config(config_path) + self.load_config(config_path, definitions=definitions) self.config.set_output(output_path) self.config.output_dir.mkdir(parents=True, exist_ok=True) self.dataloader = loader_factory(self.config) - def load_data(self, predictions=None, events=None): + def load_data( + self, *, predictions: Optional[pd.DataFrame] = None, events: Optional[pd.DataFrame] = None, reset: bool = False + ): + """ + Loads the seismogram data. + + Uses the passed in frames if they are specified, otherwise uses configuration to load data. + If data is already loaded, does not change state unless reset is true. + + Parameters + ---------- + predictions : pd.DataFrame, optional + The fully prepared predictions dataframe, by default None. + Uses this when specified, otherwise loads based on configuration. + events : pd.DataFrame, optional + The pre-loaded events dataframe, by default None. + Uses this when specified, otherwise loads based on configuration. + reset : bool, optional + Flag when set to true will overwrite existing dataframe, by default False + """ + if self.dataframe and not reset: + logger.debug("Data already loaded; pass reset=True to clear data and re-evaluate.") + return + self._load_metadata() self.dataframe = self.dataloader.load_data(predictions, events) @@ -273,8 +303,19 @@ def score_bins(self): # endregion # region initialization and preprocessing (this region knows about config) - def load_config(self, config_path: Path): - self.config = ConfigProvider(config_path) + def load_config(self, config_path: Path, definitions: Optional[dict] = None): + """ + Loads the base configuration and alerting configuration + + Parameters + ---------- + config_path : Path + The location of the main configuration file. 
+ definitions : Optional[dict], optional + An optional dictionary containing both events and predictions lists, by default None. + If not passed, these will be loaded based on configuration. + """ + self.config = ConfigProvider(config_path, definitions=definitions) self.alert_config = AlertConfigProvider(config_path) if len(self.config.cohorts) == 0: