expose inmemory to startup

diehlbw · Jun 27, 2024 · 9bdaf40 · 9bdaf40
1 parent f6d490f
commit 9bdaf40
Show file tree

Hide file tree

Showing 4 changed files with 74 additions and 8 deletions.
diff --git a/changelog/34.feature.rst b/changelog/34.feature.rst
@@ -0,0 +1 @@
+* seismometer.run_startup() can now accept preloaded prediction and event dataframes that take precedence over loading from configuration
diff --git a/src/seismometer/__init__.py b/src/seismometer/__init__.py
@@ -1,11 +1,22 @@
 import logging
 from pathlib import Path
+from typing import Optional
+
+import pandas as pd
 
 from seismometer._version import __version__
 from seismometer.core.logger import add_log_formatter, set_default_logger_config
 
 
-def run_startup(*, config_path: str | Path = None, output_path: str | Path = None, log_level: int = logging.WARN):
+def run_startup(
+    *,
+    config_path: str | Path = None,
+    output_path: str | Path = None,
+    predictions_frame: Optional[pd.DataFrame] = None,
+    events_frame: Optional[pd.DataFrame] = None,
+    definitions: Optional[dict] = None,
+    log_level: int = logging.WARN,
+):
     """
     Runs the required startup for instantiating seismometer.
 
@@ -16,6 +27,12 @@ def run_startup(*, config_path: str | Path = None, output_path: str | Path = Non
     output_path : Optional[str | Path], optional
         An output path to write data to, overwriting the default path specified by info_dir in config.yml,
         by default None.
+    predictions_frame : Optional[pd.DataFrame], optional
+        An optional DataFrame containing the fully loaded predictions data, by default None.
+        By default, when not specified here, these data will be loaded based on configuration.
+    events_frame : Optional[pd.DataFrame], optional
+        An optional DataFrame containing the fully loaded events data, by default None.
+        By default, when not specified here, these data will be loaded based on configuration.
     log_level : logging._Level, optional
         The log level to set. by default, logging.WARN.
     """
@@ -31,8 +48,8 @@ def run_startup(*, config_path: str | Path = None, output_path: str | Path = Non
     logger.setLevel(log_level)
     logger.info(f"seismometer version {__version__} starting")
 
-    sg = Seismogram(config_path, output_path)
-    sg.load_data()
+    sg = Seismogram(config_path, output_path, definitions=definitions)
+    sg.load_data(predictions=predictions_frame, events=events_frame)
 
     # Surface api into namespace
     s_module = importlib.import_module("seismometer._api")

diff --git a/src/seismometer/configuration/config.py b/src/seismometer/configuration/config.py
@@ -37,6 +37,8 @@ class ConfigProvider:
         Specifies the template notebook name to use during building, by default None; it uses "template" from the
         primary config file.
         This is the template that will be used as a base for building the final notebook.
+    definitions : Optional[dict], optional
+        A dictionary of definitions to use instead of loading those specified by configuration, by default None.
 
     """
 
@@ -47,6 +49,7 @@ def __init__(
         info_dir: str | Path = None,
         data_dir: str | Path = None,
         template_notebook: Option = None,
+        definitions: dict = None,
     ):
         self._config: OtherInfo = None
         self._usage: DataUsage = None
@@ -55,6 +58,10 @@ def __init__(
         self._output_dir: Path = None
         self._output_notebook: str = ""
 
+        if definitions is not None:
+            self._prediction_defs = PredictionDictionary(predictions=definitions.pop("predictions", []))
+            self._event_defs = EventDictionary(events=definitions.pop("events", None))
+
         self._load_config_config(config_config)
         self._resolve_other_paths(usage_config, info_dir, data_dir)
         self._override_template(template_notebook)

diff --git a/src/seismometer/seismogram.py b/src/seismometer/seismogram.py
@@ -50,7 +50,12 @@ class Seismogram(object, metaclass=Singleton):
     output_list: list[str]
     """ The list of columns representing model outputs."""
 
-    def __init__(self, config_path: Optional[str | Path] = None, output_path: Optional[str | Path] = None):
+    def __init__(
+        self,
+        config_path: Optional[str | Path] = None,
+        output_path: Optional[str | Path] = None,
+        definitions: Optional[dict] = None,
+    ):
         """
         Constructor for Seismogram, which can only be instantiated once.
 
@@ -63,6 +68,8 @@ def __init__(self, config_path: Optional[str | Path] = None, output_path: Option
         output_path : str or Path, optional
             Override location to place resulting data and report files.
             Defaults to the config.yml info_dir, and then the notebook's output directory.
+        definitions : dict, optional
+            Additional definitions to be used instead of loading based on configuration, by default None.
 
         """
         if config_path is None:
@@ -73,13 +80,36 @@ def __init__(self, config_path: Optional[str | Path] = None, output_path: Option
         self.cohort_cols: list[str] = []
         self.config_path = config_path
 
-        self.load_config(config_path)
+        self.load_config(config_path, definitions=definitions)
 
         self.config.set_output(output_path)
         self.config.output_dir.mkdir(parents=True, exist_ok=True)
         self.dataloader = loader_factory(self.config)
 
-    def load_data(self, predictions=None, events=None):
+    def load_data(
+        self, *, predictions: Optional[pd.DataFrame] = None, events: Optional[pd.DataFrame] = None, reset: bool = False
+    ):
+        """
+        Loads the seismogram data.
+
+        Uses the passed in frames if they are specified, otherwise uses configuration to load data.
+        If data is already loaded, does not change state unless reset is true.
+
+        Parameters
+        ----------
+        predictions : pd.DataFrame, optional
+            The fully prepared predictions dataframe, by default None.
+            Uses this when specified, otherwise loads based on configuration.
+        events : pd.DataFrame, optional
+            The pre-loaded events dataframe, by default None.
+            Uses this when specified, otherwise loads based on configuration.
+        reset : bool, optional
+            Flag when set to true will overwrite existing dataframe, by default False
+        """
+        if self.dataframe and not reset:
+            logger.debug("Data already loaded; pass reset=True to clear data and re-evaluate.")
+            return
+
         self._load_metadata()
 
         self.dataframe = self.dataloader.load_data(predictions, events)
@@ -273,8 +303,19 @@ def score_bins(self):
     # endregion
 
     # region initialization and preprocessing (this region knows about config)
-    def load_config(self, config_path: Path):
-        self.config = ConfigProvider(config_path)
+    def load_config(self, config_path: Path, definitions: Optional[dict] = None):
+        """
+        Loads the base configuration and alerting configuration
+
+        Parameters
+        ----------
+        config_path : Path
+            The location of the main configuration file.
+        definitions : Optional[dict], optional
+            An optional dictionary containing both events and predictions lists, by default None.
+            If not passed, these will be loaded based on configuration.
+        """
+        self.config = ConfigProvider(config_path, definitions=definitions)
         self.alert_config = AlertConfigProvider(config_path)
 
         if len(self.config.cohorts) == 0: