Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Dataflow changes #1018

Open
wants to merge 7 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/run-forecast-unit-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,6 @@ jobs:
$CONDA/bin/conda init
source /home/runner/.bashrc
pip install -r test-requirements-operators.txt
pip install "oracle-automlx[forecasting]>=24.4.0"
pip install "oracle-automlx[forecasting]>=24.4.1"
pip install pandas>=2.2.0
python -m pytest -v -p no:warnings --durations=5 tests/operators/forecast
27 changes: 13 additions & 14 deletions ads/opctl/config/merger.py
Original file line number Diff line number Diff line change
@@ -1,35 +1,33 @@
#!/usr/bin/env python
# -*- coding: utf-8; -*-

# Copyright (c) 2022, 2023 Oracle and/or its affiliates.
# Copyright (c) 2022, 2024 Oracle and/or its affiliates.
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/

import os
from string import Template
from typing import Dict
import json

import yaml

from ads.common.auth import AuthType, ResourcePrincipal
from ads.opctl import logger
from ads.opctl.config.base import ConfigProcessor
from ads.opctl.config.utils import read_from_ini, _DefaultNoneDict
from ads.opctl.utils import is_in_notebook_session, get_service_pack_prefix
from ads.opctl.config.utils import _DefaultNoneDict, read_from_ini
from ads.opctl.constants import (
DEFAULT_PROFILE,
DEFAULT_OCI_CONFIG_FILE,
DEFAULT_CONDA_PACK_FOLDER,
DEFAULT_ADS_CONFIG_FOLDER,
ADS_JOBS_CONFIG_FILE_NAME,
ADS_CONFIG_FILE_NAME,
ADS_ML_PIPELINE_CONFIG_FILE_NAME,
ADS_DATAFLOW_CONFIG_FILE_NAME,
ADS_JOBS_CONFIG_FILE_NAME,
ADS_LOCAL_BACKEND_CONFIG_FILE_NAME,
ADS_ML_PIPELINE_CONFIG_FILE_NAME,
ADS_MODEL_DEPLOYMENT_CONFIG_FILE_NAME,
DEFAULT_NOTEBOOK_SESSION_CONDA_DIR,
BACKEND_NAME,
DEFAULT_ADS_CONFIG_FOLDER,
DEFAULT_CONDA_PACK_FOLDER,
DEFAULT_NOTEBOOK_SESSION_CONDA_DIR,
DEFAULT_OCI_CONFIG_FILE,
DEFAULT_PROFILE,
)
from ads.opctl.utils import get_service_pack_prefix, is_in_notebook_session


class ConfigMerger(ConfigProcessor):
Expand All @@ -41,8 +39,9 @@ class ConfigMerger(ConfigProcessor):
"""

def process(self, **kwargs) -> None:
config_string = Template(json.dumps(self.config)).safe_substitute(os.environ)
self.config = json.loads(config_string)
for key, value in self.config.items():
if isinstance(value, str): # Substitute only if the value is a string
self.config[key] = Template(value).safe_substitute(os.environ)

if "runtime" not in self.config:
self.config["runtime"] = {}
Expand Down
8 changes: 4 additions & 4 deletions ads/opctl/operator/common/operator_config.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
#!/usr/bin/env python
# -*- coding: utf-8; -*-

# Copyright (c) 2023 Oracle and/or its affiliates.
# Copyright (c) 2023, 2024 Oracle and/or its affiliates.
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/


Expand All @@ -11,15 +10,16 @@
from typing import Any, Dict, List

from ads.common.serializer import DataClassSerializable

from ads.opctl.operator.common.utils import OperatorValidator
from ads.opctl.operator.common.errors import InvalidParameterError
from ads.opctl.operator.common.utils import OperatorValidator


@dataclass(repr=True)
class InputData(DataClassSerializable):
"""Class representing operator specification input data details."""

connect_args: Dict = None
data: Dict = None
format: str = None
columns: List[str] = None
url: str = None
Expand Down
6 changes: 5 additions & 1 deletion ads/opctl/operator/lowcode/common/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ def load_data(data_spec, storage_options=None, **kwargs):
if data_spec is None:
raise InvalidParameterError("No details provided for this data source.")
filename = data_spec.url
data = data_spec.data
format = data_spec.format
columns = data_spec.columns
connect_args = data_spec.connect_args
Expand All @@ -53,7 +54,10 @@ def load_data(data_spec, storage_options=None, **kwargs):
if vault_secret_id is not None and connect_args is None:
connect_args = dict()

if filename is not None:
if data is not None:
if format == "spark":
data = data.toPandas()
elif filename is not None:
if not format:
_, format = os.path.splitext(filename)
format = format[1:]
Expand Down
36 changes: 20 additions & 16 deletions ads/opctl/operator/lowcode/forecast/model/automlx.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,22 +81,6 @@ def _build_model(self) -> pd.DataFrame:

from automlx import Pipeline, init

cpu_count = os.cpu_count()
try:
if cpu_count < 4:
engine = "local"
engine_opts = None
else:
engine = "ray"
engine_opts = ({"ray_setup": {"_temp_dir": "/tmp/ray-temp"}},)
init(
engine=engine,
engine_opts=engine_opts,
loglevel=logging.CRITICAL,
)
except Exception as e:
logger.info(f"Error. Has Ray already been initialized? Skipping. {e}")

full_data_dict = self.datasets.get_data_by_series()

self.models = {}
Expand All @@ -112,6 +96,26 @@ def _build_model(self) -> pd.DataFrame:
# Clean up kwargs for pass through
model_kwargs_cleaned, time_budget = self.set_kwargs()

cpu_count = os.cpu_count()
try:
engine_type = model_kwargs_cleaned.pop(
"engine", "local" if cpu_count <= 4 else "ray"
)
engine_opts = (
None
if engine_type == "local"
else ({"ray_setup": {"_temp_dir": "/tmp/ray-temp"}},)
)
init(
engine=engine_type,
engine_opts=engine_opts,
loglevel=logging.CRITICAL,
)
except Exception as e:
logger.info(
f"Error initializing automlx. Has Ray already been initialized? Skipping. {e}"
)

for s_id, df in full_data_dict.items():
try:
logger.debug(f"Running automlx on series {s_id}")
Expand Down
33 changes: 26 additions & 7 deletions ads/opctl/operator/lowcode/forecast/model/base_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,11 +43,11 @@

from ..const import (
AUTO_SELECT,
BACKTEST_REPORT_NAME,
SUMMARY_METRICS_HORIZON_LIMIT,
SpeedAccuracyMode,
SupportedMetrics,
SupportedModels,
BACKTEST_REPORT_NAME
)
from ..operator_config import ForecastOperatorConfig, ForecastOperatorSpec
from .forecast_datasets import ForecastDatasets
Expand Down Expand Up @@ -259,7 +259,11 @@ def generate_report(self):
output_dir = self.spec.output_directory.url
file_path = f"{output_dir}/{BACKTEST_REPORT_NAME}"
if self.spec.model == AUTO_SELECT:
backtest_sections.append(rc.Heading("Auto-Select Backtesting and Performance Metrics", level=2))
backtest_sections.append(
rc.Heading(
"Auto-Select Backtesting and Performance Metrics", level=2
)
)
if not os.path.exists(file_path):
failure_msg = rc.Text(
"auto-select could not be executed. Please check the "
Expand All @@ -268,15 +272,23 @@ def generate_report(self):
backtest_sections.append(failure_msg)
else:
backtest_stats = pd.read_csv(file_path)
model_metric_map = backtest_stats.drop(columns=['metric', 'backtest'])
average_dict = {k: round(v, 4) for k, v in model_metric_map.mean().to_dict().items()}
model_metric_map = backtest_stats.drop(
columns=["metric", "backtest"]
)
average_dict = {
k: round(v, 4)
for k, v in model_metric_map.mean().to_dict().items()
}
best_model = min(average_dict, key=average_dict.get)
summary_text = rc.Text(
f"Overall, the average {self.spec.metric} scores for the models are {average_dict}, with"
f" {best_model} being identified as the top-performing model during backtesting.")
f" {best_model} being identified as the top-performing model during backtesting."
)
backtest_table = rc.DataTable(backtest_stats, index=True)
liner_plot = get_auto_select_plot(backtest_stats)
backtest_sections.extend([backtest_table, summary_text, liner_plot])
backtest_sections.extend(
[backtest_table, summary_text, liner_plot]
)

forecast_plots = []
if len(self.forecast_output.list_series_ids()) > 0:
Expand All @@ -301,7 +313,14 @@ def generate_report(self):
forecast_plots = [forecast_text, forecast_sec]

yaml_appendix_title = rc.Heading("Reference: YAML File", level=2)
yaml_appendix = rc.Yaml(self.config.to_dict())
config_dict = self.config.to_dict()
# pop the data in case it isn't json serializable
config_dict["spec"]["historical_data"].pop("data")
if config_dict["spec"].get("additional_data"):
config_dict["spec"]["additional_data"].pop("data")
if config_dict["spec"].get("test_data"):
config_dict["spec"]["test_data"].pop("data")
yaml_appendix = rc.Yaml(config_dict)
report_sections = (
[summary]
+ backtest_sections
Expand Down
6 changes: 1 addition & 5 deletions ads/opctl/operator/lowcode/forecast/model/prophet.py
Original file line number Diff line number Diff line change
Expand Up @@ -358,11 +358,7 @@ def _generate_report(self):
logger.debug(f"Full Traceback: {traceback.format_exc()}")

model_description = rc.Text(
"Prophet is a procedure for forecasting time series data based on an additive "
"model where non-linear trends are fit with yearly, weekly, and daily seasonality, "
"plus holiday effects. It works best with time series that have strong seasonal "
"effects and several seasons of historical data. Prophet is robust to missing "
"data and shifts in the trend, and typically handles outliers well."
"""Prophet is a procedure for forecasting time series data based on an additive model where non-linear trends are fit with yearly, weekly, and daily seasonality, plus holiday effects. It works best with time series that have strong seasonal effects and several seasons of historical data. Prophet is robust to missing data and shifts in the trend, and typically handles outliers well."""
)
other_sections = all_sections

Expand Down
12 changes: 12 additions & 0 deletions ads/opctl/operator/lowcode/forecast/schema.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,9 @@ spec:
nullable: true
required: false
type: dict
data:
nullable: true
required: false
format:
allowed:
- csv
Expand All @@ -48,6 +51,7 @@ spec:
- sql_query
- hdf
- tsv
- pandas
required: false
type: string
columns:
Expand Down Expand Up @@ -92,6 +96,9 @@ spec:
nullable: true
required: false
type: dict
data:
nullable: true
required: false
format:
allowed:
- csv
Expand All @@ -103,6 +110,7 @@ spec:
- sql_query
- hdf
- tsv
- pandas
required: false
type: string
columns:
Expand Down Expand Up @@ -146,6 +154,9 @@ spec:
nullable: true
required: false
type: dict
data:
nullable: true
required: false
format:
allowed:
- csv
Expand All @@ -157,6 +168,7 @@ spec:
- sql_query
- hdf
- tsv
- pandas
required: false
type: string
columns:
Expand Down
11 changes: 5 additions & 6 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -157,27 +157,26 @@ forecast = [
"oci-cli",
"py-cpuinfo",
"rich",
"autots[additional]",
"autots",
"mlforecast",
"neuralprophet>=0.7.0",
"numpy<2.0.0",
"oci-cli",
"optuna",
"oracle-ads",
"pmdarima",
"prophet",
"shap",
"sktime",
"statsmodels",
"plotly",
"oracledb",
"report-creator==1.0.28",
"report-creator==1.0.32",
]
anomaly = [
"oracle_ads[opctl]",
"autots",
"oracledb",
"report-creator==1.0.28",
"report-creator==1.0.32",
"rrcf==0.4.4",
"scikit-learn<1.6.0",
"salesforce-merlion[all]==2.0.4"
Expand All @@ -186,7 +185,7 @@ recommender = [
"oracle_ads[opctl]",
"scikit-surprise",
"plotly",
"report-creator==1.0.28",
"report-creator==1.0.32",
]
feature-store-marketplace = [
"oracle-ads[opctl]",
Expand All @@ -202,7 +201,7 @@ pii = [
"scrubadub_spacy",
"spacy-transformers==1.2.5",
"spacy==3.6.1",
"report-creator==1.0.28",
"report-creator==1.0.32",
]
llm = ["langchain>=0.2", "langchain-community", "langchain_openai", "pydantic>=2,<3", "evaluate>=0.4.0"]
aqua = ["jupyter_server"]
Expand Down
Loading
Loading