Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Dataflow changes #1018

Open
wants to merge 7 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/run-forecast-unit-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,6 @@ jobs:
$CONDA/bin/conda init
source /home/runner/.bashrc
pip install -r test-requirements-operators.txt
pip install "oracle-automlx[forecasting]>=24.4.0"
pip install "oracle-automlx[forecasting]>=24.4.1"
pip install pandas>=2.2.0
python -m pytest -v -p no:warnings --durations=5 tests/operators/forecast
27 changes: 13 additions & 14 deletions ads/opctl/config/merger.py
Original file line number Diff line number Diff line change
@@ -1,35 +1,33 @@
#!/usr/bin/env python
# -*- coding: utf-8; -*-

# Copyright (c) 2022, 2023 Oracle and/or its affiliates.
# Copyright (c) 2022, 2024 Oracle and/or its affiliates.
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/

import os
from string import Template
from typing import Dict
import json

import yaml

from ads.common.auth import AuthType, ResourcePrincipal
from ads.opctl import logger
from ads.opctl.config.base import ConfigProcessor
from ads.opctl.config.utils import read_from_ini, _DefaultNoneDict
from ads.opctl.utils import is_in_notebook_session, get_service_pack_prefix
from ads.opctl.config.utils import _DefaultNoneDict, read_from_ini
from ads.opctl.constants import (
DEFAULT_PROFILE,
DEFAULT_OCI_CONFIG_FILE,
DEFAULT_CONDA_PACK_FOLDER,
DEFAULT_ADS_CONFIG_FOLDER,
ADS_JOBS_CONFIG_FILE_NAME,
ADS_CONFIG_FILE_NAME,
ADS_ML_PIPELINE_CONFIG_FILE_NAME,
ADS_DATAFLOW_CONFIG_FILE_NAME,
ADS_JOBS_CONFIG_FILE_NAME,
ADS_LOCAL_BACKEND_CONFIG_FILE_NAME,
ADS_ML_PIPELINE_CONFIG_FILE_NAME,
ADS_MODEL_DEPLOYMENT_CONFIG_FILE_NAME,
DEFAULT_NOTEBOOK_SESSION_CONDA_DIR,
BACKEND_NAME,
DEFAULT_ADS_CONFIG_FOLDER,
DEFAULT_CONDA_PACK_FOLDER,
DEFAULT_NOTEBOOK_SESSION_CONDA_DIR,
DEFAULT_OCI_CONFIG_FILE,
DEFAULT_PROFILE,
)
from ads.opctl.utils import get_service_pack_prefix, is_in_notebook_session


class ConfigMerger(ConfigProcessor):
Expand All @@ -41,8 +39,9 @@ class ConfigMerger(ConfigProcessor):
"""

def process(self, **kwargs) -> None:
config_string = Template(json.dumps(self.config)).safe_substitute(os.environ)
self.config = json.loads(config_string)
for key, value in self.config.items():
if isinstance(value, str): # Substitute only if the value is a string
self.config[key] = Template(value).safe_substitute(os.environ)

if "runtime" not in self.config:
self.config["runtime"] = {}
Expand Down
8 changes: 4 additions & 4 deletions ads/opctl/operator/common/operator_config.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
#!/usr/bin/env python
# -*- coding: utf-8; -*-

# Copyright (c) 2023 Oracle and/or its affiliates.
# Copyright (c) 2023, 2024 Oracle and/or its affiliates.
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/


Expand All @@ -11,15 +10,16 @@
from typing import Any, Dict, List

from ads.common.serializer import DataClassSerializable

from ads.opctl.operator.common.utils import OperatorValidator
from ads.opctl.operator.common.errors import InvalidParameterError
from ads.opctl.operator.common.utils import OperatorValidator


@dataclass(repr=True)
class InputData(DataClassSerializable):
"""Class representing operator specification input data details."""

connect_args: Dict = None
data: Dict = None
format: str = None
columns: List[str] = None
url: str = None
Expand Down
6 changes: 5 additions & 1 deletion ads/opctl/operator/lowcode/common/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ def load_data(data_spec, storage_options=None, **kwargs):
if data_spec is None:
raise InvalidParameterError("No details provided for this data source.")
filename = data_spec.url
data = data_spec.data
format = data_spec.format
columns = data_spec.columns
connect_args = data_spec.connect_args
Expand All @@ -53,7 +54,10 @@ def load_data(data_spec, storage_options=None, **kwargs):
if vault_secret_id is not None and connect_args is None:
connect_args = dict()

if filename is not None:
if data is not None:
if format == "spark":
data = data.toPandas()
elif filename is not None:
if not format:
_, format = os.path.splitext(filename)
format = format[1:]
Expand Down
36 changes: 20 additions & 16 deletions ads/opctl/operator/lowcode/forecast/model/automlx.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,22 +81,6 @@ def _build_model(self) -> pd.DataFrame:

from automlx import Pipeline, init

cpu_count = os.cpu_count()
try:
if cpu_count < 4:
engine = "local"
engine_opts = None
else:
engine = "ray"
engine_opts = ({"ray_setup": {"_temp_dir": "/tmp/ray-temp"}},)
init(
engine=engine,
engine_opts=engine_opts,
loglevel=logging.CRITICAL,
)
except Exception as e:
logger.info(f"Error. Has Ray already been initialized? Skipping. {e}")

full_data_dict = self.datasets.get_data_by_series()

self.models = {}
Expand All @@ -112,6 +96,26 @@ def _build_model(self) -> pd.DataFrame:
# Clean up kwargs for pass through
model_kwargs_cleaned, time_budget = self.set_kwargs()

cpu_count = os.cpu_count()
try:
engine_type = model_kwargs_cleaned.pop(
"engine", "local" if cpu_count <= 4 else "ray"
)
engine_opts = (
None
if engine_type == "local"
else ({"ray_setup": {"_temp_dir": "/tmp/ray-temp"}},)
)
init(
engine=engine_type,
engine_opts=engine_opts,
loglevel=logging.CRITICAL,
)
except Exception as e:
logger.info(
f"Error initializing automlx. Has Ray already been initialized? Skipping. {e}"
)

for s_id, df in full_data_dict.items():
try:
logger.debug(f"Running automlx on series {s_id}")
Expand Down
33 changes: 26 additions & 7 deletions ads/opctl/operator/lowcode/forecast/model/base_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,11 +43,11 @@

from ..const import (
AUTO_SELECT,
BACKTEST_REPORT_NAME,
SUMMARY_METRICS_HORIZON_LIMIT,
SpeedAccuracyMode,
SupportedMetrics,
SupportedModels,
BACKTEST_REPORT_NAME
)
from ..operator_config import ForecastOperatorConfig, ForecastOperatorSpec
from .forecast_datasets import ForecastDatasets
Expand Down Expand Up @@ -259,7 +259,11 @@ def generate_report(self):
output_dir = self.spec.output_directory.url
file_path = f"{output_dir}/{BACKTEST_REPORT_NAME}"
if self.spec.model == AUTO_SELECT:
backtest_sections.append(rc.Heading("Auto-Select Backtesting and Performance Metrics", level=2))
backtest_sections.append(
rc.Heading(
"Auto-Select Backtesting and Performance Metrics", level=2
)
)
if not os.path.exists(file_path):
failure_msg = rc.Text(
"auto-select could not be executed. Please check the "
Expand All @@ -268,15 +272,23 @@ def generate_report(self):
backtest_sections.append(failure_msg)
else:
backtest_stats = pd.read_csv(file_path)
model_metric_map = backtest_stats.drop(columns=['metric', 'backtest'])
average_dict = {k: round(v, 4) for k, v in model_metric_map.mean().to_dict().items()}
model_metric_map = backtest_stats.drop(
columns=["metric", "backtest"]
)
average_dict = {
k: round(v, 4)
for k, v in model_metric_map.mean().to_dict().items()
}
best_model = min(average_dict, key=average_dict.get)
summary_text = rc.Text(
f"Overall, the average {self.spec.metric} scores for the models are {average_dict}, with"
f" {best_model} being identified as the top-performing model during backtesting.")
f" {best_model} being identified as the top-performing model during backtesting."
)
backtest_table = rc.DataTable(backtest_stats, index=True)
liner_plot = get_auto_select_plot(backtest_stats)
backtest_sections.extend([backtest_table, summary_text, liner_plot])
backtest_sections.extend(
[backtest_table, summary_text, liner_plot]
)

forecast_plots = []
if len(self.forecast_output.list_series_ids()) > 0:
Expand All @@ -301,7 +313,14 @@ def generate_report(self):
forecast_plots = [forecast_text, forecast_sec]

yaml_appendix_title = rc.Heading("Reference: YAML File", level=2)
yaml_appendix = rc.Yaml(self.config.to_dict())
config_dict = self.config.to_dict()
# pop the data in case it isn't json serializable
config_dict["spec"]["historical_data"].pop("data")
if config_dict["spec"].get("additional_data"):
config_dict["spec"]["additional_data"].pop("data")
if config_dict["spec"].get("test_data"):
config_dict["spec"]["test_data"].pop("data")
yaml_appendix = rc.Yaml(config_dict)
report_sections = (
[summary]
+ backtest_sections
Expand Down
6 changes: 1 addition & 5 deletions ads/opctl/operator/lowcode/forecast/model/prophet.py
Original file line number Diff line number Diff line change
Expand Up @@ -358,11 +358,7 @@ def _generate_report(self):
logger.debug(f"Full Traceback: {traceback.format_exc()}")

model_description = rc.Text(
"Prophet is a procedure for forecasting time series data based on an additive "
"model where non-linear trends are fit with yearly, weekly, and daily seasonality, "
"plus holiday effects. It works best with time series that have strong seasonal "
"effects and several seasons of historical data. Prophet is robust to missing "
"data and shifts in the trend, and typically handles outliers well."
"""Prophet is a procedure for forecasting time series data based on an additive model where non-linear trends are fit with yearly, weekly, and daily seasonality, plus holiday effects. It works best with time series that have strong seasonal effects and several seasons of historical data. Prophet is robust to missing data and shifts in the trend, and typically handles outliers well."""
)
other_sections = all_sections

Expand Down
12 changes: 12 additions & 0 deletions ads/opctl/operator/lowcode/forecast/schema.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,9 @@ spec:
nullable: true
required: false
type: dict
data:
nullable: true
required: false
format:
allowed:
- csv
Expand All @@ -48,6 +51,7 @@ spec:
- sql_query
- hdf
- tsv
- pandas
required: false
type: string
columns:
Expand Down Expand Up @@ -92,6 +96,9 @@ spec:
nullable: true
required: false
type: dict
data:
nullable: true
required: false
format:
allowed:
- csv
Expand All @@ -103,6 +110,7 @@ spec:
- sql_query
- hdf
- tsv
- pandas
required: false
type: string
columns:
Expand Down Expand Up @@ -146,6 +154,9 @@ spec:
nullable: true
required: false
type: dict
data:
nullable: true
required: false
format:
allowed:
- csv
Expand All @@ -157,6 +168,7 @@ spec:
- sql_query
- hdf
- tsv
- pandas
required: false
type: string
columns:
Expand Down
11 changes: 5 additions & 6 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -157,27 +157,26 @@ forecast = [
"oci-cli",
"py-cpuinfo",
"rich",
"autots[additional]",
"autots",
"mlforecast",
"neuralprophet>=0.7.0",
"numpy<2.0.0",
"oci-cli",
"optuna",
"oracle-ads",
"pmdarima",
"prophet",
"shap",
"sktime",
"statsmodels",
"plotly",
"oracledb",
"report-creator==1.0.28",
"report-creator==1.0.32",
]
anomaly = [
"oracle_ads[opctl]",
"autots",
"oracledb",
"report-creator==1.0.28",
"report-creator==1.0.32",
"rrcf==0.4.4",
"scikit-learn<1.6.0",
"salesforce-merlion[all]==2.0.4"
Expand All @@ -186,7 +185,7 @@ recommender = [
"oracle_ads[opctl]",
"scikit-surprise",
"plotly",
"report-creator==1.0.28",
"report-creator==1.0.32",
]
feature-store-marketplace = [
"oracle-ads[opctl]",
Expand All @@ -202,7 +201,7 @@ pii = [
"scrubadub_spacy",
"spacy-transformers==1.2.5",
"spacy==3.6.1",
"report-creator==1.0.28",
"report-creator==1.0.32",
]
llm = ["langchain>=0.2", "langchain-community", "langchain_openai", "pydantic>=2,<3", "evaluate>=0.4.0"]
aqua = ["jupyter_server"]
Expand Down
Loading
Loading