From 2366e85d0b3a507c0ff6f53df7c266d417d7af4b Mon Sep 17 00:00:00 2001 From: Kevin Zhang Date: Thu, 24 Feb 2022 16:43:29 -0800 Subject: [PATCH 01/58] State of feast Signed-off-by: Kevin Zhang --- sdk/python/feast/infra/offline_stores/bigquery.py | 2 +- sdk/python/feast/infra/passthrough_provider.py | 2 ++ sdk/python/feast/repo_config.py | 5 ++++- .../tests/integration/feature_repos/repo_configuration.py | 2 +- .../tests/integration/online_store/test_universal_online.py | 4 ++-- .../tests/integration/registration/test_universal_types.py | 6 ++++-- 6 files changed, 14 insertions(+), 7 deletions(-) diff --git a/sdk/python/feast/infra/offline_stores/bigquery.py b/sdk/python/feast/infra/offline_stores/bigquery.py index 44e62d6ad1..df82ac65f6 100644 --- a/sdk/python/feast/infra/offline_stores/bigquery.py +++ b/sdk/python/feast/infra/offline_stores/bigquery.py @@ -609,7 +609,7 @@ def _get_bigquery_client(project: Optional[str] = None, location: Optional[str] {{ featureview.name }}__base AS ( SELECT - subquery.*, + subquery.*, entity_dataframe.entity_timestamp, entity_dataframe.{{featureview.name}}__entity_row_unique_id FROM {{ featureview.name }}__subquery AS subquery diff --git a/sdk/python/feast/infra/passthrough_provider.py b/sdk/python/feast/infra/passthrough_provider.py index 3468b9dc92..bd0a0ec494 100644 --- a/sdk/python/feast/infra/passthrough_provider.py +++ b/sdk/python/feast/infra/passthrough_provider.py @@ -131,6 +131,8 @@ def materialize_single_feature_view( entities = [] for entity_name in feature_view.entities: + print("Entity Names!!") + print(entity_name) entities.append(registry.get_entity(entity_name, project)) ( diff --git a/sdk/python/feast/repo_config.py b/sdk/python/feast/repo_config.py index 3f32d18b80..18f4e9c290 100644 --- a/sdk/python/feast/repo_config.py +++ b/sdk/python/feast/repo_config.py @@ -1,3 +1,4 @@ +from curses import raw import os from pathlib import Path from typing import Any @@ -345,9 +346,11 @@ def get_feature_server_config_from_type(feature_server_type: str): def load_repo_config(repo_path: Path) -> RepoConfig: config_path = repo_path / "feature_store.yaml" - + print("INNNNNN") + print(repo_path) with open(config_path) as f: raw_config = yaml.safe_load(os.path.expandvars(f.read())) + print(raw_config) try: c = RepoConfig(**raw_config) c.repo_path = repo_path diff --git a/sdk/python/tests/integration/feature_repos/repo_configuration.py b/sdk/python/tests/integration/feature_repos/repo_configuration.py index 89aea727a6..d7e4231d7f 100644 --- a/sdk/python/tests/integration/feature_repos/repo_configuration.py +++ b/sdk/python/tests/integration/feature_repos/repo_configuration.py @@ -52,7 +52,7 @@ "type": "redis", "redis_type": "redis_cluster", # Redis Cluster Port Forwarding is setup in "pr_integration_tests.yaml" under "Setup Redis Cluster". - "connection_string": "127.0.0.1:6001,127.0.0.1:6002,127.0.0.1:6003", + "connection_string": "127.0.0.1:7001,127.0.0.1:7002,127.0.0.1:7003", } # FULL_REPO_CONFIGS contains the repo configurations (e.g. 
provider, offline store, diff --git a/sdk/python/tests/integration/online_store/test_universal_online.py b/sdk/python/tests/integration/online_store/test_universal_online.py index 90a06bb347..95c7f8d0a1 100644 --- a/sdk/python/tests/integration/online_store/test_universal_online.py +++ b/sdk/python/tests/integration/online_store/test_universal_online.py @@ -564,8 +564,8 @@ def test_online_retrieval(environment, universal_data_sources, full_feature_name ) -@pytest.mark.integration -@pytest.mark.universal +# @pytest.mark.integration +# @pytest.mark.universal def test_online_store_cleanup(environment, universal_data_sources): """ Some online store implementations (like Redis) keep features from different features views diff --git a/sdk/python/tests/integration/registration/test_universal_types.py b/sdk/python/tests/integration/registration/test_universal_types.py index 59ca119f98..74f66cf3b6 100644 --- a/sdk/python/tests/integration/registration/test_universal_types.py +++ b/sdk/python/tests/integration/registration/test_universal_types.py @@ -155,7 +155,7 @@ def test_entity_inference_types_match(offline_types_test_fixtures): ) -@pytest.mark.integration +#@pytest.mark.integration @pytest.mark.universal def test_feature_get_historical_features_types_match(offline_types_test_fixtures): environment, config, data_source, fv = offline_types_test_fixtures @@ -169,7 +169,7 @@ def test_feature_get_historical_features_types_match(offline_types_test_fixtures ) entity = driver() fs.apply([fv, entity]) - + print("Applying") features = [f"{fv.name}:value"] entity_df = pd.DataFrame() entity_df["driver_id"] = ( @@ -181,9 +181,11 @@ def test_feature_get_historical_features_types_match(offline_types_test_fixtures ts - timedelta(hours=4), ts - timedelta(hours=2), ] + print("Retrieving historical features") historical_features = fs.get_historical_features( entity_df=entity_df, features=features, ) + print("retrieved historical features") # Note: Pandas doesn't play well with nan values in ints. 
BQ will also coerce to floats if there are NaNs historical_features_df = historical_features.to_df() print(historical_features_df) From 20e82ea4eda93a2e689294b3d5f88900b2a3dd94 Mon Sep 17 00:00:00 2001 From: Kevin Zhang Date: Fri, 25 Feb 2022 16:19:45 -0800 Subject: [PATCH 02/58] Clean up changes Signed-off-by: Kevin Zhang --- .../feast/infra/passthrough_provider.py | 6 +----- sdk/python/feast/infra/provider.py | 2 -- sdk/python/feast/type_map.py | 21 +++++++++++-------- .../test_universal_historical_retrieval.py | 3 --- .../integration/registration/test_cli.py | 1 + .../registration/test_universal_types.py | 5 ----- 6 files changed, 14 insertions(+), 24 deletions(-) diff --git a/sdk/python/feast/infra/passthrough_provider.py b/sdk/python/feast/infra/passthrough_provider.py index bd0a0ec494..1fdb37ded3 100644 --- a/sdk/python/feast/infra/passthrough_provider.py +++ b/sdk/python/feast/infra/passthrough_provider.py @@ -1,4 +1,5 @@ from datetime import datetime, timedelta +from multiprocessing.dummy import JoinableQueue from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Union import pandas @@ -131,8 +132,6 @@ def materialize_single_feature_view( entities = [] for entity_name in feature_view.entities: - print("Entity Names!!") - print(entity_name) entities.append(registry.get_entity(entity_name, project)) ( @@ -154,12 +153,9 @@ def materialize_single_feature_view( ) table = offline_job.to_arrow() - if feature_view.batch_source.field_mapping is not None: table = _run_field_mapping(table, feature_view.batch_source.field_mapping) - join_keys = {entity.join_key: entity.value_type for entity in entities} - with tqdm_builder(table.num_rows) as pbar: for batch in table.to_batches(DEFAULT_BATCH_SIZE): rows_to_write = _convert_arrow_to_proto(batch, feature_view, join_keys) diff --git a/sdk/python/feast/infra/provider.py b/sdk/python/feast/infra/provider.py index b3f1029242..c4265bbe86 100644 --- a/sdk/python/feast/infra/provider.py +++ b/sdk/python/feast/infra/provider.py @@ -351,14 +351,12 @@ def _convert_arrow_to_proto( columns = [(f.name, f.dtype) for f in feature_view.features] + list( join_keys.items() ) - proto_values_by_column = { column: python_values_to_proto_values( table.column(column).to_numpy(zero_copy_only=False), value_type ) for column, value_type in columns } - entity_keys = [ EntityKeyProto( join_keys=join_keys, diff --git a/sdk/python/feast/type_map.py b/sdk/python/feast/type_map.py index 82827bce2a..0cb2f413b6 100644 --- a/sdk/python/feast/type_map.py +++ b/sdk/python/feast/type_map.py @@ -326,13 +326,17 @@ def _python_value_to_proto_value( ProtoValue(unix_timestamp_list_val=Int64List(val=ts)) # type: ignore for ts in int_timestamps_lists ] - - return [ - ProtoValue(**{field_name: proto_type(val=value)}) # type: ignore - if value is not None - else ProtoValue() - for value in values - ] + # TODO: Make this better. 
+ val_list = [] + for value in values: + if(value.dtype == "bool"): + value = [bool(e) for e in value] + val_list.append(ProtoValue(**{field_name: proto_type(val=value)})) + elif value is not None: + val_list.append(ProtoValue(**{field_name: proto_type(val=value)})) # type: ignore + else: + val_list.append(ProtoValue()) + return val_list # Handle scalar types below else: @@ -353,9 +357,8 @@ def _python_value_to_proto_value( ) = PYTHON_SCALAR_VALUE_TYPE_TO_PROTO_VALUE[feast_value_type] if valid_scalar_types: assert type(sample) in valid_scalar_types - return [ - ProtoValue(**{field_name: func(value)}) + ProtoValue(**{field_name: func(bool(value) if type(value) is np.bool_ else value)}) if not pd.isnull(value) else ProtoValue() for value in values diff --git a/sdk/python/tests/integration/offline_store/test_universal_historical_retrieval.py b/sdk/python/tests/integration/offline_store/test_universal_historical_retrieval.py index ab9b9515f3..c51b52dd1d 100644 --- a/sdk/python/tests/integration/offline_store/test_universal_historical_retrieval.py +++ b/sdk/python/tests/integration/offline_store/test_universal_historical_retrieval.py @@ -346,12 +346,10 @@ def test_historical_features(environment, universal_data_sources, full_feature_n event_timestamp, full_feature_names, ) - # Only need the shadow entities features in the FeatureService test expected_df = full_expected_df.drop( columns=["origin__temperature", "destination__temperature"], ) - job_from_df = store.get_historical_features( entity_df=entity_df_with_request_data, features=[ @@ -371,7 +369,6 @@ def test_historical_features(environment, universal_data_sources, full_feature_n ], full_feature_names=full_feature_names, ) - start_time = datetime.utcnow() actual_df_from_df_entities = job_from_df.to_df() diff --git a/sdk/python/tests/integration/registration/test_cli.py b/sdk/python/tests/integration/registration/test_cli.py index 25f0ae4841..30d924dbb4 100644 --- a/sdk/python/tests/integration/registration/test_cli.py +++ b/sdk/python/tests/integration/registration/test_cli.py @@ -36,6 +36,7 @@ def test_universal_cli(environment: Environment): project = f"test_universal_cli_{str(uuid.uuid4()).replace('-', '')[:8]}" runner = CliRunner() + os.system(f"EXPORT PYTHONPATH=$PYTHONPATH:/$(pwd)") with tempfile.TemporaryDirectory() as repo_dir_name: try: diff --git a/sdk/python/tests/integration/registration/test_universal_types.py b/sdk/python/tests/integration/registration/test_universal_types.py index 74f66cf3b6..4199b4a928 100644 --- a/sdk/python/tests/integration/registration/test_universal_types.py +++ b/sdk/python/tests/integration/registration/test_universal_types.py @@ -169,7 +169,6 @@ def test_feature_get_historical_features_types_match(offline_types_test_fixtures ) entity = driver() fs.apply([fv, entity]) - print("Applying") features = [f"{fv.name}:value"] entity_df = pd.DataFrame() entity_df["driver_id"] = ( @@ -181,15 +180,11 @@ def test_feature_get_historical_features_types_match(offline_types_test_fixtures ts - timedelta(hours=4), ts - timedelta(hours=2), ] - print("Retrieving historical features") historical_features = fs.get_historical_features( entity_df=entity_df, features=features, ) - print("retrieved historical features") # Note: Pandas doesn't play well with nan values in ints. 
BQ will also coerce to floats if there are NaNs historical_features_df = historical_features.to_df() - print(historical_features_df) - if config.feature_is_list: assert_feature_list_types( environment.test_repo_config.provider, From 55e791509e76657e1944232ff2bda27631331b78 Mon Sep 17 00:00:00 2001 From: Kevin Zhang Date: Fri, 25 Feb 2022 16:29:45 -0800 Subject: [PATCH 03/58] Fix random incorrect changes Signed-off-by: Kevin Zhang --- sdk/python/feast/infra/offline_stores/bigquery.py | 2 +- sdk/python/feast/infra/passthrough_provider.py | 4 +++- sdk/python/feast/infra/provider.py | 2 ++ sdk/python/feast/repo_config.py | 5 +---- .../offline_store/test_universal_historical_retrieval.py | 3 +++ .../tests/integration/online_store/test_universal_online.py | 4 ++-- sdk/python/tests/integration/registration/test_cli.py | 1 - .../tests/integration/registration/test_universal_types.py | 5 ++++- 8 files changed, 16 insertions(+), 10 deletions(-) diff --git a/sdk/python/feast/infra/offline_stores/bigquery.py b/sdk/python/feast/infra/offline_stores/bigquery.py index df82ac65f6..44e62d6ad1 100644 --- a/sdk/python/feast/infra/offline_stores/bigquery.py +++ b/sdk/python/feast/infra/offline_stores/bigquery.py @@ -609,7 +609,7 @@ def _get_bigquery_client(project: Optional[str] = None, location: Optional[str] {{ featureview.name }}__base AS ( SELECT - subquery.*, + subquery.*, entity_dataframe.entity_timestamp, entity_dataframe.{{featureview.name}}__entity_row_unique_id FROM {{ featureview.name }}__subquery AS subquery diff --git a/sdk/python/feast/infra/passthrough_provider.py b/sdk/python/feast/infra/passthrough_provider.py index 1fdb37ded3..3468b9dc92 100644 --- a/sdk/python/feast/infra/passthrough_provider.py +++ b/sdk/python/feast/infra/passthrough_provider.py @@ -1,5 +1,4 @@ from datetime import datetime, timedelta -from multiprocessing.dummy import JoinableQueue from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Union import pandas @@ -153,9 +152,12 @@ def materialize_single_feature_view( ) table = offline_job.to_arrow() + if feature_view.batch_source.field_mapping is not None: table = _run_field_mapping(table, feature_view.batch_source.field_mapping) + join_keys = {entity.join_key: entity.value_type for entity in entities} + with tqdm_builder(table.num_rows) as pbar: for batch in table.to_batches(DEFAULT_BATCH_SIZE): rows_to_write = _convert_arrow_to_proto(batch, feature_view, join_keys) diff --git a/sdk/python/feast/infra/provider.py b/sdk/python/feast/infra/provider.py index c4265bbe86..b3f1029242 100644 --- a/sdk/python/feast/infra/provider.py +++ b/sdk/python/feast/infra/provider.py @@ -351,12 +351,14 @@ def _convert_arrow_to_proto( columns = [(f.name, f.dtype) for f in feature_view.features] + list( join_keys.items() ) + proto_values_by_column = { column: python_values_to_proto_values( table.column(column).to_numpy(zero_copy_only=False), value_type ) for column, value_type in columns } + entity_keys = [ EntityKeyProto( join_keys=join_keys, diff --git a/sdk/python/feast/repo_config.py b/sdk/python/feast/repo_config.py index 18f4e9c290..3f32d18b80 100644 --- a/sdk/python/feast/repo_config.py +++ b/sdk/python/feast/repo_config.py @@ -1,4 +1,3 @@ -from curses import raw import os from pathlib import Path from typing import Any @@ -346,11 +345,9 @@ def get_feature_server_config_from_type(feature_server_type: str): def load_repo_config(repo_path: Path) -> RepoConfig: config_path = repo_path / "feature_store.yaml" - print("INNNNNN") - print(repo_path) + with open(config_path) as 
f: raw_config = yaml.safe_load(os.path.expandvars(f.read())) - print(raw_config) try: c = RepoConfig(**raw_config) c.repo_path = repo_path diff --git a/sdk/python/tests/integration/offline_store/test_universal_historical_retrieval.py b/sdk/python/tests/integration/offline_store/test_universal_historical_retrieval.py index c51b52dd1d..ab9b9515f3 100644 --- a/sdk/python/tests/integration/offline_store/test_universal_historical_retrieval.py +++ b/sdk/python/tests/integration/offline_store/test_universal_historical_retrieval.py @@ -346,10 +346,12 @@ def test_historical_features(environment, universal_data_sources, full_feature_n event_timestamp, full_feature_names, ) + # Only need the shadow entities features in the FeatureService test expected_df = full_expected_df.drop( columns=["origin__temperature", "destination__temperature"], ) + job_from_df = store.get_historical_features( entity_df=entity_df_with_request_data, features=[ @@ -369,6 +371,7 @@ def test_historical_features(environment, universal_data_sources, full_feature_n ], full_feature_names=full_feature_names, ) + start_time = datetime.utcnow() actual_df_from_df_entities = job_from_df.to_df() diff --git a/sdk/python/tests/integration/online_store/test_universal_online.py b/sdk/python/tests/integration/online_store/test_universal_online.py index 95c7f8d0a1..90a06bb347 100644 --- a/sdk/python/tests/integration/online_store/test_universal_online.py +++ b/sdk/python/tests/integration/online_store/test_universal_online.py @@ -564,8 +564,8 @@ def test_online_retrieval(environment, universal_data_sources, full_feature_name ) -# @pytest.mark.integration -# @pytest.mark.universal +@pytest.mark.integration +@pytest.mark.universal def test_online_store_cleanup(environment, universal_data_sources): """ Some online store implementations (like Redis) keep features from different features views diff --git a/sdk/python/tests/integration/registration/test_cli.py b/sdk/python/tests/integration/registration/test_cli.py index 30d924dbb4..25f0ae4841 100644 --- a/sdk/python/tests/integration/registration/test_cli.py +++ b/sdk/python/tests/integration/registration/test_cli.py @@ -36,7 +36,6 @@ def test_universal_cli(environment: Environment): project = f"test_universal_cli_{str(uuid.uuid4()).replace('-', '')[:8]}" runner = CliRunner() - os.system(f"EXPORT PYTHONPATH=$PYTHONPATH:/$(pwd)") with tempfile.TemporaryDirectory() as repo_dir_name: try: diff --git a/sdk/python/tests/integration/registration/test_universal_types.py b/sdk/python/tests/integration/registration/test_universal_types.py index 4199b4a928..59ca119f98 100644 --- a/sdk/python/tests/integration/registration/test_universal_types.py +++ b/sdk/python/tests/integration/registration/test_universal_types.py @@ -155,7 +155,7 @@ def test_entity_inference_types_match(offline_types_test_fixtures): ) -#@pytest.mark.integration +@pytest.mark.integration @pytest.mark.universal def test_feature_get_historical_features_types_match(offline_types_test_fixtures): environment, config, data_source, fv = offline_types_test_fixtures @@ -169,6 +169,7 @@ def test_feature_get_historical_features_types_match(offline_types_test_fixtures ) entity = driver() fs.apply([fv, entity]) + features = [f"{fv.name}:value"] entity_df = pd.DataFrame() entity_df["driver_id"] = ( @@ -185,6 +186,8 @@ def test_feature_get_historical_features_types_match(offline_types_test_fixtures ) # Note: Pandas doesn't play well with nan values in ints. 
BQ will also coerce to floats if there are NaNs historical_features_df = historical_features.to_df() + print(historical_features_df) + if config.feature_is_list: assert_feature_list_types( environment.test_repo_config.provider, From 43794f7ebe9d55e0b9bfd2fa33fa06a081f1b8cc Mon Sep 17 00:00:00 2001 From: Kevin Zhang Date: Fri, 25 Feb 2022 16:34:01 -0800 Subject: [PATCH 04/58] Fix lint Signed-off-by: Kevin Zhang --- sdk/python/feast/type_map.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/sdk/python/feast/type_map.py b/sdk/python/feast/type_map.py index 0cb2f413b6..39ea3e9923 100644 --- a/sdk/python/feast/type_map.py +++ b/sdk/python/feast/type_map.py @@ -329,7 +329,7 @@ def _python_value_to_proto_value( # TODO: Make this better. val_list = [] for value in values: - if(value.dtype == "bool"): + if value.dtype == "bool": value = [bool(e) for e in value] val_list.append(ProtoValue(**{field_name: proto_type(val=value)})) elif value is not None: @@ -358,7 +358,13 @@ def _python_value_to_proto_value( if valid_scalar_types: assert type(sample) in valid_scalar_types return [ - ProtoValue(**{field_name: func(bool(value) if type(value) is np.bool_ else value)}) + ProtoValue( + **{ + field_name: func( + bool(value) if type(value) is np.bool_ else value + ) + } + ) if not pd.isnull(value) else ProtoValue() for value in values From 343ed00444b9b8b8d9e8a6dcd943fe7b3afe2fdd Mon Sep 17 00:00:00 2001 From: Kevin Zhang Date: Fri, 25 Feb 2022 16:37:21 -0800 Subject: [PATCH 05/58] Fix build errors Signed-off-by: Kevin Zhang --- sdk/python/feast/type_map.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sdk/python/feast/type_map.py b/sdk/python/feast/type_map.py index 39ea3e9923..328136647e 100644 --- a/sdk/python/feast/type_map.py +++ b/sdk/python/feast/type_map.py @@ -331,11 +331,11 @@ def _python_value_to_proto_value( for value in values: if value.dtype == "bool": value = [bool(e) for e in value] - val_list.append(ProtoValue(**{field_name: proto_type(val=value)})) + val_list.append(ProtoValue(**{field_name: proto_type(val=value)})) # type: ignore elif value is not None: val_list.append(ProtoValue(**{field_name: proto_type(val=value)})) # type: ignore else: - val_list.append(ProtoValue()) + val_list.append(ProtoValue()) # type: ignore return val_list # Handle scalar types below @@ -361,7 +361,7 @@ def _python_value_to_proto_value( ProtoValue( **{ field_name: func( - bool(value) if type(value) is np.bool_ else value + bool(value) if type(value) is np.bool_ else value # type: ignore ) } ) From 55c458d0c39af1f16289de077f1952308ea90f39 Mon Sep 17 00:00:00 2001 From: Kevin Zhang Date: Fri, 25 Feb 2022 16:37:56 -0800 Subject: [PATCH 06/58] Fix lint Signed-off-by: Kevin Zhang --- sdk/python/feast/type_map.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sdk/python/feast/type_map.py b/sdk/python/feast/type_map.py index 328136647e..f808715681 100644 --- a/sdk/python/feast/type_map.py +++ b/sdk/python/feast/type_map.py @@ -335,7 +335,7 @@ def _python_value_to_proto_value( elif value is not None: val_list.append(ProtoValue(**{field_name: proto_type(val=value)})) # type: ignore else: - val_list.append(ProtoValue()) # type: ignore + val_list.append(ProtoValue()) # type: ignore return val_list # Handle scalar types below @@ -361,7 +361,7 @@ def _python_value_to_proto_value( ProtoValue( **{ field_name: func( - bool(value) if type(value) is np.bool_ else value # type: ignore + bool(value) if type(value) is np.bool_ else value # type: ignore ) } 
) From b8ace4367bc10dd9f0783dc97b024e24781ba4e2 Mon Sep 17 00:00:00 2001 From: Kevin Zhang Date: Mon, 28 Feb 2022 17:00:55 -0800 Subject: [PATCH 07/58] Add spark offline store components to test against current integration tests Signed-off-by: Kevin Zhang --- .../infra/offline_stores/third_party/spark.py | 564 ++++++++++++++++++ .../third_party/spark_source.py | 231 +++++++ .../third_party/spark_type_map.py | 53 ++ .../universal/data_sources/spark.py | 84 +++ 4 files changed, 932 insertions(+) create mode 100644 sdk/python/feast/infra/offline_stores/third_party/spark.py create mode 100644 sdk/python/feast/infra/offline_stores/third_party/spark_source.py create mode 100644 sdk/python/feast/infra/offline_stores/third_party/spark_type_map.py create mode 100644 sdk/python/tests/integration/feature_repos/universal/data_sources/spark.py diff --git a/sdk/python/feast/infra/offline_stores/third_party/spark.py b/sdk/python/feast/infra/offline_stores/third_party/spark.py new file mode 100644 index 0000000000..4d73f7d254 --- /dev/null +++ b/sdk/python/feast/infra/offline_stores/third_party/spark.py @@ -0,0 +1,564 @@ +import inspect +from typing import List, Union, Optional, Dict, Tuple +from ntpath import join +from pydantic import StrictStr +from datetime import datetime + +import pandas +from dateutil import parser +import pyspark +import pyarrow +import numpy as np +import pandas as pd +from pydantic import StrictStr +from pyspark import SparkConf +from pyspark.sql import SparkSession +from pytz import utc + +from feast import FeatureView, OnDemandFeatureView +from feast.data_source import DataSource +from feast.repo_config import FeastConfigBaseModel, RepoConfig +from feast.saved_dataset import SavedDatasetStorage + +import pandas as pd + +from feast.repo_config import FeastConfigBaseModel + +from feast.feature_view import DUMMY_ENTITY_ID, DUMMY_ENTITY_VAL, FeatureView +from feast.infra.offline_stores.offline_store import OfflineStore, RetrievalJob, RetrievalMetadata +from feast.infra.offline_stores import offline_utils + + + +from feast.errors import InvalidEntityType +from feast.infra.offline_stores import offline_utils +from feast.infra.offline_stores.offline_store import OfflineStore, RetrievalJob +from feast.infra.offline_stores.offline_utils import FeatureViewQueryContext +from feast.registry import Registry +from feast.repo_config import FeastConfigBaseModel, RepoConfig + +from pyspark.sql import SparkSession +from pyspark import SparkConf +from feast_spark_offline_store.spark_source import SparkSource, SavedDatasetSparkStorage +from feast_spark_offline_store.spark_type_map import spark_schema_to_np_dtypes + + +class SparkOfflineStoreConfig(FeastConfigBaseModel): + type: StrictStr = "spark" + """ Offline store type selector""" + + spark_conf: Optional[Dict[str, str]] = None + """ Configuration overlay for the spark session """ + # to ensure sparksession is the correct config, if not created yet + # sparksession is not serializable and we dont want to pass it around as an argument + + +class SparkOfflineStore(OfflineStore): + @staticmethod + def pull_latest_from_table_or_query( + config: RepoConfig, + data_source: DataSource, + join_key_columns: List[str], + feature_name_columns: List[str], + event_timestamp_column: str, + created_timestamp_column: Optional[str], + start_date: datetime, + end_date: datetime, + ) -> RetrievalJob: + spark_session = get_spark_session_or_start_new_with_repoconfig( + config.offline_store + ) + assert isinstance(config.offline_store, SparkOfflineStoreConfig) + 
assert isinstance(data_source, SparkSource) + + print("Pulling latest features from spark offline store") + + from_expression = data_source.get_table_query_string() + + partition_by_join_key_string = ", ".join(join_key_columns) + if partition_by_join_key_string != "": + partition_by_join_key_string = ( + "PARTITION BY " + partition_by_join_key_string + ) + timestamps = [event_timestamp_column] + if created_timestamp_column: + timestamps.append(created_timestamp_column) + timestamp_desc_string = " DESC, ".join(timestamps) + " DESC" + field_string = ", ".join( + join_key_columns + feature_name_columns + timestamps) + + start_date = _format_datetime(start_date) + end_date = _format_datetime(end_date) + query = f""" + SELECT + {field_string} + {f", {repr(DUMMY_ENTITY_VAL)} AS {DUMMY_ENTITY_ID}" if not join_key_columns else ""} + FROM ( + SELECT {field_string}, + ROW_NUMBER() OVER({partition_by_join_key_string} ORDER BY {timestamp_desc_string}) AS feast_row_ + FROM {from_expression} t1 + WHERE {event_timestamp_column} BETWEEN TIMESTAMP('{start_date}') AND TIMESTAMP('{end_date}') + ) t2 + WHERE feast_row_ = 1 + """ + + return SparkRetrievalJob( + spark_session=spark_session, + query=query, + full_feature_names=False, + on_demand_feature_views=None, + ) + + @staticmethod + def get_historical_features( + config: RepoConfig, + feature_views: List[FeatureView], + feature_refs: List[str], + entity_df: Union[pandas.DataFrame, str], + registry: Registry, + project: str, + full_feature_names: bool = False, + ) -> RetrievalJob: + assert isinstance(config.offline_store, SparkOfflineStoreConfig) + spark_session = get_spark_session_or_start_new_with_repoconfig( + store_config=config.offline_store + ) + tmp_entity_df_table_name = offline_utils.get_temp_entity_table_name() + + entity_schema = _upload_entity_df_and_get_entity_schema( + spark_session=spark_session, + table_name=tmp_entity_df_table_name, + entity_df=entity_df, + ) + event_timestamp_col = offline_utils.infer_event_timestamp_from_entity_df( + entity_schema=entity_schema, + ) + entity_df_event_timestamp_range = _get_entity_df_event_timestamp_range( + entity_df, event_timestamp_col, spark_session, + ) + + expected_join_keys = offline_utils.get_expected_join_keys( + project=project, feature_views=feature_views, registry=registry + ) + offline_utils.assert_expected_columns_in_entity_df( + entity_schema=entity_schema, + join_keys=expected_join_keys, + entity_df_event_timestamp_col=event_timestamp_col, + ) + + query_context = offline_utils.get_feature_view_query_context( + feature_refs, + feature_views, + registry, + project, + entity_df_event_timestamp_range, + ) + + query = offline_utils.build_point_in_time_query( + feature_view_query_contexts=query_context, + left_table_query_string=tmp_entity_df_table_name, + entity_df_event_timestamp_col=event_timestamp_col, + entity_df_columns=entity_schema.keys(), + query_template=MULTIPLE_FEATURE_VIEW_POINT_IN_TIME_JOIN, + full_feature_names=full_feature_names, + ) + on_demand_feature_views = OnDemandFeatureView.get_requested_odfvs( + feature_refs=feature_refs, project=project, registry=registry + ) + + return SparkRetrievalJob( + spark_session=spark_session, + query=query, + full_feature_names=full_feature_names, + on_demand_feature_views=OnDemandFeatureView.get_requested_odfvs( + feature_refs, project, registry + ), + metadata=RetrievalMetadata( + features=feature_refs, + keys=list(set(entity_schema.keys()) - {event_timestamp_col}), + min_event_timestamp=entity_df_event_timestamp_range[0], + 
max_event_timestamp=entity_df_event_timestamp_range[1], + ), + ) + + @staticmethod + def pull_all_from_table_or_query( + config: RepoConfig, + data_source: DataSource, + join_key_columns: List[str], + feature_name_columns: List[str], + event_timestamp_column: str, + start_date: datetime, + end_date: datetime, + ) -> RetrievalJob: + """ + Note that join_key_columns, feature_name_columns, event_timestamp_column, and created_timestamp_column + have all already been mapped to column names of the source table and those column names are the values passed + into this function. + """ + return SparkOfflineStore.pull_latest_from_table_or_query( + config=config, + data_source=data_source, + join_key_columns=join_key_columns + + [event_timestamp_column], # avoid deduplication + feature_name_columns=feature_name_columns, + event_timestamp_column=event_timestamp_column, + created_timestamp_column=None, + start_date=start_date, + end_date=end_date, + ) + +# TODO fix internal abstract methods _to_df_internal _to_arrow_internal +class SparkRetrievalJob(RetrievalJob): + def __init__( + self, + spark_session: SparkSession, + query: str, + full_feature_names: bool, + on_demand_feature_views: Optional[List[OnDemandFeatureView]], + metadata: Optional[RetrievalMetadata] = None, + ): + super().__init__() + self.spark_session = spark_session + self.query = query + self._full_feature_names = full_feature_names + self._on_demand_feature_views = on_demand_feature_views + self._metadata = metadata + + @property + def full_feature_names(self) -> bool: + return self._full_feature_names + + @property + def on_demand_feature_views(self) -> Optional[List[OnDemandFeatureView]]: + return self._on_demand_feature_views + + def to_spark_df(self) -> pyspark.sql.DataFrame: + statements = self.query.split( + "---EOS---" + ) # TODO can do better than this dirty split + *_, last = map(self.spark_session.sql, statements) + return last + + def to_df(self) -> pandas.DataFrame: + return self.to_spark_df().toPandas() # noqa, DataFrameLike instead of DataFrame + + def _to_df_internal(self) -> pd.DataFrame: + """Return dataset as Pandas DataFrame synchronously""" + return self.to_df() + + def _to_arrow_internal(self) -> pyarrow.Table: + """Return dataset as pyarrow Table synchronously""" + return self.to_arrow() + + def to_arrow(self) -> pyarrow.Table: + df = self.to_df() + return pyarrow.Table.from_pandas(df) # noqa + + def persist(self, storage: SavedDatasetStorage): + """ + Run the retrieval and persist the results in the same offline store used for read. + """ + pass + + @property + def metadata(self) -> Optional[RetrievalMetadata]: + """ + Return metadata information about retrieval. + Should be available even before materializing the dataset itself. + """ + return self._metadata + + +def get_spark_session_or_start_new_with_repoconfig( + store_config: SparkOfflineStoreConfig, +) -> SparkSession: + spark_session = SparkSession.getActiveSession() + if not spark_session: + spark_builder = SparkSession.builder + spark_conf = store_config.spark_conf + if spark_conf: + spark_builder = spark_builder.config( + conf=SparkConf().setAll(spark_conf.items()) + ) # noqa + + spark_session = spark_builder.getOrCreate() + spark_session.conf.set( + "spark.sql.parser.quotedRegexColumnNames", "true" + ) # important! 
+ return spark_session + + +def _get_entity_df_event_timestamp_range( + entity_df: Union[pd.DataFrame, str], + entity_df_event_timestamp_col: str, + spark_session: SparkSession, +) -> Tuple[datetime, datetime]: + if isinstance(entity_df, pd.DataFrame): + entity_df_event_timestamp = entity_df.loc[ + :, entity_df_event_timestamp_col + ].infer_objects() + if pd.api.types.is_string_dtype(entity_df_event_timestamp): + entity_df_event_timestamp = pd.to_datetime( + entity_df_event_timestamp, utc=True + ) + entity_df_event_timestamp_range = ( + entity_df_event_timestamp.min().to_pydatetime(), + entity_df_event_timestamp.max().to_pydatetime(), + ) + elif isinstance(entity_df, str): + # If the entity_df is a string (SQL query), determine range + # from table + df = spark_session.sql(entity_df).select(entity_df_event_timestamp_col) + entity_df_event_timestamp_range = ( + df.agg({entity_df_event_timestamp_col: "max"}).collect()[0][0], + df.agg({entity_df_event_timestamp_col: "min"}).collect()[0][0] + ) + else: + raise InvalidEntityType(type(entity_df)) + + return entity_df_event_timestamp_range + +def _upload_entity_df_and_get_entity_schema( + spark_session: SparkSession, + table_name: str, + entity_df: Union[pandas.DataFrame, str], +) -> Dict[str, np.dtype]: + if isinstance(entity_df, pd.DataFrame): + spark_session.createDataFrame(entity_df).createOrReplaceTempView(table_name) + return dict(zip(entity_df.columns, entity_df.dtypes)) + elif isinstance(entity_df, str): + spark_session.sql(entity_df).createOrReplaceTempView(table_name) + limited_entity_df = spark_session.table(table_name) + return dict( + zip( + limited_entity_df.columns, + spark_schema_to_np_dtypes(limited_entity_df.dtypes), + ) + ) + else: + raise InvalidEntityType(type(entity_df)) + +def _format_datetime(t: datetime): + # Since Hive does not support timezone, need to transform to utc. + if t.tzinfo: + t = t.astimezone(tz=utc) + t = t.strftime("%Y-%m-%d %H:%M:%S.%f") + return t + + + return spark_session + + +def _format_datetime(t: datetime): + # Since Hive does not support timezone, need to transform to utc. 
+ if t.tzinfo: + t = t.astimezone(tz=utc) + t = t.strftime("%Y-%m-%d %H:%M:%S.%f") + return t + + +def _get_feature_view_query_context( + entity_df: Union[pd.DataFrame, str], + entity_df_event_timestamp_col: str, + feature_refs: List[str], + feature_views: List[FeatureView], + spark_session: SparkSession, + table_name: str, + registry: Registry, + project: str, +) -> List[FeatureViewQueryContext]: + # interface of offline_utils.get_feature_view_query_context changed in feast==0.17 + arg_spec = inspect.getfullargspec(func=offline_utils.get_feature_view_query_context) + if "entity_df_timestamp_range" in arg_spec.args: + # for feast>=0.17 + entity_df_timestamp_range = _get_entity_df_event_timestamp_range( + entity_df=entity_df, + entity_df_event_timestamp_col=entity_df_event_timestamp_col, + spark_session=spark_session, + table_name=table_name, + ) + query_context = offline_utils.get_feature_view_query_context( + feature_refs=feature_refs, + feature_views=feature_views, + registry=registry, + project=project, + entity_df_timestamp_range=entity_df_timestamp_range, + ) + else: + # for feast<0.17 + query_context = offline_utils.get_feature_view_query_context( + feature_refs=feature_refs, + feature_views=feature_views, + registry=registry, + project=project, + ) + return query_context + + +MULTIPLE_FEATURE_VIEW_POINT_IN_TIME_JOIN = """/* + Compute a deterministic hash for the `left_table_query_string` that will be used throughout + all the logic as the field to GROUP BY the data +*/ +CREATE OR REPLACE TEMPORARY VIEW entity_dataframe AS ( + SELECT *, + {{entity_df_event_timestamp_col}} AS entity_timestamp + {% for featureview in featureviews %} + ,CONCAT( + {% for entity in featureview.entities %} + CAST({{entity}} AS STRING), + {% endfor %} + CAST({{entity_df_event_timestamp_col}} AS STRING) + ) AS {{featureview.name}}__entity_row_unique_id + {% endfor %} + FROM {{ left_table_query_string }} +); +---EOS--- +-- Start create temporary table *__base +{% for featureview in featureviews %} +CREATE OR REPLACE TEMPORARY VIEW {{ featureview.name }}__base AS +WITH {{ featureview.name }}__entity_dataframe AS ( + SELECT + {{ featureview.entities | join(', ')}}, + entity_timestamp, + {{featureview.name}}__entity_row_unique_id + FROM entity_dataframe + GROUP BY {{ featureview.entities | join(', ')}}, entity_timestamp, {{featureview.name}}__entity_row_unique_id +), +/* + This query template performs the point-in-time correctness join for a single feature set table + to the provided entity table. + 1. We first join the current feature_view to the entity dataframe that has been passed. + This JOIN has the following logic: + - For each row of the entity dataframe, only keep the rows where the `event_timestamp_column` + is less than the one provided in the entity dataframe + - If there a TTL for the current feature_view, also keep the rows where the `event_timestamp_column` + is higher the the one provided minus the TTL + - For each row, Join on the entity key and retrieve the `entity_row_unique_id` that has been + computed previously + The output of this CTE will contain all the necessary information and already filtered out most + of the data that is not relevant. 
+*/ +{{ featureview.name }}__subquery AS ( + SELECT + {{ featureview.event_timestamp_column }} as event_timestamp, + {{ featureview.created_timestamp_column ~ ' as created_timestamp,' if featureview.created_timestamp_column else '' }} + {{ featureview.entity_selections | join(', ')}}, + {% for feature in featureview.features %} + {{ feature }} as {% if full_feature_names %}{{ featureview.name }}__{{feature}}{% else %}{{ feature }}{% endif %}{% if loop.last %}{% else %}, {% endif %} + {% endfor %} + FROM {{ featureview.table_subquery }} AS subquery + INNER JOIN ( + SELECT MAX(entity_timestamp) as max_entity_timestamp_ + {% if featureview.ttl == 0 %}{% else %} + ,(MIN(entity_timestamp) - interval '{{ featureview.ttl }}' second) as min_entity_timestamp_ + {% endif %} + FROM entity_dataframe + ) AS temp + ON ( + {{ featureview.event_timestamp_column }} <= max_entity_timestamp_ + {% if featureview.ttl == 0 %}{% else %} + AND {{ featureview.event_timestamp_column }} >= min_entity_timestamp_ + {% endif %} + ) +) +SELECT + subquery.*, + entity_dataframe.entity_timestamp, + entity_dataframe.{{featureview.name}}__entity_row_unique_id +FROM {{ featureview.name }}__subquery AS subquery +INNER JOIN ( + SELECT * + {% if featureview.ttl == 0 %}{% else %} + , (entity_timestamp - interval '{{ featureview.ttl }}' second) as ttl_entity_timestamp + {% endif %} + FROM {{ featureview.name }}__entity_dataframe +) AS entity_dataframe +ON ( + subquery.event_timestamp <= entity_dataframe.entity_timestamp + {% if featureview.ttl == 0 %}{% else %} + AND subquery.event_timestamp >= entity_dataframe.ttl_entity_timestamp + {% endif %} + {% for entity in featureview.entities %} + AND subquery.{{ entity }} = entity_dataframe.{{ entity }} + {% endfor %} +); +---EOS--- +{% endfor %} +-- End create temporary table *__base +{% for featureview in featureviews %} +{% if loop.first %}WITH{% endif %} +/* + 2. If the `created_timestamp_column` has been set, we need to + deduplicate the data first. This is done by calculating the + `MAX(created_at_timestamp)` for each event_timestamp. + We then join the data on the next CTE +*/ +{% if featureview.created_timestamp_column %} +{{ featureview.name }}__dedup AS ( + SELECT + {{featureview.name}}__entity_row_unique_id, + event_timestamp, + MAX(created_timestamp) as created_timestamp + FROM {{ featureview.name }}__base + GROUP BY {{featureview.name}}__entity_row_unique_id, event_timestamp +), +{% endif %} +/* + 3. The data has been filtered during the first CTE "*__base" + Thus we only need to compute the latest timestamp of each feature. +*/ +{{ featureview.name }}__latest AS ( + SELECT + base.{{featureview.name}}__entity_row_unique_id, + MAX(base.event_timestamp) AS event_timestamp + {% if featureview.created_timestamp_column %} + ,MAX(base.created_timestamp) AS created_timestamp + {% endif %} + FROM {{ featureview.name }}__base AS base + {% if featureview.created_timestamp_column %} + INNER JOIN {{ featureview.name }}__dedup AS dedup + ON ( + dedup.{{featureview.name}}__entity_row_unique_id=base.{{featureview.name}}__entity_row_unique_id + AND dedup.event_timestamp=base.event_timestamp + AND dedup.created_timestamp=base.created_timestamp + ) + {% endif %} + GROUP BY base.{{featureview.name}}__entity_row_unique_id +), +/* + 4. 
Once we know the latest value of each feature for a given timestamp, + we can join again the data back to the original "base" dataset +*/ +{{ featureview.name }}__cleaned AS ( + SELECT base.* + FROM {{ featureview.name }}__base AS base + INNER JOIN {{ featureview.name }}__latest AS latest + ON ( + base.{{featureview.name}}__entity_row_unique_id=latest.{{featureview.name}}__entity_row_unique_id + AND base.event_timestamp=latest.event_timestamp + {% if featureview.created_timestamp_column %} + AND base.created_timestamp=latest.created_timestamp + {% endif %} + ) +){% if loop.last %}{% else %}, {% endif %} +{% endfor %} +/* + Joins the outputs of multiple time travel joins to a single table. + The entity_dataframe dataset being our source of truth here. + */ +SELECT `(entity_timestamp|{% for featureview in featureviews %}{{featureview.name}}__entity_row_unique_id{% if loop.last %}{% else %}|{% endif %}{% endfor %})?+.+` +FROM entity_dataframe +{% for featureview in featureviews %} +LEFT JOIN ( + SELECT + {{featureview.name}}__entity_row_unique_id + {% for feature in featureview.features %} + ,{% if full_feature_names %}{{ featureview.name }}__{{feature}}{% else %}{{ feature }}{% endif %} + {% endfor %} + FROM {{ featureview.name }}__cleaned +) AS {{ featureview.name }}__joined +ON ( + {{ featureview.name }}__joined.{{featureview.name}}__entity_row_unique_id=entity_dataframe.{{featureview.name}}__entity_row_unique_id +) +{% endfor %}""" diff --git a/sdk/python/feast/infra/offline_stores/third_party/spark_source.py b/sdk/python/feast/infra/offline_stores/third_party/spark_source.py new file mode 100644 index 0000000000..13592dd4ca --- /dev/null +++ b/sdk/python/feast/infra/offline_stores/third_party/spark_source.py @@ -0,0 +1,231 @@ +from typing import Optional, Dict, Callable, Any, Tuple, Iterable +from feast.data_source import DataSource +from feast.repo_config import RepoConfig +from pyspark.sql.utils import AnalysisException +from feast.value_type import ValueType +from feast.protos.feast.core.DataSource_pb2 import DataSource as DataSourceProto +from feast.protos.feast.core.SavedDataset_pb2 import ( + SavedDatasetStorage as SavedDatasetStorageProto, +) +from feast.saved_dataset import SavedDatasetStorage +from feast_spark_offline_store.spark_type_map import spark_to_feast_value_type +import pickle +from feast.errors import DataSourceNotFoundException + + +class SparkSource(DataSource): + def __init__( + self, + table: Optional[str] = None, + query: Optional[str] = None, + # TODO support file readers + # path: Optional[str] = None, + # jdbc=None, + # format: Optional[str] = None, + # options: Optional[Dict[str, Any]] = None, + event_timestamp_column: Optional[str] = None, + created_timestamp_column: Optional[str] = None, + field_mapping: Optional[Dict[str, str]] = None, + date_partition_column: Optional[str] = None, + ): + super().__init__( + event_timestamp_column, + created_timestamp_column, + field_mapping, + date_partition_column, + ) + + self._spark_options = SparkOptions( + table=table, + query=query, + # path=path, + # jdbc=None, + # format=format, + # options=options, + ) + + @property + def spark_options(self): + """ + Returns the spark options of this data source + """ + return self._spark_options + + @spark_options.setter + def spark_options(self, spark_options): + """ + Sets the spark options of this data source + """ + self._spark_options = spark_options + + @property + def table(self): + """ + Returns the table of this feature data source + """ + return 
self._spark_options.table + + @property + def query(self): + """ + Returns the query of this feature data source + """ + return self._spark_options.query + + @staticmethod + def from_proto(data_source: DataSourceProto) -> Any: + + assert data_source.HasField("custom_options") + + spark_options = SparkOptions.from_proto(data_source.custom_options) + + return SparkSource( + field_mapping=dict(data_source.field_mapping), + table=spark_options.table, + query=spark_options.query, + # path=spark_options.path, + # jdbc=None, + # format=spark_options.format, + # options=spark_options.options, + event_timestamp_column=data_source.event_timestamp_column, + created_timestamp_column=data_source.created_timestamp_column, + date_partition_column=data_source.date_partition_column, + ) + + def to_proto(self) -> DataSourceProto: + data_source_proto = DataSourceProto( + type=DataSourceProto.CUSTOM_SOURCE, + field_mapping=self.field_mapping, + custom_options=self.spark_options.to_proto(), + ) + + data_source_proto.event_timestamp_column = self.event_timestamp_column + data_source_proto.created_timestamp_column = self.created_timestamp_column + data_source_proto.date_partition_column = self.date_partition_column + + return data_source_proto + + def validate(self, config: RepoConfig): + self.get_table_column_names_and_types(config) + + @staticmethod + def source_datatype_to_feast_value_type() -> Callable[[str], ValueType]: + # TODO see feast.type_map for examples + return spark_to_feast_value_type + + def get_table_column_names_and_types( + self, config: RepoConfig + ) -> Iterable[Tuple[str, str]]: + from feast_spark_offline_store.spark import ( + get_spark_session_or_start_new_with_repoconfig, + ) + + spark_session = get_spark_session_or_start_new_with_repoconfig( + config.offline_store + ) + try: + return ( + (fields["name"], fields["type"]) + for fields in spark_session.table(self.table).schema.jsonValue()[ + "fields" + ] + ) + except AnalysisException: + raise DataSourceNotFoundException(self.table) + + def get_table_query_string(self) -> str: + """Returns a string that can directly be used to reference this table in SQL""" + if self.table: + return f"`{self.table}`" + else: + return f"({self.query})" + + +class SparkOptions: + def __init__( + self, + table: str, + query: str, + ): + self._table = table + self._query = query + + @property + def table(self): + """ + Returns the table + """ + return self._table + + @table.setter + def table(self, table): + """ + Sets the table + """ + self._table = table + + @property + def query(self): + """ + Returns the query + """ + return self._query + + @query.setter + def query(self, query): + """ + Sets the query + """ + self._query = query + + @classmethod + def from_proto(cls, spark_options_proto: DataSourceProto.CustomSourceOptions): + """ + Creates a SparkOptions from a protobuf representation of a spark option + args: + spark_options_proto: a protobuf representation of a datasource + Returns: + Returns a SparkOptions object based on the spark_options protobuf + """ + spark_configuration = pickle.loads(spark_options_proto.configuration) + + spark_options = cls( + table=spark_configuration.table, + query=spark_configuration.query, + ) + return spark_options + + def to_proto(self) -> DataSourceProto.CustomSourceOptions: + """ + Converts an SparkOptionsProto object to its protobuf representation. 
+ Returns: + SparkOptionsProto protobuf + """ + + spark_options_proto = DataSourceProto.CustomSourceOptions( + configuration=pickle.dumps(self), + ) + + return spark_options_proto + + +class SavedDatasetSparkStorage(SavedDatasetStorage): + _proto_attr_name = "spark_storage" + + spark_options: SparkOptions + def __init__(self, table_ref: str, query: str): + self.spark_options = SparkOptions(table=table_ref, query=query) + + @staticmethod + def from_proto(storage_proto: SavedDatasetStorageProto) -> SavedDatasetStorage: + # options = SparkOptions.from_proto( + # storage_proto + # ) + # spark_options = SparkOptions(table=options.table, query=options.query) + return SavedDatasetSparkStorage(table="", query=None) + + def to_proto(self) -> SavedDatasetStorageProto: + return SavedDatasetStorageProto() + + def to_data_source(self) -> DataSource: + return SparkSource(table=self.spark_options.table) \ No newline at end of file diff --git a/sdk/python/feast/infra/offline_stores/third_party/spark_type_map.py b/sdk/python/feast/infra/offline_stores/third_party/spark_type_map.py new file mode 100644 index 0000000000..253eadcff9 --- /dev/null +++ b/sdk/python/feast/infra/offline_stores/third_party/spark_type_map.py @@ -0,0 +1,53 @@ +from typing import Dict, List, Tuple, Iterator + +from feast import ValueType +from collections import defaultdict +from numpy import dtype + + +def spark_to_feast_value_type(spark_type_as_str: str) -> ValueType: + # TODO not all spark types are convertible + type_map: Dict[str, ValueType] = { + "null": ValueType.UNKNOWN, + "byte": ValueType.BYTES, + "string": ValueType.STRING, + "int": ValueType.INT32, + "bigint": ValueType.INT64, + "long": ValueType.INT64, + "double": ValueType.DOUBLE, + "float": ValueType.FLOAT, + "boolean": ValueType.BOOL, + "timestamp": ValueType.UNIX_TIMESTAMP, + "array": ValueType.BYTES_LIST, + "array": ValueType.STRING_LIST, + "array": ValueType.INT32_LIST, + "array": ValueType.INT64_LIST, + "array": ValueType.DOUBLE_LIST, + "array": ValueType.FLOAT_LIST, + "array": ValueType.BOOL_LIST, + "array": ValueType.UNIX_TIMESTAMP_LIST, + } + #TODO: this is just incorrect fix + if(type(spark_type_as_str) != str or spark_type_as_str not in type_map): + return ValueType.NULL + return type_map[spark_type_as_str.lower()] + + +def spark_schema_to_np_dtypes(dtypes: List[Tuple[str, str]]) -> Iterator[dtype]: + # TODO recheck all typing (also tz for timestamp) + # https://spark.apache.org/docs/latest/api/python/user_guide/arrow_pandas.html#timestamp-with-time-zone-semantics + + type_map = defaultdict( + lambda: dtype("O"), + { + "boolean": dtype("bool"), + "double": dtype("float64"), + "float": dtype("float64"), + "int": dtype("int64"), + "bigint": dtype("int64"), + "smallint": dtype("int64"), + "timestamp": dtype("datetime64[ns]"), + }, + ) + + return (type_map[t] for _, t in dtypes) \ No newline at end of file diff --git a/sdk/python/tests/integration/feature_repos/universal/data_sources/spark.py b/sdk/python/tests/integration/feature_repos/universal/data_sources/spark.py new file mode 100644 index 0000000000..e430072967 --- /dev/null +++ b/sdk/python/tests/integration/feature_repos/universal/data_sources/spark.py @@ -0,0 +1,84 @@ +from typing import List, Union, Optional, Dict, Tuple +from feast.infra.offline_stores.third_party.spark import SparkOfflineStoreConfig +from feast.infra.offline_stores.third_party.spark_source import SparkSource, SavedDatasetSparkStorage +from tests.integration.feature_repos.universal.data_source_creator import ( + DataSourceCreator, +) + 
+ +import uuid +import pandas as pd + +from feast.data_source import DataSource + +from pyspark.sql import SparkSession +from pyspark import SparkConf +class SparkDataSourceCreator(DataSourceCreator): + tables: List[str] = [] + spark_offline_store_config = None + spark_session = None + + def __init__(self, project_name: str): + self.spark_conf = {'master': 'local[*]', 'spark.ui.enabled': 'false', 'spark.eventLog.enabled': 'false', 'spark.sql.parser.quotedRegexColumnNames': 'true', 'spark.sql.session.timeZone': 'UTC'} + self.project_name = project_name + if not self.spark_offline_store_config: + self.create_offline_store_config() + if not self.spark_session: + self.spark_session = (SparkSession + .builder + .config(conf=SparkConf().setAll(self.spark_conf.items())) + .appName('pytest-pyspark-local-testing') + .getOrCreate()) + self.tables: List[str] = [] + + def teardown(self): + self.spark_session.stop() + + def create_offline_store_config(self): + self.spark_offline_store_config = SparkOfflineStoreConfig() + self.spark_offline_store_config.type = "feast_spark_offline_store.spark.SparkOfflineStore" + self.spark_offline_store_config.spark_conf = self.spark_conf + return self.spark_offline_store_config + + # abstract + def create_data_source( + self, + df: pd.DataFrame, + destination_name: str, + event_timestamp_column="ts", + created_timestamp_column="created_ts", + field_mapping: Dict[str, str] = None, + **kwargs, + ) -> DataSource: + #df["event_timestamp"] = pd.to_datetime(df["event_timestamp"], utc=True) + if event_timestamp_column in df: + df[event_timestamp_column] = pd.to_datetime(df[event_timestamp_column], utc=True) + # Make sure the field mapping is correct and convert the datetime datasources. + if field_mapping: + timestamp_mapping = {value:key for key, value in field_mapping.items()} + if(event_timestamp_column in timestamp_mapping and timestamp_mapping[event_timestamp_column] in df): + col = timestamp_mapping[event_timestamp_column] + df[col] = pd.to_datetime(df[col], utc=True) + + # https://stackoverflow.com/questions/51871200/analysisexception-it-is-not-allowed-to-add-database-prefix + # destination_name = self.get_prefixed_table_name(destination_name) + + df = self.spark_session.createDataFrame(df).createOrReplaceTempView(destination_name) + + self.tables.append(destination_name) + return SparkSource( + table=destination_name, + event_timestamp_column=event_timestamp_column, + created_timestamp_column=created_timestamp_column, + date_partition_column="", + # feature_view => datasource accompanied + # maps certain column names to other names + field_mapping=field_mapping or {"ts_1": "ts"}, + ) + + def create_saved_dataset_destination(self) -> SavedDatasetSparkStorage: + table = f"persisted_{str(uuid.uuid4()).replace('-', '_')}" + return SavedDatasetSparkStorage(table_ref=table, query="") + + def get_prefixed_table_name(self, suffix: str) -> str: + return f"{self.project_name}.{suffix}" \ No newline at end of file From d13119f3f61a2fdc503f88b9e4c3d2833a705f00 Mon Sep 17 00:00:00 2001 From: Kevin Zhang Date: Mon, 28 Feb 2022 17:01:59 -0800 Subject: [PATCH 08/58] Fix lint Signed-off-by: Kevin Zhang --- .../infra/offline_stores/third_party/spark.py | 50 ++++++--------- .../third_party/spark_source.py | 26 ++++---- .../third_party/spark_type_map.py | 13 ++-- .../universal/data_sources/spark.py | 64 ++++++++++++------- 4 files changed, 82 insertions(+), 71 deletions(-) diff --git a/sdk/python/feast/infra/offline_stores/third_party/spark.py 
b/sdk/python/feast/infra/offline_stores/third_party/spark.py index 4d73f7d254..494fc44b89 100644 --- a/sdk/python/feast/infra/offline_stores/third_party/spark.py +++ b/sdk/python/feast/infra/offline_stores/third_party/spark.py @@ -1,15 +1,16 @@ import inspect -from typing import List, Union, Optional, Dict, Tuple -from ntpath import join -from pydantic import StrictStr from datetime import datetime +from ntpath import join +from typing import Dict, List, Optional, Tuple, Union -import pandas -from dateutil import parser -import pyspark -import pyarrow import numpy as np +import pandas import pandas as pd +import pyarrow +import pyspark +from dateutil import parser +from feast_spark_offline_store.spark_source import SavedDatasetSparkStorage, SparkSource +from feast_spark_offline_store.spark_type_map import spark_schema_to_np_dtypes from pydantic import StrictStr from pyspark import SparkConf from pyspark.sql import SparkSession @@ -17,30 +18,18 @@ from feast import FeatureView, OnDemandFeatureView from feast.data_source import DataSource -from feast.repo_config import FeastConfigBaseModel, RepoConfig -from feast.saved_dataset import SavedDatasetStorage - -import pandas as pd - -from feast.repo_config import FeastConfigBaseModel - -from feast.feature_view import DUMMY_ENTITY_ID, DUMMY_ENTITY_VAL, FeatureView -from feast.infra.offline_stores.offline_store import OfflineStore, RetrievalJob, RetrievalMetadata -from feast.infra.offline_stores import offline_utils - - - from feast.errors import InvalidEntityType +from feast.feature_view import DUMMY_ENTITY_ID, DUMMY_ENTITY_VAL, FeatureView from feast.infra.offline_stores import offline_utils -from feast.infra.offline_stores.offline_store import OfflineStore, RetrievalJob +from feast.infra.offline_stores.offline_store import ( + OfflineStore, + RetrievalJob, + RetrievalMetadata, +) from feast.infra.offline_stores.offline_utils import FeatureViewQueryContext from feast.registry import Registry from feast.repo_config import FeastConfigBaseModel, RepoConfig - -from pyspark.sql import SparkSession -from pyspark import SparkConf -from feast_spark_offline_store.spark_source import SparkSource, SavedDatasetSparkStorage -from feast_spark_offline_store.spark_type_map import spark_schema_to_np_dtypes +from feast.saved_dataset import SavedDatasetStorage class SparkOfflineStoreConfig(FeastConfigBaseModel): @@ -84,8 +73,7 @@ def pull_latest_from_table_or_query( if created_timestamp_column: timestamps.append(created_timestamp_column) timestamp_desc_string = " DESC, ".join(timestamps) + " DESC" - field_string = ", ".join( - join_key_columns + feature_name_columns + timestamps) + field_string = ", ".join(join_key_columns + feature_name_columns + timestamps) start_date = _format_datetime(start_date) end_date = _format_datetime(end_date) @@ -208,6 +196,7 @@ def pull_all_from_table_or_query( end_date=end_date, ) + # TODO fix internal abstract methods _to_df_internal _to_arrow_internal class SparkRetrievalJob(RetrievalJob): def __init__( @@ -312,13 +301,14 @@ def _get_entity_df_event_timestamp_range( df = spark_session.sql(entity_df).select(entity_df_event_timestamp_col) entity_df_event_timestamp_range = ( df.agg({entity_df_event_timestamp_col: "max"}).collect()[0][0], - df.agg({entity_df_event_timestamp_col: "min"}).collect()[0][0] + df.agg({entity_df_event_timestamp_col: "min"}).collect()[0][0], ) else: raise InvalidEntityType(type(entity_df)) return entity_df_event_timestamp_range + def _upload_entity_df_and_get_entity_schema( spark_session: SparkSession, 
table_name: str, @@ -339,6 +329,7 @@ def _upload_entity_df_and_get_entity_schema( else: raise InvalidEntityType(type(entity_df)) + def _format_datetime(t: datetime): # Since Hive does not support timezone, need to transform to utc. if t.tzinfo: @@ -346,7 +337,6 @@ def _format_datetime(t: datetime): t = t.strftime("%Y-%m-%d %H:%M:%S.%f") return t - return spark_session diff --git a/sdk/python/feast/infra/offline_stores/third_party/spark_source.py b/sdk/python/feast/infra/offline_stores/third_party/spark_source.py index 13592dd4ca..929705f40c 100644 --- a/sdk/python/feast/infra/offline_stores/third_party/spark_source.py +++ b/sdk/python/feast/infra/offline_stores/third_party/spark_source.py @@ -1,16 +1,18 @@ -from typing import Optional, Dict, Callable, Any, Tuple, Iterable -from feast.data_source import DataSource -from feast.repo_config import RepoConfig +import pickle +from typing import Any, Callable, Dict, Iterable, Optional, Tuple + +from feast_spark_offline_store.spark_type_map import spark_to_feast_value_type from pyspark.sql.utils import AnalysisException -from feast.value_type import ValueType + +from feast.data_source import DataSource +from feast.errors import DataSourceNotFoundException from feast.protos.feast.core.DataSource_pb2 import DataSource as DataSourceProto from feast.protos.feast.core.SavedDataset_pb2 import ( SavedDatasetStorage as SavedDatasetStorageProto, ) +from feast.repo_config import RepoConfig from feast.saved_dataset import SavedDatasetStorage -from feast_spark_offline_store.spark_type_map import spark_to_feast_value_type -import pickle -from feast.errors import DataSourceNotFoundException +from feast.value_type import ValueType class SparkSource(DataSource): @@ -143,9 +145,7 @@ def get_table_query_string(self) -> str: class SparkOptions: def __init__( - self, - table: str, - query: str, + self, table: str, query: str, ): self._table = table self._query = query @@ -190,8 +190,7 @@ def from_proto(cls, spark_options_proto: DataSourceProto.CustomSourceOptions): spark_configuration = pickle.loads(spark_options_proto.configuration) spark_options = cls( - table=spark_configuration.table, - query=spark_configuration.query, + table=spark_configuration.table, query=spark_configuration.query, ) return spark_options @@ -213,6 +212,7 @@ class SavedDatasetSparkStorage(SavedDatasetStorage): _proto_attr_name = "spark_storage" spark_options: SparkOptions + def __init__(self, table_ref: str, query: str): self.spark_options = SparkOptions(table=table_ref, query=query) @@ -228,4 +228,4 @@ def to_proto(self) -> SavedDatasetStorageProto: return SavedDatasetStorageProto() def to_data_source(self) -> DataSource: - return SparkSource(table=self.spark_options.table) \ No newline at end of file + return SparkSource(table=self.spark_options.table) diff --git a/sdk/python/feast/infra/offline_stores/third_party/spark_type_map.py b/sdk/python/feast/infra/offline_stores/third_party/spark_type_map.py index 253eadcff9..dee1c20d41 100644 --- a/sdk/python/feast/infra/offline_stores/third_party/spark_type_map.py +++ b/sdk/python/feast/infra/offline_stores/third_party/spark_type_map.py @@ -1,9 +1,10 @@ -from typing import Dict, List, Tuple, Iterator - -from feast import ValueType from collections import defaultdict +from typing import Dict, Iterator, List, Tuple + from numpy import dtype +from feast import ValueType + def spark_to_feast_value_type(spark_type_as_str: str) -> ValueType: # TODO not all spark types are convertible @@ -27,8 +28,8 @@ def spark_to_feast_value_type(spark_type_as_str: 
str) -> ValueType: "array": ValueType.BOOL_LIST, "array": ValueType.UNIX_TIMESTAMP_LIST, } - #TODO: this is just incorrect fix - if(type(spark_type_as_str) != str or spark_type_as_str not in type_map): + # TODO: this is just incorrect fix + if type(spark_type_as_str) != str or spark_type_as_str not in type_map: return ValueType.NULL return type_map[spark_type_as_str.lower()] @@ -50,4 +51,4 @@ def spark_schema_to_np_dtypes(dtypes: List[Tuple[str, str]]) -> Iterator[dtype]: }, ) - return (type_map[t] for _, t in dtypes) \ No newline at end of file + return (type_map[t] for _, t in dtypes) diff --git a/sdk/python/tests/integration/feature_repos/universal/data_sources/spark.py b/sdk/python/tests/integration/feature_repos/universal/data_sources/spark.py index e430072967..4cb7c52244 100644 --- a/sdk/python/tests/integration/feature_repos/universal/data_sources/spark.py +++ b/sdk/python/tests/integration/feature_repos/universal/data_sources/spark.py @@ -1,34 +1,45 @@ -from typing import List, Union, Optional, Dict, Tuple +import uuid +from typing import Dict, List, Optional, Tuple, Union + +import pandas as pd +from pyspark import SparkConf +from pyspark.sql import SparkSession + +from feast.data_source import DataSource from feast.infra.offline_stores.third_party.spark import SparkOfflineStoreConfig -from feast.infra.offline_stores.third_party.spark_source import SparkSource, SavedDatasetSparkStorage +from feast.infra.offline_stores.third_party.spark_source import ( + SavedDatasetSparkStorage, + SparkSource, +) from tests.integration.feature_repos.universal.data_source_creator import ( DataSourceCreator, ) -import uuid -import pandas as pd - -from feast.data_source import DataSource - -from pyspark.sql import SparkSession -from pyspark import SparkConf class SparkDataSourceCreator(DataSourceCreator): tables: List[str] = [] spark_offline_store_config = None spark_session = None def __init__(self, project_name: str): - self.spark_conf = {'master': 'local[*]', 'spark.ui.enabled': 'false', 'spark.eventLog.enabled': 'false', 'spark.sql.parser.quotedRegexColumnNames': 'true', 'spark.sql.session.timeZone': 'UTC'} + self.spark_conf = { + "master": "local[*]", + "spark.ui.enabled": "false", + "spark.eventLog.enabled": "false", + "spark.sql.parser.quotedRegexColumnNames": "true", + "spark.sql.session.timeZone": "UTC", + } self.project_name = project_name if not self.spark_offline_store_config: self.create_offline_store_config() if not self.spark_session: - self.spark_session = (SparkSession - .builder - .config(conf=SparkConf().setAll(self.spark_conf.items())) - .appName('pytest-pyspark-local-testing') - .getOrCreate()) + self.spark_session = ( + SparkSession.builder.config( + conf=SparkConf().setAll(self.spark_conf.items()) + ) + .appName("pytest-pyspark-local-testing") + .getOrCreate() + ) self.tables: List[str] = [] def teardown(self): @@ -36,7 +47,9 @@ def teardown(self): def create_offline_store_config(self): self.spark_offline_store_config = SparkOfflineStoreConfig() - self.spark_offline_store_config.type = "feast_spark_offline_store.spark.SparkOfflineStore" + self.spark_offline_store_config.type = ( + "feast_spark_offline_store.spark.SparkOfflineStore" + ) self.spark_offline_store_config.spark_conf = self.spark_conf return self.spark_offline_store_config @@ -50,20 +63,27 @@ def create_data_source( field_mapping: Dict[str, str] = None, **kwargs, ) -> DataSource: - #df["event_timestamp"] = pd.to_datetime(df["event_timestamp"], utc=True) + # df["event_timestamp"] = 
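The `# TODO: this is just incorrect fix` comment above comes down to plain Python dict semantics: a literal may repeat a key, but only the last assignment survives, so the repeated "array" entries in type_map cannot distinguish list element types. A short demonstration, not taken from the patch:

```python
# Plain Python behaviour, shown outside the patch: duplicate keys in a dict
# literal are legal but collapse to the last value, so the "array" entries in
# the type_map above overwrite each other.
type_map = {
    "array": "INT64_LIST",
    "array": "DOUBLE_LIST",
    "array": "UNIX_TIMESTAMP_LIST",
}
print(type_map)       # {'array': 'UNIX_TIMESTAMP_LIST'}
print(len(type_map))  # 1
```

Keying the map on the full Spark type string (for example "array<bigint>") would avoid the collision; as written, the function falls back to ValueType.NULL for anything it cannot match.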
pd.to_datetime(df["event_timestamp"], utc=True) if event_timestamp_column in df: - df[event_timestamp_column] = pd.to_datetime(df[event_timestamp_column], utc=True) + df[event_timestamp_column] = pd.to_datetime( + df[event_timestamp_column], utc=True + ) # Make sure the field mapping is correct and convert the datetime datasources. if field_mapping: - timestamp_mapping = {value:key for key, value in field_mapping.items()} - if(event_timestamp_column in timestamp_mapping and timestamp_mapping[event_timestamp_column] in df): + timestamp_mapping = {value: key for key, value in field_mapping.items()} + if ( + event_timestamp_column in timestamp_mapping + and timestamp_mapping[event_timestamp_column] in df + ): col = timestamp_mapping[event_timestamp_column] df[col] = pd.to_datetime(df[col], utc=True) # https://stackoverflow.com/questions/51871200/analysisexception-it-is-not-allowed-to-add-database-prefix # destination_name = self.get_prefixed_table_name(destination_name) - df = self.spark_session.createDataFrame(df).createOrReplaceTempView(destination_name) + df = self.spark_session.createDataFrame(df).createOrReplaceTempView( + destination_name + ) self.tables.append(destination_name) return SparkSource( @@ -81,4 +101,4 @@ def create_saved_dataset_destination(self) -> SavedDatasetSparkStorage: return SavedDatasetSparkStorage(table_ref=table, query="") def get_prefixed_table_name(self, suffix: str) -> str: - return f"{self.project_name}.{suffix}" \ No newline at end of file + return f"{self.project_name}.{suffix}" From 4b56f55861a7967dd047a872205cb6c081e37eb3 Mon Sep 17 00:00:00 2001 From: Kevin Zhang Date: Mon, 28 Feb 2022 17:05:10 -0800 Subject: [PATCH 09/58] Rename to pass checks Signed-off-by: Kevin Zhang --- .../data_sources/{spark.py => spark_data_source_creator.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename sdk/python/tests/integration/feature_repos/universal/data_sources/{spark.py => spark_data_source_creator.py} (100%) diff --git a/sdk/python/tests/integration/feature_repos/universal/data_sources/spark.py b/sdk/python/tests/integration/feature_repos/universal/data_sources/spark_data_source_creator.py similarity index 100% rename from sdk/python/tests/integration/feature_repos/universal/data_sources/spark.py rename to sdk/python/tests/integration/feature_repos/universal/data_sources/spark_data_source_creator.py From 6e278c4bae8bd93d58d13a20bd715dec643dd5e3 Mon Sep 17 00:00:00 2001 From: Kevin Zhang Date: Mon, 28 Feb 2022 17:35:24 -0800 Subject: [PATCH 10/58] Fix issues Signed-off-by: Kevin Zhang --- sdk/python/feast/feature_store.py | 6 +++- .../infra/offline_stores/third_party/spark.py | 34 +++++-------------- .../third_party/spark_source.py | 9 ++--- .../feature_repos/repo_configuration.py | 6 ++++ .../data_sources/spark_data_source_creator.py | 15 +++++--- .../registration/test_universal_types.py | 10 +++++- 6 files changed, 44 insertions(+), 36 deletions(-) diff --git a/sdk/python/feast/feature_store.py b/sdk/python/feast/feature_store.py index b47e6745c9..20302ecd8e 100644 --- a/sdk/python/feast/feature_store.py +++ b/sdk/python/feast/feature_store.py @@ -758,7 +758,8 @@ def get_historical_features( all_request_feature_views, all_on_demand_feature_views, ) = self._get_feature_views_to_use(features) - + print("features and refs") + print(_feature_refs) # TODO(achal): _group_feature_refs returns the on demand feature views, but it's no passed into the provider. 
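create_data_source above normalises the timestamp columns and registers the pandas frame as a Spark temp view so later SQL can query it by name. A standalone sketch of that mechanism, with invented table and column names:

```python
# Standalone sketch of the temp-view mechanism used above; not part of the patch.
import pandas as pd
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[*]").appName("temp-view-sketch").getOrCreate()

df = pd.DataFrame(
    {
        "driver_id": [1001, 1002],
        "conv_rate": [0.5, 0.7],
        "event_timestamp": ["2022-02-28 10:00:00", "2022-02-28 12:00:00"],
    }
)
# Same normalisation as create_data_source: make the timestamp column tz-aware UTC.
df["event_timestamp"] = pd.to_datetime(df["event_timestamp"], utc=True)

# createOrReplaceTempView returns None, so its result is not assigned anywhere.
spark.createDataFrame(df).createOrReplaceTempView("driver_stats")
spark.sql("SELECT driver_id, conv_rate FROM driver_stats ORDER BY driver_id").show()
```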
# This is a weird interface quirk - we should revisit the `get_historical_features` to # pass in the on demand feature views as well. @@ -768,6 +769,7 @@ def get_historical_features( all_request_feature_views, all_on_demand_feature_views, ) + feature_views = list(view for view, _ in fvs) on_demand_feature_views = list(view for view, _ in odfvs) request_feature_views = list(view for view, _ in request_fvs) @@ -796,6 +798,8 @@ def get_historical_features( # Drop refs that refer to RequestFeatureViews since they don't need to be fetched and # already exist in the entity_df _feature_refs = [ref for ref in _feature_refs if ref not in request_fv_refs] + print("Feature refssss") + print(_feature_refs) provider = self._get_provider() job = provider.get_historical_features( diff --git a/sdk/python/feast/infra/offline_stores/third_party/spark.py b/sdk/python/feast/infra/offline_stores/third_party/spark.py index 494fc44b89..c01604e845 100644 --- a/sdk/python/feast/infra/offline_stores/third_party/spark.py +++ b/sdk/python/feast/infra/offline_stores/third_party/spark.py @@ -75,8 +75,8 @@ def pull_latest_from_table_or_query( timestamp_desc_string = " DESC, ".join(timestamps) + " DESC" field_string = ", ".join(join_key_columns + feature_name_columns + timestamps) - start_date = _format_datetime(start_date) - end_date = _format_datetime(end_date) + start_date_str = _format_datetime(start_date) + end_date_str = _format_datetime(end_date) query = f""" SELECT {field_string} @@ -85,7 +85,7 @@ def pull_latest_from_table_or_query( SELECT {field_string}, ROW_NUMBER() OVER({partition_by_join_key_string} ORDER BY {timestamp_desc_string}) AS feast_row_ FROM {from_expression} t1 - WHERE {event_timestamp_column} BETWEEN TIMESTAMP('{start_date}') AND TIMESTAMP('{end_date}') + WHERE {event_timestamp_column} BETWEEN TIMESTAMP('{start_date_str}') AND TIMESTAMP('{end_date_str}') ) t2 WHERE feast_row_ = 1 """ @@ -229,18 +229,12 @@ def to_spark_df(self) -> pyspark.sql.DataFrame: *_, last = map(self.spark_session.sql, statements) return last - def to_df(self) -> pandas.DataFrame: - return self.to_spark_df().toPandas() # noqa, DataFrameLike instead of DataFrame - def _to_df_internal(self) -> pd.DataFrame: """Return dataset as Pandas DataFrame synchronously""" - return self.to_df() + return self.to_spark_df().toPandas() def _to_arrow_internal(self) -> pyarrow.Table: """Return dataset as pyarrow Table synchronously""" - return self.to_arrow() - - def to_arrow(self) -> pyarrow.Table: df = self.to_df() return pyarrow.Table.from_pandas(df) # noqa @@ -268,7 +262,7 @@ def get_spark_session_or_start_new_with_repoconfig( spark_conf = store_config.spark_conf if spark_conf: spark_builder = spark_builder.config( - conf=SparkConf().setAll(spark_conf.items()) + conf=SparkConf().setAll([(k, v) for k, v in spark_conf.items()]) ) # noqa spark_session = spark_builder.getOrCreate() @@ -330,23 +324,12 @@ def _upload_entity_df_and_get_entity_schema( raise InvalidEntityType(type(entity_df)) -def _format_datetime(t: datetime): - # Since Hive does not support timezone, need to transform to utc. - if t.tzinfo: - t = t.astimezone(tz=utc) - t = t.strftime("%Y-%m-%d %H:%M:%S.%f") - return t - - return spark_session - - -def _format_datetime(t: datetime): +def _format_datetime(t: datetime) -> str: # Since Hive does not support timezone, need to transform to utc. 
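The pull_latest_from_table_or_query template above is easier to read once rendered. The snippet below builds an invented rendering for a single join key, showing how ROW_NUMBER() keeps only the newest row per entity inside the requested window; it is not output captured from the patch or its tests.

```python
# Invented rendering of the query template above; table, columns, and dates
# are placeholders rather than values taken from the patch.
field_string = ", ".join(["driver_id"] + ["conv_rate"] + ["event_timestamp"])
query = f"""
SELECT {field_string}
FROM (
    SELECT {field_string},
    ROW_NUMBER() OVER(PARTITION BY driver_id ORDER BY event_timestamp DESC) AS feast_row_
    FROM driver_stats t1
    WHERE event_timestamp BETWEEN TIMESTAMP('2022-02-28 00:00:00.000000') AND TIMESTAMP('2022-03-01 00:00:00.000000')
) t2
WHERE feast_row_ = 1
"""
print(query)  # one row per driver_id: the latest event_timestamp in the window
```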
if t.tzinfo: t = t.astimezone(tz=utc) - t = t.strftime("%Y-%m-%d %H:%M:%S.%f") - return t - + dt = t.strftime("%Y-%m-%d %H:%M:%S.%f") + return dt def _get_feature_view_query_context( entity_df: Union[pd.DataFrame, str], @@ -366,7 +349,6 @@ def _get_feature_view_query_context( entity_df=entity_df, entity_df_event_timestamp_col=entity_df_event_timestamp_col, spark_session=spark_session, - table_name=table_name, ) query_context = offline_utils.get_feature_view_query_context( feature_refs=feature_refs, diff --git a/sdk/python/feast/infra/offline_stores/third_party/spark_source.py b/sdk/python/feast/infra/offline_stores/third_party/spark_source.py index 929705f40c..9ba99797d5 100644 --- a/sdk/python/feast/infra/offline_stores/third_party/spark_source.py +++ b/sdk/python/feast/infra/offline_stores/third_party/spark_source.py @@ -145,7 +145,9 @@ def get_table_query_string(self) -> str: class SparkOptions: def __init__( - self, table: str, query: str, + self, + table: Optional[str] = None, + query: Optional[str] = None, ): self._table = table self._query = query @@ -212,8 +214,7 @@ class SavedDatasetSparkStorage(SavedDatasetStorage): _proto_attr_name = "spark_storage" spark_options: SparkOptions - - def __init__(self, table_ref: str, query: str): + def __init__(self, table_ref: Optional[str] = None, query: Optional[str] = None): self.spark_options = SparkOptions(table=table_ref, query=query) @staticmethod @@ -222,7 +223,7 @@ def from_proto(storage_proto: SavedDatasetStorageProto) -> SavedDatasetStorage: # storage_proto # ) # spark_options = SparkOptions(table=options.table, query=options.query) - return SavedDatasetSparkStorage(table="", query=None) + return SavedDatasetSparkStorage(table_ref="", query=None) def to_proto(self) -> SavedDatasetStorageProto: return SavedDatasetStorageProto() diff --git a/sdk/python/tests/integration/feature_repos/repo_configuration.py b/sdk/python/tests/integration/feature_repos/repo_configuration.py index d7e4231d7f..7964b287d7 100644 --- a/sdk/python/tests/integration/feature_repos/repo_configuration.py +++ b/sdk/python/tests/integration/feature_repos/repo_configuration.py @@ -33,6 +33,9 @@ from tests.integration.feature_repos.universal.data_sources.snowflake import ( SnowflakeDataSourceCreator, ) +from tests.integration.feature_repos.universal.data_sources.spark_data_source_creator import ( + SparkDataSourceCreator, +) from tests.integration.feature_repos.universal.feature_views import ( conv_rate_plus_100_feature_view, create_conv_rate_request_data_source, @@ -74,6 +77,9 @@ IntegrationTestRepoConfig(online_store=REDIS_CONFIG), IntegrationTestRepoConfig(online_store=REDIS_CLUSTER_CONFIG), # GCP configurations + # IntegrationTestRepoConfig( + # provider="local", offline_store_creator=SparkDataSourceCreator, + # ) IntegrationTestRepoConfig( provider="gcp", offline_store_creator=BigQueryDataSourceCreator, diff --git a/sdk/python/tests/integration/feature_repos/universal/data_sources/spark_data_source_creator.py b/sdk/python/tests/integration/feature_repos/universal/data_sources/spark_data_source_creator.py index 4cb7c52244..271495d875 100644 --- a/sdk/python/tests/integration/feature_repos/universal/data_sources/spark_data_source_creator.py +++ b/sdk/python/tests/integration/feature_repos/universal/data_sources/spark_data_source_creator.py @@ -35,7 +35,7 @@ def __init__(self, project_name: str): if not self.spark_session: self.spark_session = ( SparkSession.builder.config( - conf=SparkConf().setAll(self.spark_conf.items()) + conf=SparkConf().setAll([(k, v) for k, v 
in self.spark_conf.items()]) ) .appName("pytest-pyspark-local-testing") .getOrCreate() @@ -80,12 +80,19 @@ def create_data_source( # https://stackoverflow.com/questions/51871200/analysisexception-it-is-not-allowed-to-add-database-prefix # destination_name = self.get_prefixed_table_name(destination_name) - - df = self.spark_session.createDataFrame(df).createOrReplaceTempView( + if not self.spark_session: + self.spark_session = ( + SparkSession.builder.config( + conf=SparkConf().setAll([(k, v) for k, v in self.spark_conf.items()]) + ) + .appName("pytest-pyspark-local-testing") + .getOrCreate() + ) + self.spark_session.createDataFrame(df).createOrReplaceTempView( destination_name ) - self.tables.append(destination_name) + return SparkSource( table=destination_name, event_timestamp_column=event_timestamp_column, diff --git a/sdk/python/tests/integration/registration/test_universal_types.py b/sdk/python/tests/integration/registration/test_universal_types.py index 59ca119f98..e6d819e484 100644 --- a/sdk/python/tests/integration/registration/test_universal_types.py +++ b/sdk/python/tests/integration/registration/test_universal_types.py @@ -181,9 +181,12 @@ def test_feature_get_historical_features_types_match(offline_types_test_fixtures ts - timedelta(hours=4), ts - timedelta(hours=2), ] + print("entity df") + print(entity_df) historical_features = fs.get_historical_features( entity_df=entity_df, features=features, ) + print(historical_features) # Note: Pandas doesn't play well with nan values in ints. BQ will also coerce to floats if there are NaNs historical_features_df = historical_features.to_df() print(historical_features_df) @@ -285,7 +288,8 @@ def create_feature_view( value_type = ValueType.BOOL elif feature_dtype == "datetime": value_type = ValueType.UNIX_TIMESTAMP - + print("value type") + print(value_type) return driver_feature_view(data_source, name=name, value_type=value_type,) @@ -358,6 +362,8 @@ def assert_expected_arrow_types( "date": pa.types.is_date, "datetime": pa.types.is_timestamp, } + print("asdfas") + print(feature_dtype) arrow_type_checker = feature_list_dtype_to_expected_historical_feature_arrow_type[ feature_dtype ] @@ -365,6 +371,8 @@ def assert_expected_arrow_types( if feature_is_list: assert pa.types.is_list(pa_type) + print("Type") + print(pa_type.value_type) assert arrow_type_checker(pa_type.value_type) else: assert arrow_type_checker(pa_type) From 4bc67d85f74ff45769b18d8fadb2590c9e5fd8b9 Mon Sep 17 00:00:00 2001 From: Kevin Zhang Date: Mon, 28 Feb 2022 17:41:46 -0800 Subject: [PATCH 11/58] Fix type checking issues Signed-off-by: Kevin Zhang --- .../feast/infra/offline_stores/third_party/spark.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/sdk/python/feast/infra/offline_stores/third_party/spark.py b/sdk/python/feast/infra/offline_stores/third_party/spark.py index c01604e845..b16350ed62 100644 --- a/sdk/python/feast/infra/offline_stores/third_party/spark.py +++ b/sdk/python/feast/infra/offline_stores/third_party/spark.py @@ -357,14 +357,6 @@ def _get_feature_view_query_context( project=project, entity_df_timestamp_range=entity_df_timestamp_range, ) - else: - # for feast<0.17 - query_context = offline_utils.get_feature_view_query_context( - feature_refs=feature_refs, - feature_views=feature_views, - registry=registry, - project=project, - ) return query_context From c934edce4ca0b6aa70d370e54bf0b015d1f00f53 Mon Sep 17 00:00:00 2001 From: Kevin Zhang Date: Mon, 28 Feb 2022 17:42:24 -0800 Subject: [PATCH 12/58] Fix lint Signed-off-by: Kevin Zhang --- 
.../feast/infra/offline_stores/third_party/spark.py | 1 + .../infra/offline_stores/third_party/spark_source.py | 5 ++--- .../data_sources/spark_data_source_creator.py | 12 +++++++----- 3 files changed, 10 insertions(+), 8 deletions(-) diff --git a/sdk/python/feast/infra/offline_stores/third_party/spark.py b/sdk/python/feast/infra/offline_stores/third_party/spark.py index b16350ed62..f1ba2b1a57 100644 --- a/sdk/python/feast/infra/offline_stores/third_party/spark.py +++ b/sdk/python/feast/infra/offline_stores/third_party/spark.py @@ -331,6 +331,7 @@ def _format_datetime(t: datetime) -> str: dt = t.strftime("%Y-%m-%d %H:%M:%S.%f") return dt + def _get_feature_view_query_context( entity_df: Union[pd.DataFrame, str], entity_df_event_timestamp_col: str, diff --git a/sdk/python/feast/infra/offline_stores/third_party/spark_source.py b/sdk/python/feast/infra/offline_stores/third_party/spark_source.py index 9ba99797d5..ce5cb1b59e 100644 --- a/sdk/python/feast/infra/offline_stores/third_party/spark_source.py +++ b/sdk/python/feast/infra/offline_stores/third_party/spark_source.py @@ -145,9 +145,7 @@ def get_table_query_string(self) -> str: class SparkOptions: def __init__( - self, - table: Optional[str] = None, - query: Optional[str] = None, + self, table: Optional[str] = None, query: Optional[str] = None, ): self._table = table self._query = query @@ -214,6 +212,7 @@ class SavedDatasetSparkStorage(SavedDatasetStorage): _proto_attr_name = "spark_storage" spark_options: SparkOptions + def __init__(self, table_ref: Optional[str] = None, query: Optional[str] = None): self.spark_options = SparkOptions(table=table_ref, query=query) diff --git a/sdk/python/tests/integration/feature_repos/universal/data_sources/spark_data_source_creator.py b/sdk/python/tests/integration/feature_repos/universal/data_sources/spark_data_source_creator.py index 271495d875..0d995bdea7 100644 --- a/sdk/python/tests/integration/feature_repos/universal/data_sources/spark_data_source_creator.py +++ b/sdk/python/tests/integration/feature_repos/universal/data_sources/spark_data_source_creator.py @@ -35,7 +35,9 @@ def __init__(self, project_name: str): if not self.spark_session: self.spark_session = ( SparkSession.builder.config( - conf=SparkConf().setAll([(k, v) for k, v in self.spark_conf.items()]) + conf=SparkConf().setAll( + [(k, v) for k, v in self.spark_conf.items()] + ) ) .appName("pytest-pyspark-local-testing") .getOrCreate() @@ -83,14 +85,14 @@ def create_data_source( if not self.spark_session: self.spark_session = ( SparkSession.builder.config( - conf=SparkConf().setAll([(k, v) for k, v in self.spark_conf.items()]) + conf=SparkConf().setAll( + [(k, v) for k, v in self.spark_conf.items()] + ) ) .appName("pytest-pyspark-local-testing") .getOrCreate() ) - self.spark_session.createDataFrame(df).createOrReplaceTempView( - destination_name - ) + self.spark_session.createDataFrame(df).createOrReplaceTempView(destination_name) self.tables.append(destination_name) return SparkSource( From e01d0232ab43c91750f61a733c5cd796663f3383 Mon Sep 17 00:00:00 2001 From: Kevin Zhang Date: Tue, 1 Mar 2022 10:43:14 -0800 Subject: [PATCH 13/58] Clean up print statements for first review Signed-off-by: Kevin Zhang --- sdk/python/feast/feature_store.py | 4 ---- .../integration/feature_repos/repo_configuration.py | 2 +- .../integration/registration/test_universal_types.py | 12 +----------- 3 files changed, 2 insertions(+), 16 deletions(-) diff --git a/sdk/python/feast/feature_store.py b/sdk/python/feast/feature_store.py index 20302ecd8e..650623274f 
100644 --- a/sdk/python/feast/feature_store.py +++ b/sdk/python/feast/feature_store.py @@ -758,8 +758,6 @@ def get_historical_features( all_request_feature_views, all_on_demand_feature_views, ) = self._get_feature_views_to_use(features) - print("features and refs") - print(_feature_refs) # TODO(achal): _group_feature_refs returns the on demand feature views, but it's no passed into the provider. # This is a weird interface quirk - we should revisit the `get_historical_features` to # pass in the on demand feature views as well. @@ -798,8 +796,6 @@ def get_historical_features( # Drop refs that refer to RequestFeatureViews since they don't need to be fetched and # already exist in the entity_df _feature_refs = [ref for ref in _feature_refs if ref not in request_fv_refs] - print("Feature refssss") - print(_feature_refs) provider = self._get_provider() job = provider.get_historical_features( diff --git a/sdk/python/tests/integration/feature_repos/repo_configuration.py b/sdk/python/tests/integration/feature_repos/repo_configuration.py index 7964b287d7..486c806b9d 100644 --- a/sdk/python/tests/integration/feature_repos/repo_configuration.py +++ b/sdk/python/tests/integration/feature_repos/repo_configuration.py @@ -55,7 +55,7 @@ "type": "redis", "redis_type": "redis_cluster", # Redis Cluster Port Forwarding is setup in "pr_integration_tests.yaml" under "Setup Redis Cluster". - "connection_string": "127.0.0.1:7001,127.0.0.1:7002,127.0.0.1:7003", + "connection_string": "127.0.0.1:32001,127.0.0.1:32002,127.0.0.1:32003", } # FULL_REPO_CONFIGS contains the repo configurations (e.g. provider, offline store, diff --git a/sdk/python/tests/integration/registration/test_universal_types.py b/sdk/python/tests/integration/registration/test_universal_types.py index e6d819e484..7322cc01e4 100644 --- a/sdk/python/tests/integration/registration/test_universal_types.py +++ b/sdk/python/tests/integration/registration/test_universal_types.py @@ -181,15 +181,12 @@ def test_feature_get_historical_features_types_match(offline_types_test_fixtures ts - timedelta(hours=4), ts - timedelta(hours=2), ] - print("entity df") - print(entity_df) historical_features = fs.get_historical_features( entity_df=entity_df, features=features, ) - print(historical_features) + # Note: Pandas doesn't play well with nan values in ints. 
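The NaN note above reflects a real pandas constraint that matters when comparing retrieved feature types: introducing a missing value silently promotes an integer column to float64. A quick demonstration, not from the patch:

```python
# Not from the patch: the pandas behaviour the comment above refers to.
import numpy as np
import pandas as pd

s = pd.Series([1, 2, 3])
print(s.dtype)                  # int64

s_with_nan = pd.Series([1, 2, np.nan])
print(s_with_nan.dtype)         # float64 -- NaN forces the int column to floats

# pandas' nullable integer dtype keeps ints and missing values together.
print(pd.Series([1, 2, None], dtype="Int64").dtype)  # Int64
```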
BQ will also coerce to floats if there are NaNs historical_features_df = historical_features.to_df() - print(historical_features_df) if config.feature_is_list: assert_feature_list_types( @@ -288,8 +285,6 @@ def create_feature_view( value_type = ValueType.BOOL elif feature_dtype == "datetime": value_type = ValueType.UNIX_TIMESTAMP - print("value type") - print(value_type) return driver_feature_view(data_source, name=name, value_type=value_type,) @@ -352,7 +347,6 @@ def assert_expected_arrow_types( ): print("Asserting historical feature arrow types") historical_features_arrow = historical_features.to_arrow() - print(historical_features_arrow) feature_list_dtype_to_expected_historical_feature_arrow_type = { "int32": pa.types.is_int64, "int64": pa.types.is_int64, @@ -362,8 +356,6 @@ def assert_expected_arrow_types( "date": pa.types.is_date, "datetime": pa.types.is_timestamp, } - print("asdfas") - print(feature_dtype) arrow_type_checker = feature_list_dtype_to_expected_historical_feature_arrow_type[ feature_dtype ] @@ -371,8 +363,6 @@ def assert_expected_arrow_types( if feature_is_list: assert pa.types.is_list(pa_type) - print("Type") - print(pa_type.value_type) assert arrow_type_checker(pa_type.value_type) else: assert arrow_type_checker(pa_type) From 01ac14ac36581ce1e68349e6f123b0f27594db9c Mon Sep 17 00:00:00 2001 From: Kevin Zhang Date: Tue, 1 Mar 2022 10:47:56 -0800 Subject: [PATCH 14/58] Fix lint Signed-off-by: Kevin Zhang --- .../tests/integration/feature_repos/repo_configuration.py | 3 --- .../universal/data_sources/spark_data_source_creator.py | 2 +- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/sdk/python/tests/integration/feature_repos/repo_configuration.py b/sdk/python/tests/integration/feature_repos/repo_configuration.py index 486c806b9d..0459fc7d42 100644 --- a/sdk/python/tests/integration/feature_repos/repo_configuration.py +++ b/sdk/python/tests/integration/feature_repos/repo_configuration.py @@ -33,9 +33,6 @@ from tests.integration.feature_repos.universal.data_sources.snowflake import ( SnowflakeDataSourceCreator, ) -from tests.integration.feature_repos.universal.data_sources.spark_data_source_creator import ( - SparkDataSourceCreator, -) from tests.integration.feature_repos.universal.feature_views import ( conv_rate_plus_100_feature_view, create_conv_rate_request_data_source, diff --git a/sdk/python/tests/integration/feature_repos/universal/data_sources/spark_data_source_creator.py b/sdk/python/tests/integration/feature_repos/universal/data_sources/spark_data_source_creator.py index 0d995bdea7..97416d2959 100644 --- a/sdk/python/tests/integration/feature_repos/universal/data_sources/spark_data_source_creator.py +++ b/sdk/python/tests/integration/feature_repos/universal/data_sources/spark_data_source_creator.py @@ -1,5 +1,5 @@ import uuid -from typing import Dict, List, Optional, Tuple, Union +from typing import Dict, List import pandas as pd from pyspark import SparkConf From 26c8a01d79ede1f63c0db615c96844a09bd88f8a Mon Sep 17 00:00:00 2001 From: Kevin Zhang Date: Tue, 1 Mar 2022 10:50:16 -0800 Subject: [PATCH 15/58] Fix flake 8 lint tests Signed-off-by: Kevin Zhang --- .../infra/offline_stores/third_party/spark.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/sdk/python/feast/infra/offline_stores/third_party/spark.py b/sdk/python/feast/infra/offline_stores/third_party/spark.py index f1ba2b1a57..fd75cad668 100644 --- a/sdk/python/feast/infra/offline_stores/third_party/spark.py +++ 
b/sdk/python/feast/infra/offline_stores/third_party/spark.py @@ -1,6 +1,5 @@ import inspect from datetime import datetime -from ntpath import join from typing import Dict, List, Optional, Tuple, Union import numpy as np @@ -8,8 +7,7 @@ import pandas as pd import pyarrow import pyspark -from dateutil import parser -from feast_spark_offline_store.spark_source import SavedDatasetSparkStorage, SparkSource +from feast_spark_offline_store.spark_source import SparkSource from feast_spark_offline_store.spark_type_map import spark_schema_to_np_dtypes from pydantic import StrictStr from pyspark import SparkConf @@ -19,7 +17,7 @@ from feast import FeatureView, OnDemandFeatureView from feast.data_source import DataSource from feast.errors import InvalidEntityType -from feast.feature_view import DUMMY_ENTITY_ID, DUMMY_ENTITY_VAL, FeatureView +from feast.feature_view import DUMMY_ENTITY_ID, DUMMY_ENTITY_VAL from feast.infra.offline_stores import offline_utils from feast.infra.offline_stores.offline_store import ( OfflineStore, @@ -150,9 +148,11 @@ def get_historical_features( query_template=MULTIPLE_FEATURE_VIEW_POINT_IN_TIME_JOIN, full_feature_names=full_feature_names, ) - on_demand_feature_views = OnDemandFeatureView.get_requested_odfvs( - feature_refs=feature_refs, project=project, registry=registry - ) + + # TODO: Figure out what this is used for + # on_demand_feature_views = OnDemandFeatureView.get_requested_odfvs( + # feature_refs=feature_refs, project=project, registry=registry + # ) return SparkRetrievalJob( spark_session=spark_session, From 6f8ce3c395dbd5a71a941dc2659c501e113ac351 Mon Sep 17 00:00:00 2001 From: Kevin Zhang Date: Tue, 1 Mar 2022 16:26:11 -0800 Subject: [PATCH 16/58] Add warnings for alpha version release Signed-off-by: Kevin Zhang --- .../infra/offline_stores/third_party/spark.py | 19 ++++++++++++++++++- .../third_party/spark_source.py | 6 ++++++ 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/sdk/python/feast/infra/offline_stores/third_party/spark.py b/sdk/python/feast/infra/offline_stores/third_party/spark.py index fd75cad668..08abc9e79f 100644 --- a/sdk/python/feast/infra/offline_stores/third_party/spark.py +++ b/sdk/python/feast/infra/offline_stores/third_party/spark.py @@ -1,7 +1,7 @@ import inspect from datetime import datetime from typing import Dict, List, Optional, Tuple, Union - +import warnings import numpy as np import pandas import pandas as pd @@ -58,6 +58,12 @@ def pull_latest_from_table_or_query( assert isinstance(config.offline_store, SparkOfflineStoreConfig) assert isinstance(data_source, SparkSource) + warnings.warn( + "The spark offline store is an experimental feature in alpha development. " + "This API is unstable and it could and most probably will be changed in the future.", + RuntimeWarning, + ) + print("Pulling latest features from spark offline store") from_expression = data_source.get_table_query_string() @@ -106,6 +112,11 @@ def get_historical_features( full_feature_names: bool = False, ) -> RetrievalJob: assert isinstance(config.offline_store, SparkOfflineStoreConfig) + warnings.warn( + "The spark offline store is an experimental feature in alpha development. 
" + "This API is unstable and it could and most probably will be changed in the future.", + RuntimeWarning, + ) spark_session = get_spark_session_or_start_new_with_repoconfig( store_config=config.offline_store ) @@ -184,6 +195,12 @@ def pull_all_from_table_or_query( have all already been mapped to column names of the source table and those column names are the values passed into this function. """ + warnings.warn( + "The spark offline store is an experimental feature in alpha development. " + "This API is unstable and it could and most probably will be changed in the future.", + RuntimeWarning, + ) + return SparkOfflineStore.pull_latest_from_table_or_query( config=config, data_source=data_source, diff --git a/sdk/python/feast/infra/offline_stores/third_party/spark_source.py b/sdk/python/feast/infra/offline_stores/third_party/spark_source.py index ce5cb1b59e..0e335499b7 100644 --- a/sdk/python/feast/infra/offline_stores/third_party/spark_source.py +++ b/sdk/python/feast/infra/offline_stores/third_party/spark_source.py @@ -1,5 +1,6 @@ import pickle from typing import Any, Callable, Dict, Iterable, Optional, Tuple +import warnings from feast_spark_offline_store.spark_type_map import spark_to_feast_value_type from pyspark.sql.utils import AnalysisException @@ -36,6 +37,11 @@ def __init__( field_mapping, date_partition_column, ) + warnings.warn( + "The spark data source API is an experimental feature in alpha development. " + "This API is unstable and it could and most probably will be changed in the future.", + RuntimeWarning, + ) self._spark_options = SparkOptions( table=table, From 551eea1304d4ec4cdc83f8a1ebae05152a4137d0 Mon Sep 17 00:00:00 2001 From: Kevin Zhang Date: Tue, 1 Mar 2022 16:56:51 -0800 Subject: [PATCH 17/58] Format Signed-off-by: Kevin Zhang --- java/datatypes/src/main/proto/feast | 1 - .../main/proto/feast/core/DataFormat.proto | 56 ++++++ .../main/proto/feast/core/DataSource.proto | 177 ++++++++++++++++++ .../proto/feast/core/DatastoreTable.proto | 39 ++++ .../main/proto/feast/core/DynamoDBTable.proto | 31 +++ .../src/main/proto/feast/core/Entity.proto | 60 ++++++ .../src/main/proto/feast/core/Feature.proto | 36 ++++ .../proto/feast/core/FeatureService.proto | 48 +++++ .../main/proto/feast/core/FeatureTable.proto | 86 +++++++++ .../main/proto/feast/core/FeatureView.proto | 85 +++++++++ .../feast/core/FeatureViewProjection.proto | 25 +++ .../main/proto/feast/core/InfraObject.proto | 51 +++++ .../feast/core/OnDemandFeatureView.proto | 78 ++++++++ .../src/main/proto/feast/core/Registry.proto | 48 +++++ .../proto/feast/core/RequestFeatureView.proto | 43 +++++ .../main/proto/feast/core/SavedDataset.proto | 77 ++++++++ .../main/proto/feast/core/SqliteTable.proto | 31 +++ .../src/main/proto/feast/core/Store.proto | 130 +++++++++++++ .../proto/feast/core/ValidationProfile.proto | 48 +++++ .../proto/feast/serving/ServingService.proto | 134 +++++++++++++ .../feast/serving/TransformationService.proto | 67 +++++++ .../src/main/proto/feast/storage/Redis.proto | 34 ++++ .../grpc/health/v1/HealthService.proto | 24 +++ .../main/proto/feast/types/EntityKey.proto | 30 +++ .../src/main/proto/feast/types/Field.proto | 30 +++ .../src/main/proto/feast/types/Value.proto | 109 +++++++++++ .../infra/offline_stores/third_party/spark.py | 3 +- .../third_party/spark_source.py | 2 +- 28 files changed, 1580 insertions(+), 3 deletions(-) delete mode 120000 java/datatypes/src/main/proto/feast create mode 100644 java/datatypes/src/main/proto/feast/core/DataFormat.proto create mode 100644 
java/datatypes/src/main/proto/feast/core/DataSource.proto create mode 100644 java/datatypes/src/main/proto/feast/core/DatastoreTable.proto create mode 100644 java/datatypes/src/main/proto/feast/core/DynamoDBTable.proto create mode 100644 java/datatypes/src/main/proto/feast/core/Entity.proto create mode 100644 java/datatypes/src/main/proto/feast/core/Feature.proto create mode 100644 java/datatypes/src/main/proto/feast/core/FeatureService.proto create mode 100644 java/datatypes/src/main/proto/feast/core/FeatureTable.proto create mode 100644 java/datatypes/src/main/proto/feast/core/FeatureView.proto create mode 100644 java/datatypes/src/main/proto/feast/core/FeatureViewProjection.proto create mode 100644 java/datatypes/src/main/proto/feast/core/InfraObject.proto create mode 100644 java/datatypes/src/main/proto/feast/core/OnDemandFeatureView.proto create mode 100644 java/datatypes/src/main/proto/feast/core/Registry.proto create mode 100644 java/datatypes/src/main/proto/feast/core/RequestFeatureView.proto create mode 100644 java/datatypes/src/main/proto/feast/core/SavedDataset.proto create mode 100644 java/datatypes/src/main/proto/feast/core/SqliteTable.proto create mode 100644 java/datatypes/src/main/proto/feast/core/Store.proto create mode 100644 java/datatypes/src/main/proto/feast/core/ValidationProfile.proto create mode 100644 java/datatypes/src/main/proto/feast/serving/ServingService.proto create mode 100644 java/datatypes/src/main/proto/feast/serving/TransformationService.proto create mode 100644 java/datatypes/src/main/proto/feast/storage/Redis.proto create mode 100644 java/datatypes/src/main/proto/feast/third_party/grpc/health/v1/HealthService.proto create mode 100644 java/datatypes/src/main/proto/feast/types/EntityKey.proto create mode 100644 java/datatypes/src/main/proto/feast/types/Field.proto create mode 100644 java/datatypes/src/main/proto/feast/types/Value.proto diff --git a/java/datatypes/src/main/proto/feast b/java/datatypes/src/main/proto/feast deleted file mode 120000 index 463e4045de..0000000000 --- a/java/datatypes/src/main/proto/feast +++ /dev/null @@ -1 +0,0 @@ -../../../../../protos/feast \ No newline at end of file diff --git a/java/datatypes/src/main/proto/feast/core/DataFormat.proto b/java/datatypes/src/main/proto/feast/core/DataFormat.proto new file mode 100644 index 0000000000..2926e08c63 --- /dev/null +++ b/java/datatypes/src/main/proto/feast/core/DataFormat.proto @@ -0,0 +1,56 @@ +// +// Copyright 2020 The Feast Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + + +syntax = "proto3"; +package feast.core; + +option go_package = "github.com/feast-dev/feast/sdk/go/protos/feast/core"; +option java_outer_classname = "DataFormatProto"; +option java_package = "feast.proto.core"; + +// Defines the file format encoding the features/entity data in files +message FileFormat { + // Defines options for the Parquet data format + message ParquetFormat {} + + oneof format { + ParquetFormat parquet_format = 1; + } +} + +// Defines the data format encoding features/entity data in data streams +message StreamFormat { + // Defines options for the protobuf data format + message ProtoFormat { + // Classpath to the generated Java Protobuf class that can be used to decode + // Feature data from the obtained stream message + string class_path = 1; + } + + // Defines options for the avro data format + message AvroFormat { + // Optional if used in a File DataSource as schema is embedded in avro file. + // Specifies the schema of the Avro message as JSON string. + string schema_json = 1; + } + + // Specifies the data format and format specific options + oneof format { + AvroFormat avro_format = 1; + ProtoFormat proto_format = 2; + } +} diff --git a/java/datatypes/src/main/proto/feast/core/DataSource.proto b/java/datatypes/src/main/proto/feast/core/DataSource.proto new file mode 100644 index 0000000000..41bba6443f --- /dev/null +++ b/java/datatypes/src/main/proto/feast/core/DataSource.proto @@ -0,0 +1,177 @@ +// +// Copyright 2020 The Feast Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + + +syntax = "proto3"; +package feast.core; + +option go_package = "github.com/feast-dev/feast/sdk/go/protos/feast/core"; +option java_outer_classname = "DataSourceProto"; +option java_package = "feast.proto.core"; + +import "feast/core/DataFormat.proto"; +import "feast/types/Value.proto"; + +// Defines a Data Source that can be used source Feature data +message DataSource { + // Field indexes should *not* be reused. Not sure if fields 6-10 were used previously or not, + // but they are going to be reserved for backwards compatibility. + reserved 6 to 10; + + // Type of Data Source. + // Next available id: 9 + enum SourceType { + INVALID = 0; + BATCH_FILE = 1; + BATCH_SNOWFLAKE = 8; + BATCH_BIGQUERY = 2; + BATCH_REDSHIFT = 5; + STREAM_KAFKA = 3; + STREAM_KINESIS = 4; + CUSTOM_SOURCE = 6; + REQUEST_SOURCE = 7; + + } + SourceType type = 1; + + // Defines mapping between fields in the sourced data + // and fields in parent FeatureTable. + map field_mapping = 2; + + // Must specify event timestamp column name + string event_timestamp_column = 3; + + // (Optional) Specify partition column + // useful for file sources + string date_partition_column = 4; + + // Must specify creation timestamp column name + string created_timestamp_column = 5; + + // This is an internal field that is represents the python class for the data source object a proto object represents. + // This should be set by feast, and not by users. 
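For readers unfamiliar with the generated classes, the FileFormat oneof above is driven from Python roughly as sketched below. The feast.protos import path is assumed by analogy with the DataSource_pb2 imports used elsewhere in this series; the rest is standard protobuf usage.

```python
# Hypothetical sketch: exercising the FileFormat oneof defined above from Python.
from feast.protos.feast.core.DataFormat_pb2 import FileFormat

file_format = FileFormat(parquet_format=FileFormat.ParquetFormat())
print(file_format.WhichOneof("format"))  # "parquet_format"
```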
+ string data_source_class_type = 17; + + // Defines options for DataSource that sources features from a file + message FileOptions { + FileFormat file_format = 1; + + // Target URL of file to retrieve and source features from. + // s3://path/to/file for AWS S3 storage + // gs://path/to/file for GCP GCS storage + // file:///path/to/file for local storage + string file_url = 2; + + // override AWS S3 storage endpoint with custom S3 endpoint + string s3_endpoint_override = 3; + } + + // Defines options for DataSource that sources features from a BigQuery Query + message BigQueryOptions { + // Full table reference in the form of [project:dataset.table] + string table_ref = 1; + + // SQL query that returns a table containing feature data. Must contain an event_timestamp column, and respective + // entity columns + string query = 2; + } + + // Defines options for DataSource that sources features from Kafka messages. + // Each message should be a Protobuf that can be decoded with the generated + // Java Protobuf class at the given class path + message KafkaOptions { + // Comma separated list of Kafka bootstrap servers. Used for feature tables without a defined source host[:port]] + string bootstrap_servers = 1; + + // Kafka topic to collect feature data from. + string topic = 2; + + // Defines the stream data format encoding feature/entity data in Kafka messages. + StreamFormat message_format = 3; + } + + // Defines options for DataSource that sources features from Kinesis records. + // Each record should be a Protobuf that can be decoded with the generated + // Java Protobuf class at the given class path + message KinesisOptions { + // AWS region of the Kinesis stream + string region = 1; + + // Name of the Kinesis stream to obtain feature data from. + string stream_name = 2; + + // Defines the data format encoding the feature/entity data in Kinesis records. + // Kinesis Data Sources support Avro and Proto as data formats. + StreamFormat record_format = 3; + } + + // Defines options for DataSource that sources features from a Redshift Query + message RedshiftOptions { + // Redshift table name + string table = 1; + + // SQL query that returns a table containing feature data. Must contain an event_timestamp column, and respective + // entity columns + string query = 2; + + // Redshift schema name + string schema = 3; + } + + // Defines options for DataSource that sources features from a Snowflake Query + message SnowflakeOptions { + // Snowflake table name + string table = 1; + + // SQL query that returns a table containing feature data. Must contain an event_timestamp column, and respective + // entity columns + string query = 2; + + // Snowflake schema name + string schema = 3; + + // Snowflake schema name + string database = 4; + } + + // Defines configuration for custom third-party data sources. + message CustomSourceOptions { + // Serialized configuration information for the data source. The implementer of the custom data source is + // responsible for serializing and deserializing data from bytes + bytes configuration = 1; + } + + // Defines options for DataSource that sources features from request data + message RequestDataOptions { + // Name of the request data source + string name = 1; + + // Mapping of feature name to type + map schema = 2; + } + + // DataSource options. 
+ oneof options { + FileOptions file_options = 11; + BigQueryOptions bigquery_options = 12; + KafkaOptions kafka_options = 13; + KinesisOptions kinesis_options = 14; + RedshiftOptions redshift_options = 15; + RequestDataOptions request_data_options = 18; + CustomSourceOptions custom_options = 16; + SnowflakeOptions snowflake_options = 19; + } +} diff --git a/java/datatypes/src/main/proto/feast/core/DatastoreTable.proto b/java/datatypes/src/main/proto/feast/core/DatastoreTable.proto new file mode 100644 index 0000000000..15720ad809 --- /dev/null +++ b/java/datatypes/src/main/proto/feast/core/DatastoreTable.proto @@ -0,0 +1,39 @@ +// +// * Copyright 2021 The Feast Authors +// * +// * Licensed under the Apache License, Version 2.0 (the "License"); +// * you may not use this file except in compliance with the License. +// * You may obtain a copy of the License at +// * +// * https://www.apache.org/licenses/LICENSE-2.0 +// * +// * Unless required by applicable law or agreed to in writing, software +// * distributed under the License is distributed on an "AS IS" BASIS, +// * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// * See the License for the specific language governing permissions and +// * limitations under the License. +// + +syntax = "proto3"; + +package feast.core; +option java_package = "feast.proto.core"; +option java_outer_classname = "DatastoreTableProto"; +option go_package = "github.com/feast-dev/feast/sdk/go/protos/feast/core"; + +import "google/protobuf/wrappers.proto"; + +// Represents a Datastore table +message DatastoreTable { + // Feast project of the table + string project = 1; + + // Name of the table + string name = 2; + + // GCP project id + google.protobuf.StringValue project_id = 3; + + // Datastore namespace + google.protobuf.StringValue namespace = 4; +} \ No newline at end of file diff --git a/java/datatypes/src/main/proto/feast/core/DynamoDBTable.proto b/java/datatypes/src/main/proto/feast/core/DynamoDBTable.proto new file mode 100644 index 0000000000..1ab77febbd --- /dev/null +++ b/java/datatypes/src/main/proto/feast/core/DynamoDBTable.proto @@ -0,0 +1,31 @@ +// +// * Copyright 2021 The Feast Authors +// * +// * Licensed under the Apache License, Version 2.0 (the "License"); +// * you may not use this file except in compliance with the License. +// * You may obtain a copy of the License at +// * +// * https://www.apache.org/licenses/LICENSE-2.0 +// * +// * Unless required by applicable law or agreed to in writing, software +// * distributed under the License is distributed on an "AS IS" BASIS, +// * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// * See the License for the specific language governing permissions and +// * limitations under the License. 
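CustomSourceOptions above is the hook the new Spark source relies on: its options are pickled into the configuration bytes, and data_source_class_type records the Python class used to rebuild it. A rough sketch of that wiring follows; the SimpleNamespace stand-in and all values are invented, and this is not the patch's own serialization code (its to_proto side is still a stub).

```python
# Rough sketch only. Field names come from the proto above and the import
# matches spark_source.py in this series; values are invented.
import pickle
from types import SimpleNamespace

from feast.protos.feast.core.DataSource_pb2 import DataSource as DataSourceProto

proto = DataSourceProto()
proto.type = DataSourceProto.CUSTOM_SOURCE
proto.event_timestamp_column = "event_timestamp"
proto.data_source_class_type = "feast_spark_offline_store.spark_source.SparkSource"
# Whatever object the producer pickles here is what from_proto later unpickles
# and reads .table / .query from.
proto.custom_options.configuration = pickle.dumps(
    SimpleNamespace(table="driver_stats", query="")
)

restored = pickle.loads(proto.custom_options.configuration)
print(proto.WhichOneof("options"), restored.table)  # custom_options driver_stats
```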
+// + +syntax = "proto3"; + +package feast.core; +option java_package = "feast.proto.core"; +option java_outer_classname = "DynamoDBTableProto"; +option go_package = "github.com/feast-dev/feast/sdk/go/protos/feast/core"; + +// Represents a DynamoDB table +message DynamoDBTable { + // Name of the table + string name = 1; + + // Region of the table + string region = 2; +} \ No newline at end of file diff --git a/java/datatypes/src/main/proto/feast/core/Entity.proto b/java/datatypes/src/main/proto/feast/core/Entity.proto new file mode 100644 index 0000000000..cd54c64922 --- /dev/null +++ b/java/datatypes/src/main/proto/feast/core/Entity.proto @@ -0,0 +1,60 @@ +// +// * Copyright 2020 The Feast Authors +// * +// * Licensed under the Apache License, Version 2.0 (the "License"); +// * you may not use this file except in compliance with the License. +// * You may obtain a copy of the License at +// * +// * https://www.apache.org/licenses/LICENSE-2.0 +// * +// * Unless required by applicable law or agreed to in writing, software +// * distributed under the License is distributed on an "AS IS" BASIS, +// * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// * See the License for the specific language governing permissions and +// * limitations under the License. +// + +syntax = "proto3"; + +package feast.core; +option java_package = "feast.proto.core"; +option java_outer_classname = "EntityProto"; +option go_package = "github.com/feast-dev/feast/sdk/go/protos/feast/core"; + +import "feast/types/Value.proto"; +import "google/protobuf/timestamp.proto"; + +message Entity { + // User-specified specifications of this entity. + EntitySpecV2 spec = 1; + // System-populated metadata for this entity. + EntityMeta meta = 2; +} + +message EntitySpecV2 { + // Name of the entity. + string name = 1; + + // Name of Feast project that this feature table belongs to. + string project = 9; + + // Type of the entity. + feast.types.ValueType.Enum value_type = 2; + + // Description of the entity. + string description = 3; + + // Join key for the entity (i.e. name of the column the entity maps to). + string join_key = 4; + + // User defined metadata + map tags = 8; + + // Owner of the entity. + string owner = 10; +} + +message EntityMeta { + google.protobuf.Timestamp created_timestamp = 1; + google.protobuf.Timestamp last_updated_timestamp = 2; +} diff --git a/java/datatypes/src/main/proto/feast/core/Feature.proto b/java/datatypes/src/main/proto/feast/core/Feature.proto new file mode 100644 index 0000000000..ea0d340a00 --- /dev/null +++ b/java/datatypes/src/main/proto/feast/core/Feature.proto @@ -0,0 +1,36 @@ +// +// Copyright 2020 The Feast Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +syntax = "proto3"; +package feast.core; + + +option go_package = "github.com/feast-dev/feast/sdk/go/protos/feast/core"; +option java_outer_classname = "FeatureProto"; +option java_package = "feast.proto.core"; + +import "feast/types/Value.proto"; + +message FeatureSpecV2 { + // Name of the feature. Not updatable. + string name = 1; + + // Value type of the feature. Not updatable. + feast.types.ValueType.Enum value_type = 2; + + // Labels for user defined metadata on a feature + map labels = 3; +} diff --git a/java/datatypes/src/main/proto/feast/core/FeatureService.proto b/java/datatypes/src/main/proto/feast/core/FeatureService.proto new file mode 100644 index 0000000000..4aaa0d5f06 --- /dev/null +++ b/java/datatypes/src/main/proto/feast/core/FeatureService.proto @@ -0,0 +1,48 @@ +syntax = "proto3"; +package feast.core; + +option go_package = "github.com/feast-dev/feast/sdk/go/protos/feast/core"; +option java_outer_classname = "FeatureServiceProto"; +option java_package = "feast.proto.core"; + +import "google/protobuf/timestamp.proto"; +import "feast/core/FeatureViewProjection.proto"; + +message FeatureService { + // User-specified specifications of this feature service. + FeatureServiceSpec spec = 1; + + // System-populated metadata for this feature service. + FeatureServiceMeta meta = 2; +} + +message FeatureServiceSpec { + // Name of the Feature Service. Must be unique. Not updated. + string name = 1; + + // Name of Feast project that this Feature Service belongs to. + string project = 2; + + // Represents a projection that's to be applied on top of the FeatureView. + // Contains data such as the features to use from a FeatureView. + repeated FeatureViewProjection features = 3; + + // User defined metadata + map tags = 4; + + // Description of the feature service. + string description = 5; + + // Owner of the feature service. + string owner = 6; +} + + +message FeatureServiceMeta { + // Time where this Feature Service is created + google.protobuf.Timestamp created_timestamp = 1; + + // Time where this Feature Service is last updated + google.protobuf.Timestamp last_updated_timestamp = 2; + +} diff --git a/java/datatypes/src/main/proto/feast/core/FeatureTable.proto b/java/datatypes/src/main/proto/feast/core/FeatureTable.proto new file mode 100644 index 0000000000..661f4eecfc --- /dev/null +++ b/java/datatypes/src/main/proto/feast/core/FeatureTable.proto @@ -0,0 +1,86 @@ +// +// Copyright 2020 The Feast Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + + +syntax = "proto3"; +package feast.core; + + +option go_package = "github.com/feast-dev/feast/sdk/go/protos/feast/core"; +option java_outer_classname = "FeatureTableProto"; +option java_package = "feast.proto.core"; + +import "google/protobuf/duration.proto"; +import "google/protobuf/timestamp.proto"; +import "feast/core/DataSource.proto"; +import "feast/core/Feature.proto"; + +message FeatureTable { + // User-specified specifications of this feature table. 
+ FeatureTableSpec spec = 1; + + // System-populated metadata for this feature table. + FeatureTableMeta meta = 2; +} + +message FeatureTableSpec { + // Name of the feature table. Must be unique. Not updated. + string name = 1; + + // Name of Feast project that this feature table belongs to. + string project = 9; + + // List names of entities to associate with the Features defined in this + // Feature Table. Not updatable. + repeated string entities = 3; + + // List of features specifications for each feature defined with this feature table. + repeated FeatureSpecV2 features = 4; + + // User defined metadata + map labels = 5; + + // Features in this feature table can only be retrieved from online serving + // younger than max age. Age is measured as the duration of time between + // the feature's event timestamp and when the feature is retrieved + // Feature values outside max age will be returned as unset values and indicated to end user + google.protobuf.Duration max_age = 6; + + // Batch/Offline DataSource to source batch/offline feature data. + // Only batch DataSource can be specified + // (ie source type should start with 'BATCH_') + DataSource batch_source = 7; + + // Stream/Online DataSource to source stream/online feature data. + // Only stream DataSource can be specified + // (ie source type should start with 'STREAM_') + DataSource stream_source = 8; +} + +message FeatureTableMeta { + // Time where this Feature Table is created + google.protobuf.Timestamp created_timestamp = 1; + + // Time where this Feature Table is last updated + google.protobuf.Timestamp last_updated_timestamp = 2; + + // Auto incrementing revision no. of this Feature Table + int64 revision = 3; + + // Hash entities, features, batch_source and stream_source to inform JobService if + // jobs should be restarted should hash change + string hash = 4; +} diff --git a/java/datatypes/src/main/proto/feast/core/FeatureView.proto b/java/datatypes/src/main/proto/feast/core/FeatureView.proto new file mode 100644 index 0000000000..6edba9f7fe --- /dev/null +++ b/java/datatypes/src/main/proto/feast/core/FeatureView.proto @@ -0,0 +1,85 @@ +// +// Copyright 2020 The Feast Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + + +syntax = "proto3"; +package feast.core; + +option go_package = "github.com/feast-dev/feast/sdk/go/protos/feast/core"; +option java_outer_classname = "FeatureViewProto"; +option java_package = "feast.proto.core"; + +import "google/protobuf/duration.proto"; +import "google/protobuf/timestamp.proto"; +import "feast/core/DataSource.proto"; +import "feast/core/Feature.proto"; + +message FeatureView { + // User-specified specifications of this feature view. + FeatureViewSpec spec = 1; + + // System-populated metadata for this feature view. + FeatureViewMeta meta = 2; +} + +// TODO(adchia): refactor common fields from this and ODFV into separate metadata proto +message FeatureViewSpec { + // Name of the feature view. Must be unique. Not updated. 
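The max_age comment above (and the equivalent ttl on FeatureView) describes a simple freshness rule. Spelled out for a single row, purely as an illustration:

```python
# Illustrative only: the freshness rule the max_age/ttl comments describe.
from datetime import datetime, timedelta, timezone

max_age = timedelta(hours=2)
event_timestamp = datetime(2022, 3, 1, 10, 0, tzinfo=timezone.utc)
retrieval_time = datetime(2022, 3, 1, 11, 30, tzinfo=timezone.utc)

# A value is served only if it is younger than max_age at retrieval time;
# otherwise it comes back as an unset value.
is_fresh = (retrieval_time - event_timestamp) <= max_age
print(is_fresh)  # True
```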
+ string name = 1; + + // Name of Feast project that this feature view belongs to. + string project = 2; + + // List names of entities to associate with the Features defined in this + // Feature View. Not updatable. + repeated string entities = 3; + + // List of features specifications for each feature defined with this feature view. + repeated FeatureSpecV2 features = 4; + + // User defined metadata + map tags = 5; + + // Features in this feature view can only be retrieved from online serving + // younger than ttl. Ttl is measured as the duration of time between + // the feature's event timestamp and when the feature is retrieved + // Feature values outside ttl will be returned as unset values and indicated to end user + google.protobuf.Duration ttl = 6; + + // Batch/Offline DataSource where this view can retrieve offline feature data. + DataSource batch_source = 7; + // Streaming DataSource from where this view can consume "online" feature data. + DataSource stream_source = 9; + + // Whether these features should be served online or not + bool online = 8; +} + +message FeatureViewMeta { + // Time where this Feature View is created + google.protobuf.Timestamp created_timestamp = 1; + + // Time where this Feature View is last updated + google.protobuf.Timestamp last_updated_timestamp = 2; + + // List of pairs (start_time, end_time) for which this feature view has been materialized. + repeated MaterializationInterval materialization_intervals = 3; +} + +message MaterializationInterval { + google.protobuf.Timestamp start_time = 1; + google.protobuf.Timestamp end_time = 2; +} diff --git a/java/datatypes/src/main/proto/feast/core/FeatureViewProjection.proto b/java/datatypes/src/main/proto/feast/core/FeatureViewProjection.proto new file mode 100644 index 0000000000..e81d8dad01 --- /dev/null +++ b/java/datatypes/src/main/proto/feast/core/FeatureViewProjection.proto @@ -0,0 +1,25 @@ +syntax = "proto3"; +package feast.core; + +option go_package = "github.com/feast-dev/feast/sdk/go/protos/feast/core"; +option java_outer_classname = "FeatureReferenceProto"; +option java_package = "feast.proto.core"; + +import "feast/core/Feature.proto"; + + +// A projection to be applied on top of a FeatureView. +// Contains the modifications to a FeatureView such as the features subset to use. +message FeatureViewProjection { + // The feature view name + string feature_view_name = 1; + + // Alias for feature view name + string feature_view_name_alias = 3; + + // The features of the feature view that are a part of the feature reference. + repeated FeatureSpecV2 feature_columns = 2; + + // Map for entity join_key overrides of feature data entity join_key to entity data join_key + map join_key_map = 4; +} diff --git a/java/datatypes/src/main/proto/feast/core/InfraObject.proto b/java/datatypes/src/main/proto/feast/core/InfraObject.proto new file mode 100644 index 0000000000..863f1b64da --- /dev/null +++ b/java/datatypes/src/main/proto/feast/core/InfraObject.proto @@ -0,0 +1,51 @@ +// +// * Copyright 2021 The Feast Authors +// * +// * Licensed under the Apache License, Version 2.0 (the "License"); +// * you may not use this file except in compliance with the License. +// * You may obtain a copy of the License at +// * +// * https://www.apache.org/licenses/LICENSE-2.0 +// * +// * Unless required by applicable law or agreed to in writing, software +// * distributed under the License is distributed on an "AS IS" BASIS, +// * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
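Aside on how these specs get populated: the Python SDK fills FeatureViewSpec (and, when features are selected through a feature service, FeatureViewProjection) from repo definitions. The sketch below is illustrative only and not part of this patch; the source path and feature names are invented, and the constructor signatures follow the Feast Python API of roughly this vintage.

from datetime import timedelta

from feast import Entity, Feature, FeatureService, FeatureView, FileSource, ValueType

# Invented parquet source; only here to show which FeatureViewSpec fields get filled in.
driver_stats_source = FileSource(
    path="data/driver_stats.parquet",
    event_timestamp_column="event_timestamp",
    created_timestamp_column="created",
)

driver = Entity(name="driver_id", value_type=ValueType.INT64, description="driver id")

driver_hourly_stats_view = FeatureView(
    name="driver_hourly_stats",            # -> FeatureViewSpec.name
    entities=["driver_id"],                # -> FeatureViewSpec.entities
    ttl=timedelta(days=1),                 # -> FeatureViewSpec.ttl
    features=[                             # -> FeatureViewSpec.features (FeatureSpecV2)
        Feature(name="conv_rate", dtype=ValueType.FLOAT),
        Feature(name="acc_rate", dtype=ValueType.FLOAT),
    ],
    online=True,                           # -> FeatureViewSpec.online
    batch_source=driver_stats_source,      # -> FeatureViewSpec.batch_source
    tags={"team": "driver_performance"},   # -> FeatureViewSpec.tags
)

# Selecting a subset of features through a feature service yields a FeatureViewProjection
# (feature_view_name plus the chosen feature_columns) inside FeatureServiceSpec.features.
driver_activity = FeatureService(
    name="driver_activity",
    features=[driver_hourly_stats_view[["conv_rate"]]],
)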
+// * See the License for the specific language governing permissions and +// * limitations under the License. +// + +syntax = "proto3"; + +package feast.core; +option java_package = "feast.proto.core"; +option java_outer_classname = "InfraObjectProto"; +option go_package = "github.com/feast-dev/feast/sdk/go/protos/feast/core"; + +import "feast/core/DatastoreTable.proto"; +import "feast/core/DynamoDBTable.proto"; +import "feast/core/SqliteTable.proto"; + +// Represents a set of infrastructure objects managed by Feast +message Infra { + // List of infrastructure objects managed by Feast + repeated InfraObject infra_objects = 1; +} + +// Represents a single infrastructure object managed by Feast +message InfraObject { + // Represents the Python class for the infrastructure object + string infra_object_class_type = 1; + + // The infrastructure object + oneof infra_object { + DynamoDBTable dynamodb_table = 2; + DatastoreTable datastore_table = 3; + SqliteTable sqlite_table = 4; + CustomInfra custom_infra = 100; + } + + // Allows for custom infra objects to be added + message CustomInfra { + bytes field = 1; + } +} \ No newline at end of file diff --git a/java/datatypes/src/main/proto/feast/core/OnDemandFeatureView.proto b/java/datatypes/src/main/proto/feast/core/OnDemandFeatureView.proto new file mode 100644 index 0000000000..58feff5bfd --- /dev/null +++ b/java/datatypes/src/main/proto/feast/core/OnDemandFeatureView.proto @@ -0,0 +1,78 @@ +// +// Copyright 2020 The Feast Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + + +syntax = "proto3"; +package feast.core; + +option go_package = "github.com/feast-dev/feast/sdk/go/protos/feast/core"; +option java_outer_classname = "OnDemandFeatureViewProto"; +option java_package = "feast.proto.core"; + +import "google/protobuf/timestamp.proto"; +import "feast/core/FeatureView.proto"; +import "feast/core/FeatureViewProjection.proto"; +import "feast/core/Feature.proto"; +import "feast/core/DataSource.proto"; + +message OnDemandFeatureView { + // User-specified specifications of this feature view. + OnDemandFeatureViewSpec spec = 1; + OnDemandFeatureViewMeta meta = 2; +} + +message OnDemandFeatureViewSpec { + // Name of the feature view. Must be unique. Not updated. + string name = 1; + + // Name of Feast project that this feature view belongs to. + string project = 2; + + // List of features specifications for each feature defined with this feature view. + repeated FeatureSpecV2 features = 3; + + // Map of inputs for this feature view. 
+ map inputs = 4; + + UserDefinedFunction user_defined_function = 5; + + +} + +message OnDemandFeatureViewMeta { + // Time where this Feature View is created + google.protobuf.Timestamp created_timestamp = 1; + + // Time where this Feature View is last updated + google.protobuf.Timestamp last_updated_timestamp = 2; +} + +message OnDemandInput { + oneof input { + FeatureView feature_view = 1; + FeatureViewProjection feature_view_projection = 3; + DataSource request_data_source = 2; + } +} + +// Serialized representation of python function. +message UserDefinedFunction { + // The function name + string name = 1; + + // The python-syntax function body (serialized by dill) + bytes body = 2; +} diff --git a/java/datatypes/src/main/proto/feast/core/Registry.proto b/java/datatypes/src/main/proto/feast/core/Registry.proto new file mode 100644 index 0000000000..3deeb97238 --- /dev/null +++ b/java/datatypes/src/main/proto/feast/core/Registry.proto @@ -0,0 +1,48 @@ +// +// * Copyright 2020 The Feast Authors +// * +// * Licensed under the Apache License, Version 2.0 (the "License"); +// * you may not use this file except in compliance with the License. +// * You may obtain a copy of the License at +// * +// * https://www.apache.org/licenses/LICENSE-2.0 +// * +// * Unless required by applicable law or agreed to in writing, software +// * distributed under the License is distributed on an "AS IS" BASIS, +// * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// * See the License for the specific language governing permissions and +// * limitations under the License. +// + +syntax = "proto3"; + +package feast.core; +option java_package = "feast.proto.core"; +option java_outer_classname = "RegistryProto"; +option go_package = "github.com/feast-dev/feast/sdk/go/protos/feast/core"; + +import "feast/core/Entity.proto"; +import "feast/core/FeatureService.proto"; +import "feast/core/FeatureTable.proto"; +import "feast/core/FeatureView.proto"; +import "feast/core/InfraObject.proto"; +import "feast/core/OnDemandFeatureView.proto"; +import "feast/core/RequestFeatureView.proto"; +import "feast/core/SavedDataset.proto"; +import "google/protobuf/timestamp.proto"; + +message Registry { + repeated Entity entities = 1; + repeated FeatureTable feature_tables = 2; + repeated FeatureView feature_views = 6; + repeated OnDemandFeatureView on_demand_feature_views = 8; + repeated RequestFeatureView request_feature_views = 9; + repeated FeatureService feature_services = 7; + repeated SavedDataset saved_datasets = 11; + Infra infra = 10; + + string registry_schema_version = 3; // to support migrations; incremented when schema is changed + string version_id = 4; // version id, random string generated on each update of the data; now used only for debugging purposes + google.protobuf.Timestamp last_updated = 5; + +} diff --git a/java/datatypes/src/main/proto/feast/core/RequestFeatureView.proto b/java/datatypes/src/main/proto/feast/core/RequestFeatureView.proto new file mode 100644 index 0000000000..c9ee540e6f --- /dev/null +++ b/java/datatypes/src/main/proto/feast/core/RequestFeatureView.proto @@ -0,0 +1,43 @@ +// +// Copyright 2021 The Feast Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
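Aside: the body field of UserDefinedFunction above holds the on-demand transformation as a dill pickle. A minimal round trip could look like the following; the generated-proto import path is an assumption about how the SDK packages its protos, and the transformation itself is a made-up example.

import dill
import pandas as pd

# Import path assumed from the SDK's generated-proto layout.
from feast.protos.feast.core.OnDemandFeatureView_pb2 import UserDefinedFunction


def conv_rate_plus_100(features_df: pd.DataFrame) -> pd.DataFrame:
    # Example transformation over retrieved features.
    out = pd.DataFrame()
    out["conv_rate_plus_100"] = features_df["conv_rate"] + 100
    return out


# Pickle the function with dill, which is what UserDefinedFunction.body stores.
udf_proto = UserDefinedFunction(
    name=conv_rate_plus_100.__name__,
    body=dill.dumps(conv_rate_plus_100, recurse=True),
)

# Unpickling gives back a callable that can be applied at request time.
restored = dill.loads(udf_proto.body)
print(restored(pd.DataFrame({"conv_rate": [0.05]})))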
+// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + + +syntax = "proto3"; +package feast.core; + +option go_package = "github.com/feast-dev/feast/sdk/go/protos/feast/core"; +option java_outer_classname = "RequestFeatureViewProto"; +option java_package = "feast.proto.core"; + +import "feast/core/FeatureView.proto"; +import "feast/core/Feature.proto"; +import "feast/core/DataSource.proto"; + +message RequestFeatureView { + // User-specified specifications of this feature view. + RequestFeatureViewSpec spec = 1; +} + +message RequestFeatureViewSpec { + // Name of the feature view. Must be unique. Not updated. + string name = 1; + + // Name of Feast project that this feature view belongs to. + string project = 2; + + // Request data which contains the underlying data schema and list of associated features + DataSource request_data_source = 3; +} diff --git a/java/datatypes/src/main/proto/feast/core/SavedDataset.proto b/java/datatypes/src/main/proto/feast/core/SavedDataset.proto new file mode 100644 index 0000000000..ebd2e56d35 --- /dev/null +++ b/java/datatypes/src/main/proto/feast/core/SavedDataset.proto @@ -0,0 +1,77 @@ +// +// Copyright 2021 The Feast Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + + +syntax = "proto3"; + +package feast.core; +option java_package = "feast.proto.core"; +option java_outer_classname = "SavedDatasetProto"; +option go_package = "github.com/feast-dev/feast/sdk/go/protos/feast/core"; + +import "google/protobuf/timestamp.proto"; +import "feast/core/FeatureViewProjection.proto"; +import "feast/core/DataSource.proto"; + +message SavedDatasetSpec { + // Name of the dataset. Must be unique since it's possible to overwrite dataset by name + string name = 1; + + // Name of Feast project that this Dataset belongs to. 
+ string project = 2; + + // list of feature references with format ":" + repeated string features = 3; + + // entity columns + request columns from all feature views used during retrieval + repeated string join_keys = 4; + + // Whether full feature names are used in stored data + bool full_feature_names = 5; + + SavedDatasetStorage storage = 6; + + // User defined metadata + map tags = 7; +} + +message SavedDatasetStorage { + oneof kind { + DataSource.FileOptions file_storage = 4; + DataSource.BigQueryOptions bigquery_storage = 5; + DataSource.RedshiftOptions redshift_storage = 6; + DataSource.SnowflakeOptions snowflake_storage = 7; + } +} + +message SavedDatasetMeta { + // Time when this saved dataset is created + google.protobuf.Timestamp created_timestamp = 1; + + // Time when this saved dataset is last updated + google.protobuf.Timestamp last_updated_timestamp = 2; + + // Min timestamp in the dataset (needed for retrieval) + google.protobuf.Timestamp min_event_timestamp = 3; + + // Max timestamp in the dataset (needed for retrieval) + google.protobuf.Timestamp max_event_timestamp = 4; +} + +message SavedDataset { + SavedDatasetSpec spec = 1; + SavedDatasetMeta meta = 2; +} diff --git a/java/datatypes/src/main/proto/feast/core/SqliteTable.proto b/java/datatypes/src/main/proto/feast/core/SqliteTable.proto new file mode 100644 index 0000000000..1732931b8f --- /dev/null +++ b/java/datatypes/src/main/proto/feast/core/SqliteTable.proto @@ -0,0 +1,31 @@ +// +// * Copyright 2021 The Feast Authors +// * +// * Licensed under the Apache License, Version 2.0 (the "License"); +// * you may not use this file except in compliance with the License. +// * You may obtain a copy of the License at +// * +// * https://www.apache.org/licenses/LICENSE-2.0 +// * +// * Unless required by applicable law or agreed to in writing, software +// * distributed under the License is distributed on an "AS IS" BASIS, +// * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// * See the License for the specific language governing permissions and +// * limitations under the License. +// + +syntax = "proto3"; + +package feast.core; +option java_package = "feast.proto.core"; +option java_outer_classname = "SqliteTableProto"; +option go_package = "github.com/feast-dev/feast/sdk/go/protos/feast/core"; + +// Represents a Sqlite table +message SqliteTable { + // Absolute path of the table + string path = 1; + + // Name of the table + string name = 2; +} \ No newline at end of file diff --git a/java/datatypes/src/main/proto/feast/core/Store.proto b/java/datatypes/src/main/proto/feast/core/Store.proto new file mode 100644 index 0000000000..41a76a11c2 --- /dev/null +++ b/java/datatypes/src/main/proto/feast/core/Store.proto @@ -0,0 +1,130 @@ +// +// * Copyright 2019 The Feast Authors +// * +// * Licensed under the Apache License, Version 2.0 (the "License"); +// * you may not use this file except in compliance with the License. +// * You may obtain a copy of the License at +// * +// * https://www.apache.org/licenses/LICENSE-2.0 +// * +// * Unless required by applicable law or agreed to in writing, software +// * distributed under the License is distributed on an "AS IS" BASIS, +// * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// * See the License for the specific language governing permissions and +// * limitations under the License. 
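Aside: on the SDK side, a SavedDataset is typically produced by persisting a historical retrieval. A rough sketch, assuming the create_saved_dataset API and the SavedDatasetFileStorage helper (which corresponds to the file_storage option above); the repo contents, entity values, and feature names are placeholders.

from datetime import datetime

import pandas as pd

from feast import FeatureStore
from feast.infra.offline_stores.file_source import SavedDatasetFileStorage

store = FeatureStore(repo_path=".")  # assumes a repo with a driver_hourly_stats view applied

entity_df = pd.DataFrame(
    {
        "driver_id": [1001, 1002],
        "event_timestamp": [datetime(2022, 1, 3), datetime(2022, 1, 4)],
    }
)

retrieval_job = store.get_historical_features(
    entity_df=entity_df, features=["driver_hourly_stats:conv_rate"],
)

# Persists the retrieved frame and registers it under SavedDatasetSpec, with
# SavedDatasetStorage pointing at the written file and min/max event timestamps recorded.
dataset = store.create_saved_dataset(
    from_=retrieval_job,
    name="driver_training_dataset",
    storage=SavedDatasetFileStorage(path="driver_training_dataset.parquet"),
)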
+// + +syntax = "proto3"; +package feast.core; + +option java_package = "feast.proto.core"; +option java_outer_classname = "StoreProto"; +option go_package = "github.com/feast-dev/feast/sdk/go/protos/feast/core"; + +// Store provides a location where Feast reads and writes feature values. +// Feature values will be written to the Store in the form of FeatureRow elements. +// The way FeatureRow is encoded and decoded when it is written to and read from +// the Store depends on the type of the Store. +// +message Store { + + enum StoreType { + // These positions should not be reused. + reserved 2, 3, 12, 13; + + INVALID = 0; + + // Redis stores a FeatureRow element as a key, value pair. + // + // The Redis data types used (https://redis.io/topics/data-types): + // - key: STRING + // - value: STRING + // + // Encodings: + // - key: byte array of RedisKey (refer to feast.storage.RedisKeyV2) + // - value: Redis hashmap + // + REDIS = 1; + + REDIS_CLUSTER = 4; + } + + message RedisConfig { + string host = 1; + int32 port = 2; + // Optional. The number of milliseconds to wait before retrying failed Redis connection. + // By default, Feast uses exponential backoff policy and "initial_backoff_ms" sets the initial wait duration. + int32 initial_backoff_ms = 3; + // Optional. Maximum total number of retries for connecting to Redis. Default to zero retries. + int32 max_retries = 4; + // Optional. How often flush data to redis + int32 flush_frequency_seconds = 5; + // Optional. Connect over SSL. + bool ssl = 6; + } + + message RedisClusterConfig { + // List of Redis Uri for all the nodes in Redis Cluster, comma separated. Eg. host1:6379, host2:6379 + string connection_string = 1; + int32 initial_backoff_ms = 2; + int32 max_retries = 3; + // Optional. How often flush data to redis + int32 flush_frequency_seconds = 4; + // Optional. Append a prefix to the Redis Key + string key_prefix = 5; + // Optional. Enable fallback to another key prefix if the original key is not present. + // Useful for migrating key prefix without re-ingestion. Disabled by default. + bool enable_fallback = 6; + // Optional. This would be the fallback prefix to use if enable_fallback is true. + string fallback_prefix = 7; + + // Optional. Priority of nodes when reading from cluster + enum ReadFrom { + MASTER = 0; + MASTER_PREFERRED = 1; + REPLICA = 2; + REPLICA_PREFERRED = 3; + } + ReadFrom read_from = 8; + } + + message Subscription { + // Name of project that the feature sets belongs to. This can be one of + // - [project_name] + // - * + // If an asterisk is provided, filtering on projects will be disabled. All projects will + // be matched. It is NOT possible to provide an asterisk with a string in order to do + // pattern matching. + string project = 3; + + // Name of the desired feature set. Asterisks can be used as wildcards in the name. + // Matching on names is only permitted if a specific project is defined. It is disallowed + // If the project name is set to "*" + // e.g. + // - * can be used to match all feature sets + // - my-feature-set* can be used to match all features prefixed by "my-feature-set" + // - my-feature-set-6 can be used to select a single feature set + string name = 1; + + // All matches with exclude enabled will be filtered out instead of added + bool exclude = 4; + + // Feature set version was removed in v0.5.0. + reserved 2; + } + + // Name of the store. + string name = 1; + + // Type of store. + StoreType type = 2; + + // Feature sets to subscribe to. 
+ repeated Subscription subscriptions = 4; + + // Configuration to connect to the store. Required. + oneof config { + RedisConfig redis_config = 11; + RedisClusterConfig redis_cluster_config = 14; + } +} diff --git a/java/datatypes/src/main/proto/feast/core/ValidationProfile.proto b/java/datatypes/src/main/proto/feast/core/ValidationProfile.proto new file mode 100644 index 0000000000..31c4e150a0 --- /dev/null +++ b/java/datatypes/src/main/proto/feast/core/ValidationProfile.proto @@ -0,0 +1,48 @@ +// +// Copyright 2021 The Feast Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + + +syntax = "proto3"; + +package feast.core; +option java_package = "feast.proto.core"; +option java_outer_classname = "ValidationProfile"; +option go_package = "github.com/feast-dev/feast/sdk/go/protos/feast/core"; + +import "google/protobuf/timestamp.proto"; +import "feast/core/SavedDataset.proto"; + +message GEValidationProfiler { + message UserDefinedProfiler { + // The python-syntax function body (serialized by dill) + bytes body = 1; + } + + UserDefinedProfiler profiler = 1; +} + +message GEValidationProfile { + // JSON-serialized ExpectationSuite object + bytes expectation_suite = 1; +} + +message ValidationReference { + SavedDataset dataset = 1; + + oneof profiler { + GEValidationProfiler ge_profiler = 2; + } +} diff --git a/java/datatypes/src/main/proto/feast/serving/ServingService.proto b/java/datatypes/src/main/proto/feast/serving/ServingService.proto new file mode 100644 index 0000000000..6c551a97ba --- /dev/null +++ b/java/datatypes/src/main/proto/feast/serving/ServingService.proto @@ -0,0 +1,134 @@ +/* + * Copyright 2018 The Feast Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +syntax = "proto3"; + +package feast.serving; + +import "google/protobuf/timestamp.proto"; +import "feast/types/Value.proto"; + +option java_package = "feast.proto.serving"; +option java_outer_classname = "ServingAPIProto"; +option go_package = "github.com/feast-dev/feast/sdk/go/protos/feast/serving"; + +service ServingService { + // Get information about this Feast serving. + rpc GetFeastServingInfo (GetFeastServingInfoRequest) returns (GetFeastServingInfoResponse); + + // Get online features synchronously. + rpc GetOnlineFeatures (GetOnlineFeaturesRequest) returns (GetOnlineFeaturesResponse); +} + +message GetFeastServingInfoRequest {} + +message GetFeastServingInfoResponse { + // Feast version of this serving deployment. 
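Aside: GEValidationProfiler.body is, like the on-demand UDF earlier, a dill-serialized Python callable, here one that returns a Great Expectations expectation suite. A sketch under the assumption that a ge_profiler decorator is exposed at feast.dqm.profilers.ge_profiler; the expectation itself is only an example.

from great_expectations.core import ExpectationSuite
from great_expectations.dataset import PandasDataset

from feast.dqm.profilers.ge_profiler import ge_profiler  # import path assumed


@ge_profiler
def stats_profiler(ds: PandasDataset) -> ExpectationSuite:
    # The expectations declared here are what ends up serialized as
    # GEValidationProfile.expectation_suite once the profiler runs.
    ds.expect_column_values_to_be_between("conv_rate", min_value=0, max_value=1)
    return ds.get_expectation_suite()


# A ValidationReference would then pair a SavedDataset with this profiler, e.g.
# saved_dataset.as_reference(profiler=stats_profiler)  # method name assumed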
+ string version = 1; +} + +message FeatureReferenceV2 { + // Name of the Feature View to retrieve the feature from. + string feature_view_name = 1; + + // Name of the Feature to retrieve the feature from. + string feature_name = 2; +} + +// ToDo (oleksii): remove this message (since it's not used) and move EntityRow on package level +message GetOnlineFeaturesRequestV2 { + // List of features that are being retrieved + repeated FeatureReferenceV2 features = 4; + + // List of entity rows, containing entity id and timestamp data. + // Used during retrieval of feature rows and for joining feature + // rows into a final dataset + repeated EntityRow entity_rows = 2; + + // Optional field to specify project name override. If specified, uses the + // given project for retrieval. Overrides the projects specified in + // Feature References if both are specified. + string project = 5; + + message EntityRow { + // Request timestamp of this row. This value will be used, + // together with maxAge, to determine feature staleness. + google.protobuf.Timestamp timestamp = 1; + + // Map containing mapping of entity name to entity value. + map fields = 2; + } +} + +// In JSON "val" field can be omitted +message FeatureList { + repeated string val = 1; +} + +message GetOnlineFeaturesRequest { + oneof kind { + string feature_service = 1; + FeatureList features = 2; + } + // The entity data is specified in a columnar format + // A map of entity name -> list of values + map entities = 3; + bool full_feature_names = 4; + + // Context for OnDemand Feature Transformation + // (was moved to dedicated parameter to avoid unnecessary separation logic on serving side) + // A map of variable name -> list of values + map request_context = 5; +} + +message GetOnlineFeaturesResponse { + GetOnlineFeaturesResponseMetadata metadata = 1; + + // Length of "results" array should match length of requested features. + // We also preserve the same order of features here as in metadata.feature_names + repeated FeatureVector results = 2; + + message FeatureVector { + repeated feast.types.Value values = 1; + repeated FieldStatus statuses = 2; + repeated google.protobuf.Timestamp event_timestamps = 3; + } +} + +message GetOnlineFeaturesResponseMetadata { + FeatureList feature_names = 1; +} + +enum FieldStatus { + // Status is unset for this field. + INVALID = 0; + + // Field value is present for this field and age is within max age. + PRESENT = 1; + + // Values could be found for entity key and age is within max age, but + // this field value is assigned a value on ingestion into feast. + NULL_VALUE = 2; + + // Entity key did not return any values as they do not exist in Feast. + // This could suggest that the feature values have not yet been ingested + // into feast or the ingestion failed. + NOT_FOUND = 3; + + // Values could be found for entity key, but field values are outside the maximum + // allowable range. + OUTSIDE_MAX_AGE = 4; +} diff --git a/java/datatypes/src/main/proto/feast/serving/TransformationService.proto b/java/datatypes/src/main/proto/feast/serving/TransformationService.proto new file mode 100644 index 0000000000..113bd120c8 --- /dev/null +++ b/java/datatypes/src/main/proto/feast/serving/TransformationService.proto @@ -0,0 +1,67 @@ +/* + * Copyright 2021 The Feast Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
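Aside: the columnar entities map in GetOnlineFeaturesRequest mirrors what the Python SDK gathers from entity_rows. A client call along these lines (feature view and entity names are illustrative) carries the same information a feature server would receive in that request.

from feast import FeatureStore

store = FeatureStore(repo_path=".")

online_response = store.get_online_features(
    features=[
        "driver_hourly_stats:conv_rate",
        "driver_hourly_stats:acc_rate",
    ],
    # Each entity row contributes one value per key; in the request these become
    # the columnar entities map, e.g. "driver_id" -> [1001, 1002].
    entity_rows=[{"driver_id": 1001}, {"driver_id": 1002}],
)

print(online_response.to_dict()["conv_rate"])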
+ * You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +syntax = "proto3"; + +package feast.serving; + +option java_package = "feast.proto.serving"; +option java_outer_classname = "TransformationServiceAPIProto"; +option go_package = "github.com/feast-dev/feast/sdk/go/protos/feast/serving"; + +service TransformationService { + rpc GetTransformationServiceInfo (GetTransformationServiceInfoRequest) returns (GetTransformationServiceInfoResponse); + + rpc TransformFeatures (TransformFeaturesRequest) returns (TransformFeaturesResponse); +} + +message ValueType { + oneof value { + // Having a oneOf provides forward compatibility if we need to support compound types + // that are not supported by arrow natively. + bytes arrow_value = 1; + } +} + +message GetTransformationServiceInfoRequest {} + +message GetTransformationServiceInfoResponse { + // Feast version of this transformation service deployment. + string version = 1; + + // Type of transformation service deployment. This is either Python, or custom + TransformationServiceType type = 2; + + string transformation_service_type_details = 3; +} + +message TransformFeaturesRequest { + string on_demand_feature_view_name = 1; + string project = 2; + + ValueType transformation_input = 3; +} + +message TransformFeaturesResponse { + ValueType transformation_output = 3; +} + +enum TransformationServiceType { + TRANSFORMATION_SERVICE_TYPE_INVALID = 0; + TRANSFORMATION_SERVICE_TYPE_PYTHON = 1; + + TRANSFORMATION_SERVICE_TYPE_CUSTOM = 100; +} diff --git a/java/datatypes/src/main/proto/feast/storage/Redis.proto b/java/datatypes/src/main/proto/feast/storage/Redis.proto new file mode 100644 index 0000000000..a662e352f4 --- /dev/null +++ b/java/datatypes/src/main/proto/feast/storage/Redis.proto @@ -0,0 +1,34 @@ +/* + * Copyright 2019 The Feast Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
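Aside: the arrow_value payload exchanged with the transformation service is an Arrow-encoded byte string. A small pyarrow sketch of producing such bytes as an IPC stream (the dataframe contents are placeholders):

import pandas as pd
import pyarrow as pa

# Encode a small dataframe as an Arrow IPC stream; bytes like these are what the
# arrow_value field of the transformation request/response carries.
table = pa.Table.from_pandas(
    pd.DataFrame({"driver_id": [1001, 1002], "conv_rate": [0.2, 0.7]})
)
sink = pa.BufferOutputStream()
with pa.ipc.new_stream(sink, table.schema) as writer:
    writer.write_table(table)
arrow_payload = sink.getvalue().to_pybytes()

print(f"{len(arrow_payload)} bytes of Arrow IPC data")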
+ */ + +syntax = "proto3"; + +import "feast/types/Field.proto"; +import "feast/types/Value.proto"; + +package feast.storage; + +option java_outer_classname = "RedisProto"; +option java_package = "feast.proto.storage"; +option go_package = "github.com/feast-dev/feast/sdk/go/protos/feast/storage"; + +message RedisKeyV2 { + string project = 1; + + repeated string entity_names = 2; + + repeated feast.types.Value entity_values = 3; +} diff --git a/java/datatypes/src/main/proto/feast/third_party/grpc/health/v1/HealthService.proto b/java/datatypes/src/main/proto/feast/third_party/grpc/health/v1/HealthService.proto new file mode 100644 index 0000000000..342db35d4c --- /dev/null +++ b/java/datatypes/src/main/proto/feast/third_party/grpc/health/v1/HealthService.proto @@ -0,0 +1,24 @@ +syntax = "proto3"; + +package grpc.health.v1; + +option java_package = "io.grpc.health.v1"; +option java_outer_classname = "HealthProto"; + +message HealthCheckRequest { + string service = 1; +} + +enum ServingStatus { + UNKNOWN = 0; + SERVING = 1; + NOT_SERVING = 2; +} + +message HealthCheckResponse { + ServingStatus status = 1; +} + +service Health { + rpc Check(HealthCheckRequest) returns (HealthCheckResponse); +} \ No newline at end of file diff --git a/java/datatypes/src/main/proto/feast/types/EntityKey.proto b/java/datatypes/src/main/proto/feast/types/EntityKey.proto new file mode 100644 index 0000000000..cbc3c55442 --- /dev/null +++ b/java/datatypes/src/main/proto/feast/types/EntityKey.proto @@ -0,0 +1,30 @@ +/* + * Copyright 2018 The Feast Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +syntax = "proto3"; + +import "feast/types/Value.proto"; + +package feast.types; + +option java_package = "feast.proto.types"; +option java_outer_classname = "EntityKeyProto"; +option go_package = "github.com/feast-dev/feast/sdk/go/protos/feast/types"; + +message EntityKey { + repeated string join_keys = 1; + repeated feast.types.Value entity_values = 2; +} diff --git a/java/datatypes/src/main/proto/feast/types/Field.proto b/java/datatypes/src/main/proto/feast/types/Field.proto new file mode 100644 index 0000000000..3b8416c253 --- /dev/null +++ b/java/datatypes/src/main/proto/feast/types/Field.proto @@ -0,0 +1,30 @@ +/* + * Copyright 2018 The Feast Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
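Aside: EntityKey, like RedisKeyV2 above, pairs join key names with typed Values when keys are built for online storage. A construction sketch, with the generated-proto import paths assumed:

# Generated-proto import paths assumed.
from feast.protos.feast.types.EntityKey_pb2 import EntityKey
from feast.protos.feast.types.Value_pb2 import Value

# join_keys and entity_values stay index-aligned, one Value per join key.
entity_key = EntityKey(
    join_keys=["driver_id"],
    entity_values=[Value(int64_val=1001)],
)

print(entity_key)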
+ */ + +syntax = "proto3"; + +import "feast/types/Value.proto"; + +package feast.types; + +option java_package = "feast.proto.types"; +option java_outer_classname = "FieldProto"; +option go_package = "github.com/feast-dev/feast/sdk/go/protos/feast/types"; + +message Field { + string name = 1; + feast.types.Value value = 2; +} diff --git a/java/datatypes/src/main/proto/feast/types/Value.proto b/java/datatypes/src/main/proto/feast/types/Value.proto new file mode 100644 index 0000000000..b00d4d9b84 --- /dev/null +++ b/java/datatypes/src/main/proto/feast/types/Value.proto @@ -0,0 +1,109 @@ +/* + * Copyright 2018 The Feast Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +syntax = "proto3"; + +package feast.types; + +option java_package = "feast.proto.types"; +option java_outer_classname = "ValueProto"; +option go_package = "github.com/feast-dev/feast/sdk/go/protos/feast/types"; + +message ValueType { + enum Enum { + INVALID = 0; + BYTES = 1; + STRING = 2; + INT32 = 3; + INT64 = 4; + DOUBLE = 5; + FLOAT = 6; + BOOL = 7; + UNIX_TIMESTAMP = 8; + BYTES_LIST = 11; + STRING_LIST = 12; + INT32_LIST = 13; + INT64_LIST = 14; + DOUBLE_LIST = 15; + FLOAT_LIST = 16; + BOOL_LIST = 17; + UNIX_TIMESTAMP_LIST = 18; + NULL = 19; + } +} + +message Value { + // ValueType is referenced by the metadata types, FeatureInfo and EntityInfo. + // The enum values do not have to match the oneof val field ids, but they should. 
+ // In JSON "*_val" field can be omitted + oneof val { + bytes bytes_val = 1; + string string_val = 2; + int32 int32_val = 3; + int64 int64_val = 4; + double double_val = 5; + float float_val = 6; + bool bool_val = 7; + int64 unix_timestamp_val = 8; + BytesList bytes_list_val = 11; + StringList string_list_val = 12; + Int32List int32_list_val = 13; + Int64List int64_list_val = 14; + DoubleList double_list_val = 15; + FloatList float_list_val = 16; + BoolList bool_list_val = 17; + Int64List unix_timestamp_list_val = 18; + Null null_val = 19; + } +} + +enum Null { + NULL = 0; +} + +message BytesList { + repeated bytes val = 1; +} + +message StringList { + repeated string val = 1; +} + +message Int32List { + repeated int32 val = 1; +} + +message Int64List { + repeated int64 val = 1; +} + +message DoubleList { + repeated double val = 1; +} + +message FloatList { + repeated float val = 1; +} + +message BoolList { + repeated bool val = 1; +} + +// This is to avoid an issue of being unable to specify `repeated value` in oneofs or maps +// In JSON "val" field can be omitted +message RepeatedValue { + repeated Value val = 1; +} \ No newline at end of file diff --git a/sdk/python/feast/infra/offline_stores/third_party/spark.py b/sdk/python/feast/infra/offline_stores/third_party/spark.py index 08abc9e79f..1a870912d1 100644 --- a/sdk/python/feast/infra/offline_stores/third_party/spark.py +++ b/sdk/python/feast/infra/offline_stores/third_party/spark.py @@ -1,7 +1,8 @@ import inspect +import warnings from datetime import datetime from typing import Dict, List, Optional, Tuple, Union -import warnings + import numpy as np import pandas import pandas as pd diff --git a/sdk/python/feast/infra/offline_stores/third_party/spark_source.py b/sdk/python/feast/infra/offline_stores/third_party/spark_source.py index 0e335499b7..765ff64408 100644 --- a/sdk/python/feast/infra/offline_stores/third_party/spark_source.py +++ b/sdk/python/feast/infra/offline_stores/third_party/spark_source.py @@ -1,6 +1,6 @@ import pickle -from typing import Any, Callable, Dict, Iterable, Optional, Tuple import warnings +from typing import Any, Callable, Dict, Iterable, Optional, Tuple from feast_spark_offline_store.spark_type_map import spark_to_feast_value_type from pyspark.sql.utils import AnalysisException From 1e7c2b441d381180503a6bfb5274a31a539cc3a6 Mon Sep 17 00:00:00 2001 From: Kevin Zhang Date: Tue, 1 Mar 2022 19:01:17 -0800 Subject: [PATCH 18/58] Address review Signed-off-by: Kevin Zhang --- sdk/python/feast/__init__.py | 1 + sdk/python/feast/feature_store.py | 2 +- sdk/python/feast/inference.py | 5 +- .../infra/offline_stores/third_party/spark.py | 43 +++++++----- .../third_party/spark_source.py | 2 +- .../third_party/spark_type_map.py | 52 -------------- sdk/python/feast/type_map.py | 49 ++++++++++++++ .../feature_repos/repo_configuration.py | 67 ++++++++++--------- .../data_sources/spark_data_source_creator.py | 1 - .../registration/test_universal_types.py | 5 +- 10 files changed, 119 insertions(+), 108 deletions(-) diff --git a/sdk/python/feast/__init__.py b/sdk/python/feast/__init__.py index 83b504b0cb..1974b49222 100644 --- a/sdk/python/feast/__init__.py +++ b/sdk/python/feast/__init__.py @@ -6,6 +6,7 @@ from feast.infra.offline_stores.file_source import FileSource from feast.infra.offline_stores.redshift_source import RedshiftSource from feast.infra.offline_stores.snowflake_source import SnowflakeSource +from feast.infra.offline_stores.third_party.spark_source import SparkSource from .data_source import KafkaSource, 
KinesisSource, SourceType from .entity import Entity diff --git a/sdk/python/feast/feature_store.py b/sdk/python/feast/feature_store.py index 650623274f..b47e6745c9 100644 --- a/sdk/python/feast/feature_store.py +++ b/sdk/python/feast/feature_store.py @@ -758,6 +758,7 @@ def get_historical_features( all_request_feature_views, all_on_demand_feature_views, ) = self._get_feature_views_to_use(features) + # TODO(achal): _group_feature_refs returns the on demand feature views, but it's no passed into the provider. # This is a weird interface quirk - we should revisit the `get_historical_features` to # pass in the on demand feature views as well. @@ -767,7 +768,6 @@ def get_historical_features( all_request_feature_views, all_on_demand_feature_views, ) - feature_views = list(view for view, _ in fvs) on_demand_feature_views = list(view for view, _ in odfvs) request_feature_views = list(view for view, _ in request_fvs) diff --git a/sdk/python/feast/inference.py b/sdk/python/feast/inference.py index ce8fa919f1..57f77d9e7c 100644 --- a/sdk/python/feast/inference.py +++ b/sdk/python/feast/inference.py @@ -8,6 +8,7 @@ FileSource, RedshiftSource, SnowflakeSource, + SparkSource, ) from feast.data_source import DataSource from feast.errors import RegistryInferenceFailure @@ -84,7 +85,7 @@ def update_data_sources_with_inferred_event_timestamp_col( ): # prepare right match pattern for data source ts_column_type_regex_pattern = "" - if isinstance(data_source, FileSource): + if isinstance(data_source, FileSource) or isinstance(data_source, SparkSource): ts_column_type_regex_pattern = r"^timestamp" elif isinstance(data_source, BigQuerySource): ts_column_type_regex_pattern = "TIMESTAMP|DATETIME" @@ -97,7 +98,7 @@ def update_data_sources_with_inferred_event_timestamp_col( "DataSource", """ DataSource inferencing of event_timestamp_column is currently only supported - for FileSource and BigQuerySource. + for FileSource, SparkSource, BigQuerySource, RedshiftSource, and SnowflakeSource. 
""", ) # for informing the type checker diff --git a/sdk/python/feast/infra/offline_stores/third_party/spark.py b/sdk/python/feast/infra/offline_stores/third_party/spark.py index 1a870912d1..4a95128b98 100644 --- a/sdk/python/feast/infra/offline_stores/third_party/spark.py +++ b/sdk/python/feast/infra/offline_stores/third_party/spark.py @@ -8,8 +8,8 @@ import pandas as pd import pyarrow import pyspark -from feast_spark_offline_store.spark_source import SparkSource -from feast_spark_offline_store.spark_type_map import spark_schema_to_np_dtypes +from feast.infra.offline_stores.third_party.spark_source import SparkSource +from feast.type_map import spark_schema_to_np_dtypes from pydantic import StrictStr from pyspark import SparkConf from pyspark.sql import SparkSession @@ -37,7 +37,6 @@ class SparkOfflineStoreConfig(FeastConfigBaseModel): spark_conf: Optional[Dict[str, str]] = None """ Configuration overlay for the spark session """ - # to ensure sparksession is the correct config, if not created yet # sparksession is not serializable and we dont want to pass it around as an argument @@ -161,11 +160,6 @@ def get_historical_features( full_feature_names=full_feature_names, ) - # TODO: Figure out what this is used for - # on_demand_feature_views = OnDemandFeatureView.get_requested_odfvs( - # feature_refs=feature_refs, project=project, registry=registry - # ) - return SparkRetrievalJob( spark_session=spark_session, query=query, @@ -196,22 +190,37 @@ def pull_all_from_table_or_query( have all already been mapped to column names of the source table and those column names are the values passed into this function. """ + assert isinstance(data_source, SparkSource) warnings.warn( "The spark offline store is an experimental feature in alpha development. " "This API is unstable and it could and most probably will be changed in the future.", RuntimeWarning, ) + from_expression = data_source.get_table_query_string() + + field_string = ( + '"' + + '", "'.join( + join_key_columns + feature_name_columns + [event_timestamp_column] + ) + + '"' + ) + start_date = start_date.astimezone(tz=utc) + end_date = end_date.astimezone(tz=utc) - return SparkOfflineStore.pull_latest_from_table_or_query( + query = f""" + SELECT {field_string} + FROM {from_expression} + WHERE "{event_timestamp_column}" BETWEEN TIMESTAMP '{start_date}' AND TIMESTAMP '{end_date}' + """ + spark_session = get_spark_session_or_start_new_with_repoconfig( + store_config=config.offline_store + ) + return SparkRetrievalJob( + spark_session=spark_session, + query=query, config=config, - data_source=data_source, - join_key_columns=join_key_columns - + [event_timestamp_column], # avoid deduplication - feature_name_columns=feature_name_columns, - event_timestamp_column=event_timestamp_column, - created_timestamp_column=None, - start_date=start_date, - end_date=end_date, + full_feature_names=False, ) diff --git a/sdk/python/feast/infra/offline_stores/third_party/spark_source.py b/sdk/python/feast/infra/offline_stores/third_party/spark_source.py index 765ff64408..d8ebacf16f 100644 --- a/sdk/python/feast/infra/offline_stores/third_party/spark_source.py +++ b/sdk/python/feast/infra/offline_stores/third_party/spark_source.py @@ -2,7 +2,7 @@ import warnings from typing import Any, Callable, Dict, Iterable, Optional, Tuple -from feast_spark_offline_store.spark_type_map import spark_to_feast_value_type +from feast.type_map import spark_to_feast_value_type from pyspark.sql.utils import AnalysisException from feast.data_source import DataSource diff --git 
a/sdk/python/feast/infra/offline_stores/third_party/spark_type_map.py b/sdk/python/feast/infra/offline_stores/third_party/spark_type_map.py index dee1c20d41..139597f9cb 100644 --- a/sdk/python/feast/infra/offline_stores/third_party/spark_type_map.py +++ b/sdk/python/feast/infra/offline_stores/third_party/spark_type_map.py @@ -1,54 +1,2 @@ -from collections import defaultdict -from typing import Dict, Iterator, List, Tuple -from numpy import dtype -from feast import ValueType - - -def spark_to_feast_value_type(spark_type_as_str: str) -> ValueType: - # TODO not all spark types are convertible - type_map: Dict[str, ValueType] = { - "null": ValueType.UNKNOWN, - "byte": ValueType.BYTES, - "string": ValueType.STRING, - "int": ValueType.INT32, - "bigint": ValueType.INT64, - "long": ValueType.INT64, - "double": ValueType.DOUBLE, - "float": ValueType.FLOAT, - "boolean": ValueType.BOOL, - "timestamp": ValueType.UNIX_TIMESTAMP, - "array": ValueType.BYTES_LIST, - "array": ValueType.STRING_LIST, - "array": ValueType.INT32_LIST, - "array": ValueType.INT64_LIST, - "array": ValueType.DOUBLE_LIST, - "array": ValueType.FLOAT_LIST, - "array": ValueType.BOOL_LIST, - "array": ValueType.UNIX_TIMESTAMP_LIST, - } - # TODO: this is just incorrect fix - if type(spark_type_as_str) != str or spark_type_as_str not in type_map: - return ValueType.NULL - return type_map[spark_type_as_str.lower()] - - -def spark_schema_to_np_dtypes(dtypes: List[Tuple[str, str]]) -> Iterator[dtype]: - # TODO recheck all typing (also tz for timestamp) - # https://spark.apache.org/docs/latest/api/python/user_guide/arrow_pandas.html#timestamp-with-time-zone-semantics - - type_map = defaultdict( - lambda: dtype("O"), - { - "boolean": dtype("bool"), - "double": dtype("float64"), - "float": dtype("float64"), - "int": dtype("int64"), - "bigint": dtype("int64"), - "smallint": dtype("int64"), - "timestamp": dtype("datetime64[ns]"), - }, - ) - - return (type_map[t] for _, t in dtypes) diff --git a/sdk/python/feast/type_map.py b/sdk/python/feast/type_map.py index f808715681..89de11109f 100644 --- a/sdk/python/feast/type_map.py +++ b/sdk/python/feast/type_map.py @@ -24,10 +24,12 @@ Tuple, Type, Union, + Iterator, cast, ) import numpy as np +from collections import defaultdict import pandas as pd import pyarrow from google.protobuf.timestamp_pb2 import Timestamp @@ -575,3 +577,50 @@ def _non_empty_value(value: Any) -> bool: return value is not None and ( not isinstance(value, Sized) or len(value) > 0 or isinstance(value, str) ) + +def spark_to_feast_value_type(spark_type_as_str: str) -> ValueType: + # TODO not all spark types are convertible + type_map: Dict[str, ValueType] = { + "null": ValueType.UNKNOWN, + "byte": ValueType.BYTES, + "string": ValueType.STRING, + "int": ValueType.INT32, + "bigint": ValueType.INT64, + "long": ValueType.INT64, + "double": ValueType.DOUBLE, + "float": ValueType.FLOAT, + "boolean": ValueType.BOOL, + "timestamp": ValueType.UNIX_TIMESTAMP, + "array": ValueType.BYTES_LIST, + "array": ValueType.STRING_LIST, + "array": ValueType.INT32_LIST, + "array": ValueType.INT64_LIST, + "array": ValueType.DOUBLE_LIST, + "array": ValueType.FLOAT_LIST, + "array": ValueType.BOOL_LIST, + "array": ValueType.UNIX_TIMESTAMP_LIST, + } + # TODO: this is just incorrect fix + if type(spark_type_as_str) != str or spark_type_as_str not in type_map: + return ValueType.NULL + return type_map[spark_type_as_str.lower()] + + +def spark_schema_to_np_dtypes(dtypes: List[Tuple[str, str]]) -> Iterator[np.dtype]: + # TODO recheck all typing (also tz for 
timestamp) + # https://spark.apache.org/docs/latest/api/python/user_guide/arrow_pandas.html#timestamp-with-time-zone-semantics + + type_map = defaultdict( + lambda: np.dtype("O"), + { + "boolean": np.dtype("bool"), + "double": np.dtype("float64"), + "float": np.dtype("float64"), + "int": np.dtype("int64"), + "bigint": np.dtype("int64"), + "smallint": np.dtype("int64"), + "timestamp": np.dtype("datetime64[ns]"), + }, + ) + + return (type_map[t] for _, t in dtypes) diff --git a/sdk/python/tests/integration/feature_repos/repo_configuration.py b/sdk/python/tests/integration/feature_repos/repo_configuration.py index 0459fc7d42..3fc1b70f4e 100644 --- a/sdk/python/tests/integration/feature_repos/repo_configuration.py +++ b/sdk/python/tests/integration/feature_repos/repo_configuration.py @@ -33,6 +33,9 @@ from tests.integration.feature_repos.universal.data_sources.snowflake import ( SnowflakeDataSourceCreator, ) +from tests.integration.feature_repos.universal.data_sources.spark_data_source_creator import ( + SparkDataSourceCreator, +) from tests.integration.feature_repos.universal.feature_views import ( conv_rate_plus_100_feature_view, create_conv_rate_request_data_source, @@ -71,40 +74,40 @@ DEFAULT_FULL_REPO_CONFIGS.extend( [ # Redis configurations - IntegrationTestRepoConfig(online_store=REDIS_CONFIG), - IntegrationTestRepoConfig(online_store=REDIS_CLUSTER_CONFIG), + # IntegrationTestRepoConfig(online_store=REDIS_CONFIG), + # IntegrationTestRepoConfig(online_store=REDIS_CLUSTER_CONFIG), # GCP configurations - # IntegrationTestRepoConfig( - # provider="local", offline_store_creator=SparkDataSourceCreator, - # ) - IntegrationTestRepoConfig( - provider="gcp", - offline_store_creator=BigQueryDataSourceCreator, - online_store="datastore", - ), - IntegrationTestRepoConfig( - provider="gcp", - offline_store_creator=BigQueryDataSourceCreator, - online_store=REDIS_CONFIG, - ), - # AWS configurations - IntegrationTestRepoConfig( - provider="aws", - offline_store_creator=RedshiftDataSourceCreator, - online_store=DYNAMO_CONFIG, - python_feature_server=True, - ), - IntegrationTestRepoConfig( - provider="aws", - offline_store_creator=RedshiftDataSourceCreator, - online_store=REDIS_CONFIG, - ), - # Snowflake configurations IntegrationTestRepoConfig( - provider="aws", # no list features, no feature server - offline_store_creator=SnowflakeDataSourceCreator, - online_store=REDIS_CONFIG, - ), + provider="local", offline_store_creator=SparkDataSourceCreator, + ) + # IntegrationTestRepoConfig( + # provider="gcp", + # offline_store_creator=BigQueryDataSourceCreator, + # online_store="datastore", + # ), + # IntegrationTestRepoConfig( + # provider="gcp", + # offline_store_creator=BigQueryDataSourceCreator, + # online_store=REDIS_CONFIG, + # ), + # # AWS configurations + # IntegrationTestRepoConfig( + # provider="aws", + # offline_store_creator=RedshiftDataSourceCreator, + # online_store=DYNAMO_CONFIG, + # python_feature_server=True, + # ), + # IntegrationTestRepoConfig( + # provider="aws", + # offline_store_creator=RedshiftDataSourceCreator, + # online_store=REDIS_CONFIG, + # ), + # # Snowflake configurations + # IntegrationTestRepoConfig( + # provider="aws", # no list features, no feature server + # offline_store_creator=SnowflakeDataSourceCreator, + # online_store=REDIS_CONFIG, + # ), ] ) full_repo_configs_module = os.environ.get(FULL_REPO_CONFIGS_MODULE_ENV_NAME) diff --git a/sdk/python/tests/integration/feature_repos/universal/data_sources/spark_data_source_creator.py 
b/sdk/python/tests/integration/feature_repos/universal/data_sources/spark_data_source_creator.py index 97416d2959..bd1e19e7eb 100644 --- a/sdk/python/tests/integration/feature_repos/universal/data_sources/spark_data_source_creator.py +++ b/sdk/python/tests/integration/feature_repos/universal/data_sources/spark_data_source_creator.py @@ -65,7 +65,6 @@ def create_data_source( field_mapping: Dict[str, str] = None, **kwargs, ) -> DataSource: - # df["event_timestamp"] = pd.to_datetime(df["event_timestamp"], utc=True) if event_timestamp_column in df: df[event_timestamp_column] = pd.to_datetime( df[event_timestamp_column], utc=True diff --git a/sdk/python/tests/integration/registration/test_universal_types.py b/sdk/python/tests/integration/registration/test_universal_types.py index 7322cc01e4..e2f2e9df56 100644 --- a/sdk/python/tests/integration/registration/test_universal_types.py +++ b/sdk/python/tests/integration/registration/test_universal_types.py @@ -184,10 +184,9 @@ def test_feature_get_historical_features_types_match(offline_types_test_fixtures historical_features = fs.get_historical_features( entity_df=entity_df, features=features, ) - # Note: Pandas doesn't play well with nan values in ints. BQ will also coerce to floats if there are NaNs historical_features_df = historical_features.to_df() - + print(historical_features_df) if config.feature_is_list: assert_feature_list_types( environment.test_repo_config.provider, @@ -285,6 +284,7 @@ def create_feature_view( value_type = ValueType.BOOL elif feature_dtype == "datetime": value_type = ValueType.UNIX_TIMESTAMP + return driver_feature_view(data_source, name=name, value_type=value_type,) @@ -347,6 +347,7 @@ def assert_expected_arrow_types( ): print("Asserting historical feature arrow types") historical_features_arrow = historical_features.to_arrow() + print(historical_features_arrow) feature_list_dtype_to_expected_historical_feature_arrow_type = { "int32": pa.types.is_int64, "int64": pa.types.is_int64, From 8e3e9a45965ff8af2e9644dae789a20a733a70d6 Mon Sep 17 00:00:00 2001 From: Kevin Zhang Date: Tue, 1 Mar 2022 19:03:17 -0800 Subject: [PATCH 19/58] Address review Signed-off-by: Kevin Zhang --- sdk/python/feast/inference.py | 4 +++- sdk/python/feast/infra/offline_stores/third_party/spark.py | 5 ++--- .../feast/infra/offline_stores/third_party/spark_source.py | 2 +- sdk/python/feast/type_map.py | 5 +++-- 4 files changed, 9 insertions(+), 7 deletions(-) diff --git a/sdk/python/feast/inference.py b/sdk/python/feast/inference.py index 57f77d9e7c..3fc6f054f1 100644 --- a/sdk/python/feast/inference.py +++ b/sdk/python/feast/inference.py @@ -85,7 +85,9 @@ def update_data_sources_with_inferred_event_timestamp_col( ): # prepare right match pattern for data source ts_column_type_regex_pattern = "" - if isinstance(data_source, FileSource) or isinstance(data_source, SparkSource): + if isinstance(data_source, FileSource) or isinstance( + data_source, SparkSource + ): ts_column_type_regex_pattern = r"^timestamp" elif isinstance(data_source, BigQuerySource): ts_column_type_regex_pattern = "TIMESTAMP|DATETIME" diff --git a/sdk/python/feast/infra/offline_stores/third_party/spark.py b/sdk/python/feast/infra/offline_stores/third_party/spark.py index 4a95128b98..1586037ac0 100644 --- a/sdk/python/feast/infra/offline_stores/third_party/spark.py +++ b/sdk/python/feast/infra/offline_stores/third_party/spark.py @@ -8,8 +8,6 @@ import pandas as pd import pyarrow import pyspark -from feast.infra.offline_stores.third_party.spark_source import SparkSource -from 
feast.type_map import spark_schema_to_np_dtypes from pydantic import StrictStr from pyspark import SparkConf from pyspark.sql import SparkSession @@ -26,9 +24,11 @@ RetrievalMetadata, ) from feast.infra.offline_stores.offline_utils import FeatureViewQueryContext +from feast.infra.offline_stores.third_party.spark_source import SparkSource from feast.registry import Registry from feast.repo_config import FeastConfigBaseModel, RepoConfig from feast.saved_dataset import SavedDatasetStorage +from feast.type_map import spark_schema_to_np_dtypes class SparkOfflineStoreConfig(FeastConfigBaseModel): @@ -219,7 +219,6 @@ def pull_all_from_table_or_query( return SparkRetrievalJob( spark_session=spark_session, query=query, - config=config, full_feature_names=False, ) diff --git a/sdk/python/feast/infra/offline_stores/third_party/spark_source.py b/sdk/python/feast/infra/offline_stores/third_party/spark_source.py index d8ebacf16f..ee84293f10 100644 --- a/sdk/python/feast/infra/offline_stores/third_party/spark_source.py +++ b/sdk/python/feast/infra/offline_stores/third_party/spark_source.py @@ -2,7 +2,6 @@ import warnings from typing import Any, Callable, Dict, Iterable, Optional, Tuple -from feast.type_map import spark_to_feast_value_type from pyspark.sql.utils import AnalysisException from feast.data_source import DataSource @@ -13,6 +12,7 @@ ) from feast.repo_config import RepoConfig from feast.saved_dataset import SavedDatasetStorage +from feast.type_map import spark_to_feast_value_type from feast.value_type import ValueType diff --git a/sdk/python/feast/type_map.py b/sdk/python/feast/type_map.py index 89de11109f..64d9c485b0 100644 --- a/sdk/python/feast/type_map.py +++ b/sdk/python/feast/type_map.py @@ -12,10 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. 
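Aside: the Spark type-mapping helpers that this series relocates into feast.type_map can be exercised directly; the expected outputs noted below follow the mappings defined in those functions.

from feast.type_map import spark_schema_to_np_dtypes, spark_to_feast_value_type

# Pairs as returned by a pyspark DataFrame's .dtypes attribute: (column name, spark type string).
spark_dtypes = [
    ("driver_id", "bigint"),
    ("conv_rate", "double"),
    ("event_timestamp", "timestamp"),
]

print(list(spark_schema_to_np_dtypes(spark_dtypes)))
# -> [dtype('int64'), dtype('float64'), dtype('datetime64[ns]')]

print(spark_to_feast_value_type("timestamp"))
# -> ValueType.UNIX_TIMESTAMP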
+from collections import defaultdict from datetime import datetime, timezone from typing import ( Any, Dict, + Iterator, List, Optional, Sequence, @@ -24,12 +26,10 @@ Tuple, Type, Union, - Iterator, cast, ) import numpy as np -from collections import defaultdict import pandas as pd import pyarrow from google.protobuf.timestamp_pb2 import Timestamp @@ -578,6 +578,7 @@ def _non_empty_value(value: Any) -> bool: not isinstance(value, Sized) or len(value) > 0 or isinstance(value, str) ) + def spark_to_feast_value_type(spark_type_as_str: str) -> ValueType: # TODO not all spark types are convertible type_map: Dict[str, ValueType] = { From cc1651e51be78b71fcd22ee2cf810e389b6d82a8 Mon Sep 17 00:00:00 2001 From: Kevin Zhang Date: Tue, 1 Mar 2022 19:06:45 -0800 Subject: [PATCH 20/58] Fix lint Signed-off-by: Kevin Zhang --- sdk/python/feast/infra/offline_stores/third_party/spark.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/sdk/python/feast/infra/offline_stores/third_party/spark.py b/sdk/python/feast/infra/offline_stores/third_party/spark.py index 1586037ac0..1edffa2aab 100644 --- a/sdk/python/feast/infra/offline_stores/third_party/spark.py +++ b/sdk/python/feast/infra/offline_stores/third_party/spark.py @@ -217,9 +217,7 @@ def pull_all_from_table_or_query( store_config=config.offline_store ) return SparkRetrievalJob( - spark_session=spark_session, - query=query, - full_feature_names=False, + spark_session=spark_session, query=query, full_feature_names=False ) From acf1c28c152cec60995d32f1ecc450611f0e4893 Mon Sep 17 00:00:00 2001 From: Kevin Zhang Date: Wed, 2 Mar 2022 12:26:39 -0800 Subject: [PATCH 21/58] Add file store functionality Signed-off-by: Kevin Zhang --- .../third_party/spark_source.py | 63 +++++++++++++---- sdk/python/feast/templates/spark/bootstrap.py | 50 ++++++++++++++ sdk/python/feast/templates/spark/example.py | 69 +++++++++++++++++++ .../feast/templates/spark/feature_store.yaml | 15 ++++ 4 files changed, 185 insertions(+), 12 deletions(-) create mode 100644 sdk/python/feast/templates/spark/bootstrap.py create mode 100644 sdk/python/feast/templates/spark/example.py create mode 100644 sdk/python/feast/templates/spark/feature_store.yaml diff --git a/sdk/python/feast/infra/offline_stores/third_party/spark_source.py b/sdk/python/feast/infra/offline_stores/third_party/spark_source.py index ee84293f10..75d420771d 100644 --- a/sdk/python/feast/infra/offline_stores/third_party/spark_source.py +++ b/sdk/python/feast/infra/offline_stores/third_party/spark_source.py @@ -1,8 +1,10 @@ import pickle +from enum import Enum import warnings from typing import Any, Callable, Dict, Iterable, Optional, Tuple from pyspark.sql.utils import AnalysisException +from pyspark.sql import SparkSession, DataFrame from feast.data_source import DataSource from feast.errors import DataSourceNotFoundException @@ -14,18 +16,20 @@ from feast.saved_dataset import SavedDatasetStorage from feast.type_map import spark_to_feast_value_type from feast.value_type import ValueType +from feast.infra.offline_stores.offline_utils import get_temp_entity_table_name +class SparkSourceFormat(Enum): + csv = "csv" + json = "json" + parquet = "parquet" class SparkSource(DataSource): def __init__( self, table: Optional[str] = None, query: Optional[str] = None, - # TODO support file readers - # path: Optional[str] = None, - # jdbc=None, - # format: Optional[str] = None, - # options: Optional[Dict[str, Any]] = None, + path: Optional[str] = None, + file_format: Optional[str] = None, event_timestamp_column: 
Optional[str] = None, created_timestamp_column: Optional[str] = None, field_mapping: Optional[Dict[str, str]] = None, @@ -42,14 +46,26 @@ def __init__( "This API is unstable and it could and most probably will be changed in the future.", RuntimeWarning, ) + self.allowed_formats = [format.value for format in SparkSourceFormat] + + # Check that only one of the ways to load a spark dataframe can be used. + if sum([(arg is not None) for arg in [table, query, path]]) != 1: + raise ValueError("Exactly one of params(table, query, path) must be specified.") + + if path is not None: + if file_format is None: + raise ValueError( + "If 'path' is specified, then 'file_format' is required." + ) + if file_format not in self.allowed_formats: + raise ValueError(f"'file_format' should be one of {self.allowed_formats}") + self._spark_options = SparkOptions( table=table, query=query, - # path=path, - # jdbc=None, - # format=format, - # options=options, + path=path, + file_format=file_format, ) @property @@ -80,6 +96,21 @@ def query(self): """ return self._spark_options.query + + @property + def path(self): + """ + Returns the path of the spark data source file. + """ + return self._spark_options.path + + @property + def file_format(self): + """ + Returns the file format of this feature data source. + """ + return self._spark_options.file_format + @staticmethod def from_proto(data_source: DataSourceProto) -> Any: @@ -118,7 +149,6 @@ def validate(self, config: RepoConfig): @staticmethod def source_datatype_to_feast_value_type() -> Callable[[str], ValueType]: - # TODO see feast.type_map for examples return spark_to_feast_value_type def get_table_column_names_and_types( @@ -145,9 +175,19 @@ def get_table_query_string(self) -> str: """Returns a string that can directly be used to reference this table in SQL""" if self.table: return f"`{self.table}`" - else: + if self.query: return f"({self.query})" + # If both the table query string and the actual query are null, we can load from file. 
+ spark_session = SparkSession.getActiveSession() + if spark_session is None: + raise AssertionError("Could not find an active spark session.") + df = spark_session.read.format(self.file_format).load(self.path) + + tmp_table_name = get_temp_entity_table_name() + df.createOrReplaceTempView(tmp_table_name) + + return f"`{tmp_table_name}`" class SparkOptions: def __init__( @@ -213,7 +253,6 @@ def to_proto(self) -> DataSourceProto.CustomSourceOptions: return spark_options_proto - class SavedDatasetSparkStorage(SavedDatasetStorage): _proto_attr_name = "spark_storage" diff --git a/sdk/python/feast/templates/spark/bootstrap.py b/sdk/python/feast/templates/spark/bootstrap.py new file mode 100644 index 0000000000..899e466cfc --- /dev/null +++ b/sdk/python/feast/templates/spark/bootstrap.py @@ -0,0 +1,50 @@ +from datetime import datetime, timedelta +from pathlib import Path + +from pyspark.sql import SparkSession +from feast.driver_test_data import ( + create_driver_hourly_stats_df, + create_customer_daily_profile_df, +) + +CURRENT_DIR = Path(__file__).parent +DRIVER_ENTITIES = [1001, 1002, 1003] +CUSTOMER_ENTITIES = [201, 202, 203] +START_DATE = datetime.strptime("2022-01-01", "%Y-%m-%d") +END_DATE = START_DATE + timedelta(days=7) + +def bootstrap(): + # Bootstrap() will automatically be called from the init_repo() during `feast init` + generate_example_data( + spark_session=SparkSession.builder.getOrCreate(), + base_dir=str(CURRENT_DIR), + ) + + +def example_data_exists(base_dir: str) -> bool: + for path in [ + Path(base_dir) / "data" / "driver_hourly_stats", + Path(base_dir) / "data" / "customer_daily_profile", + ]: + if not path.exists(): + return False + return True + + +def generate_example_data(spark_session: SparkSession, base_dir: str) -> None: + spark_session.createDataFrame( + data=create_driver_hourly_stats_df(DRIVER_ENTITIES, START_DATE, END_DATE) + ).write.parquet( + path=str(Path(base_dir) / "data" / "driver_hourly_stats"), + mode="overwrite", + ) + + spark_session.createDataFrame( + data=create_customer_daily_profile_df(CUSTOMER_ENTITIES, START_DATE, END_DATE) + ).write.parquet( + path=str(Path(base_dir) / "data" / "customer_daily_profile"), + mode="overwrite", + ) + +if __name__ == "__main__": + bootstrap() \ No newline at end of file diff --git a/sdk/python/feast/templates/spark/example.py b/sdk/python/feast/templates/spark/example.py new file mode 100644 index 0000000000..68d249c72f --- /dev/null +++ b/sdk/python/feast/templates/spark/example.py @@ -0,0 +1,69 @@ +# # # # # # # # # # # # # # # # # # # # # # # # +# This is an example feature definition file # +# # # # # # # # # # # # # # # # # # # # # # # # + +from pathlib import Path + +from feast import Entity, Feature, FeatureView, ValueType + +from google.protobuf.duration_pb2 import Duration + +from feast_spark_offline_store import SparkSource + +# Constants related to the generated data sets +CURRENT_DIR = Path(__file__).parent + + +# Entity definitions +driver = Entity( + name="driver_id", + value_type=ValueType.INT64, + description="driver id", +) +customer = Entity( + name="customer_id", + value_type=ValueType.INT64, + description="customer id", +) + +# Sources +driver_hourly_stats = SparkSource( + path=f"{CURRENT_DIR}/data/driver_hourly_stats", + file_format="parquet", + event_timestamp_column="event_timestamp", + created_timestamp_column="created", +) +customer_daily_profile = SparkSource( + path=f"{CURRENT_DIR}/data/customer_daily_profile", + file_format="parquet", + event_timestamp_column="event_timestamp", + 
created_timestamp_column="created", +) + +# Feature Views +driver_hourly_stats_view = FeatureView( + name="driver_hourly_stats", + entities=["driver_id"], + ttl=Duration(seconds=86400 * 7), # one week + features=[ + Feature(name="conv_rate", dtype=ValueType.FLOAT), + Feature(name="acc_rate", dtype=ValueType.FLOAT), + Feature(name="avg_daily_trips", dtype=ValueType.INT64), + ], + online=True, + batch_source=driver_hourly_stats, + tags={}, +) +customer_daily_profile_view = FeatureView( + name="customer_daily_profile", + entities=["customer_id"], + ttl=Duration(seconds=86400 * 7), # one week + features=[ + Feature(name="current_balance", dtype=ValueType.FLOAT), + Feature(name="avg_passenger_count", dtype=ValueType.FLOAT), + Feature(name="lifetime_trip_count", dtype=ValueType.INT64), + ], + online=True, + batch_source=customer_daily_profile, + tags={}, +) \ No newline at end of file diff --git a/sdk/python/feast/templates/spark/feature_store.yaml b/sdk/python/feast/templates/spark/feature_store.yaml new file mode 100644 index 0000000000..800cdf009e --- /dev/null +++ b/sdk/python/feast/templates/spark/feature_store.yaml @@ -0,0 +1,15 @@ +project: my_project +registry: data/registry.db +provider: local +offline_store: + type: feast_spark_offline_store.spark.SparkOfflineStore + spark_conf: + spark.master: "local[*]" + spark.ui.enabled: "false" + spark.eventLog.enabled: "false" + spark.sql.catalogImplementation: "hive" + spark.sql.parser.quotedRegexColumnNames: "true" + spark.sql.session.timeZone: "UTC" +online_store: + path: data/online_store.db +## etc: etc.... \ No newline at end of file From 65b113ac69fe3a55c500469964e3f12443bff168 Mon Sep 17 00:00:00 2001 From: Kevin Zhang Date: Wed, 2 Mar 2022 12:27:04 -0800 Subject: [PATCH 22/58] lint Signed-off-by: Kevin Zhang --- .../third_party/spark_source.py | 25 +++++++++++-------- sdk/python/feast/templates/spark/bootstrap.py | 16 ++++++------ sdk/python/feast/templates/spark/example.py | 17 ++++--------- 3 files changed, 27 insertions(+), 31 deletions(-) diff --git a/sdk/python/feast/infra/offline_stores/third_party/spark_source.py b/sdk/python/feast/infra/offline_stores/third_party/spark_source.py index 75d420771d..619ba75f5f 100644 --- a/sdk/python/feast/infra/offline_stores/third_party/spark_source.py +++ b/sdk/python/feast/infra/offline_stores/third_party/spark_source.py @@ -1,13 +1,14 @@ import pickle -from enum import Enum import warnings +from enum import Enum from typing import Any, Callable, Dict, Iterable, Optional, Tuple +from pyspark.sql import DataFrame, SparkSession from pyspark.sql.utils import AnalysisException -from pyspark.sql import SparkSession, DataFrame from feast.data_source import DataSource from feast.errors import DataSourceNotFoundException +from feast.infra.offline_stores.offline_utils import get_temp_entity_table_name from feast.protos.feast.core.DataSource_pb2 import DataSource as DataSourceProto from feast.protos.feast.core.SavedDataset_pb2 import ( SavedDatasetStorage as SavedDatasetStorageProto, @@ -16,13 +17,14 @@ from feast.saved_dataset import SavedDatasetStorage from feast.type_map import spark_to_feast_value_type from feast.value_type import ValueType -from feast.infra.offline_stores.offline_utils import get_temp_entity_table_name + class SparkSourceFormat(Enum): csv = "csv" json = "json" parquet = "parquet" + class SparkSource(DataSource): def __init__( self, @@ -50,7 +52,9 @@ def __init__( # Check that only one of the ways to load a spark dataframe can be used. 
if sum([(arg is not None) for arg in [table, query, path]]) != 1: - raise ValueError("Exactly one of params(table, query, path) must be specified.") + raise ValueError( + "Exactly one of params(table, query, path) must be specified." + ) if path is not None: if file_format is None: @@ -58,14 +62,12 @@ def __init__( "If 'path' is specified, then 'file_format' is required." ) if file_format not in self.allowed_formats: - raise ValueError(f"'file_format' should be one of {self.allowed_formats}") - + raise ValueError( + f"'file_format' should be one of {self.allowed_formats}" + ) self._spark_options = SparkOptions( - table=table, - query=query, - path=path, - file_format=file_format, + table=table, query=query, path=path, file_format=file_format, ) @property @@ -96,7 +98,6 @@ def query(self): """ return self._spark_options.query - @property def path(self): """ @@ -189,6 +190,7 @@ def get_table_query_string(self) -> str: return f"`{tmp_table_name}`" + class SparkOptions: def __init__( self, table: Optional[str] = None, query: Optional[str] = None, @@ -253,6 +255,7 @@ def to_proto(self) -> DataSourceProto.CustomSourceOptions: return spark_options_proto + class SavedDatasetSparkStorage(SavedDatasetStorage): _proto_attr_name = "spark_storage" diff --git a/sdk/python/feast/templates/spark/bootstrap.py b/sdk/python/feast/templates/spark/bootstrap.py index 899e466cfc..155a86bf48 100644 --- a/sdk/python/feast/templates/spark/bootstrap.py +++ b/sdk/python/feast/templates/spark/bootstrap.py @@ -2,9 +2,10 @@ from pathlib import Path from pyspark.sql import SparkSession + from feast.driver_test_data import ( - create_driver_hourly_stats_df, create_customer_daily_profile_df, + create_driver_hourly_stats_df, ) CURRENT_DIR = Path(__file__).parent @@ -13,11 +14,11 @@ START_DATE = datetime.strptime("2022-01-01", "%Y-%m-%d") END_DATE = START_DATE + timedelta(days=7) + def bootstrap(): # Bootstrap() will automatically be called from the init_repo() during `feast init` generate_example_data( - spark_session=SparkSession.builder.getOrCreate(), - base_dir=str(CURRENT_DIR), + spark_session=SparkSession.builder.getOrCreate(), base_dir=str(CURRENT_DIR), ) @@ -35,16 +36,15 @@ def generate_example_data(spark_session: SparkSession, base_dir: str) -> None: spark_session.createDataFrame( data=create_driver_hourly_stats_df(DRIVER_ENTITIES, START_DATE, END_DATE) ).write.parquet( - path=str(Path(base_dir) / "data" / "driver_hourly_stats"), - mode="overwrite", + path=str(Path(base_dir) / "data" / "driver_hourly_stats"), mode="overwrite", ) spark_session.createDataFrame( data=create_customer_daily_profile_df(CUSTOMER_ENTITIES, START_DATE, END_DATE) ).write.parquet( - path=str(Path(base_dir) / "data" / "customer_daily_profile"), - mode="overwrite", + path=str(Path(base_dir) / "data" / "customer_daily_profile"), mode="overwrite", ) + if __name__ == "__main__": - bootstrap() \ No newline at end of file + bootstrap() diff --git a/sdk/python/feast/templates/spark/example.py b/sdk/python/feast/templates/spark/example.py index 68d249c72f..ecb7500c2e 100644 --- a/sdk/python/feast/templates/spark/example.py +++ b/sdk/python/feast/templates/spark/example.py @@ -4,26 +4,19 @@ from pathlib import Path -from feast import Entity, Feature, FeatureView, ValueType - +from feast_spark_offline_store import SparkSource from google.protobuf.duration_pb2 import Duration -from feast_spark_offline_store import SparkSource +from feast import Entity, Feature, FeatureView, ValueType # Constants related to the generated data sets CURRENT_DIR = 
Path(__file__).parent # Entity definitions -driver = Entity( - name="driver_id", - value_type=ValueType.INT64, - description="driver id", -) +driver = Entity(name="driver_id", value_type=ValueType.INT64, description="driver id",) customer = Entity( - name="customer_id", - value_type=ValueType.INT64, - description="customer id", + name="customer_id", value_type=ValueType.INT64, description="customer id", ) # Sources @@ -66,4 +59,4 @@ online=True, batch_source=customer_daily_profile, tags={}, -) \ No newline at end of file +) From 7adb8d2ffece436c88c69b2a25a15992e2714e3e Mon Sep 17 00:00:00 2001 From: Kevin Zhang Date: Wed, 2 Mar 2022 13:06:32 -0800 Subject: [PATCH 23/58] Add example feature repo Signed-off-by: Kevin Zhang --- .../third_party/spark_source.py | 58 ++++++++++++++----- sdk/python/feast/repo_config.py | 1 + sdk/python/feast/templates/spark/example.py | 2 +- .../feast/templates/spark/feature_store.yaml | 2 +- 4 files changed, 48 insertions(+), 15 deletions(-) diff --git a/sdk/python/feast/infra/offline_stores/third_party/spark_source.py b/sdk/python/feast/infra/offline_stores/third_party/spark_source.py index 619ba75f5f..24614efdb6 100644 --- a/sdk/python/feast/infra/offline_stores/third_party/spark_source.py +++ b/sdk/python/feast/infra/offline_stores/third_party/spark_source.py @@ -118,15 +118,12 @@ def from_proto(data_source: DataSourceProto) -> Any: assert data_source.HasField("custom_options") spark_options = SparkOptions.from_proto(data_source.custom_options) - return SparkSource( field_mapping=dict(data_source.field_mapping), table=spark_options.table, query=spark_options.query, - # path=spark_options.path, - # jdbc=None, - # format=spark_options.format, - # options=spark_options.options, + path=spark_options.path, + file_format=spark_options.file_format, event_timestamp_column=data_source.event_timestamp_column, created_timestamp_column=data_source.created_timestamp_column, date_partition_column=data_source.date_partition_column, @@ -155,22 +152,22 @@ def source_datatype_to_feast_value_type() -> Callable[[str], ValueType]: def get_table_column_names_and_types( self, config: RepoConfig ) -> Iterable[Tuple[str, str]]: - from feast_spark_offline_store.spark import ( + from feast.infra.offline_stores.third_party.spark import ( get_spark_session_or_start_new_with_repoconfig, ) spark_session = get_spark_session_or_start_new_with_repoconfig( - config.offline_store + store_config=config.offline_store ) + df = spark_session.sql(f"SELECT * FROM {self.get_table_query_string()}") try: return ( (fields["name"], fields["type"]) - for fields in spark_session.table(self.table).schema.jsonValue()[ - "fields" - ] + for fields in df.schema.jsonValue()["fields"] ) except AnalysisException: - raise DataSourceNotFoundException(self.table) + raise DataSourceNotFoundException() # TODO: review error handling + def get_table_query_string(self) -> str: """Returns a string that can directly be used to reference this table in SQL""" @@ -193,10 +190,13 @@ def get_table_query_string(self) -> str: class SparkOptions: def __init__( - self, table: Optional[str] = None, query: Optional[str] = None, + self, table: Optional[str] = None, query: Optional[str] = None, path: Optional[str] = None, file_format: Optional[str] = None ): self._table = table self._query = query + self._path = path + self._file_format = file_format + @property def table(self): @@ -226,6 +226,35 @@ def query(self, query): """ self._query = query + @property + def path(self): + """ + Returns the path + """ + return self._path + + 
@path.setter + def path(self, path): + """ + Sets the path + """ + self._path = path + + @property + def file_format(self): + """ + Returns the file_format + """ + return self._file_format + + @file_format.setter + def file_format(self, file_format): + """ + Sets the file_format + """ + self._file_format = file_format + + @classmethod def from_proto(cls, spark_options_proto: DataSourceProto.CustomSourceOptions): """ @@ -238,7 +267,10 @@ def from_proto(cls, spark_options_proto: DataSourceProto.CustomSourceOptions): spark_configuration = pickle.loads(spark_options_proto.configuration) spark_options = cls( - table=spark_configuration.table, query=spark_configuration.query, + table=spark_configuration.table, + query=spark_configuration.query, + path=spark_configuration.path, + file_format=spark_configuration.file_format, ) return spark_options diff --git a/sdk/python/feast/repo_config.py b/sdk/python/feast/repo_config.py index 3f32d18b80..97e97e873b 100644 --- a/sdk/python/feast/repo_config.py +++ b/sdk/python/feast/repo_config.py @@ -39,6 +39,7 @@ "bigquery": "feast.infra.offline_stores.bigquery.BigQueryOfflineStore", "redshift": "feast.infra.offline_stores.redshift.RedshiftOfflineStore", "snowflake.offline": "feast.infra.offline_stores.snowflake.SnowflakeOfflineStore", + "spark": "feast.infra.offline_stores.third_party.spark.SparkOfflineStore", } FEATURE_SERVER_CONFIG_CLASS_FOR_TYPE = { diff --git a/sdk/python/feast/templates/spark/example.py b/sdk/python/feast/templates/spark/example.py index ecb7500c2e..c712e95c83 100644 --- a/sdk/python/feast/templates/spark/example.py +++ b/sdk/python/feast/templates/spark/example.py @@ -4,7 +4,7 @@ from pathlib import Path -from feast_spark_offline_store import SparkSource +from feast.infra.offline_stores.third_party.spark_source import SparkSource from google.protobuf.duration_pb2 import Duration from feast import Entity, Feature, FeatureView, ValueType diff --git a/sdk/python/feast/templates/spark/feature_store.yaml b/sdk/python/feast/templates/spark/feature_store.yaml index 800cdf009e..2975cb2df1 100644 --- a/sdk/python/feast/templates/spark/feature_store.yaml +++ b/sdk/python/feast/templates/spark/feature_store.yaml @@ -2,7 +2,7 @@ project: my_project registry: data/registry.db provider: local offline_store: - type: feast_spark_offline_store.spark.SparkOfflineStore + type: spark spark_conf: spark.master: "local[*]" spark.ui.enabled: "false" From 648f935fb8200d78bce5af3a3906d7671d1e5295 Mon Sep 17 00:00:00 2001 From: Kevin Zhang Date: Wed, 2 Mar 2022 13:06:57 -0800 Subject: [PATCH 24/58] Update data source creator Signed-off-by: Kevin Zhang --- .../universal/data_sources/spark_data_source_creator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdk/python/tests/integration/feature_repos/universal/data_sources/spark_data_source_creator.py b/sdk/python/tests/integration/feature_repos/universal/data_sources/spark_data_source_creator.py index bd1e19e7eb..df55512eb7 100644 --- a/sdk/python/tests/integration/feature_repos/universal/data_sources/spark_data_source_creator.py +++ b/sdk/python/tests/integration/feature_repos/universal/data_sources/spark_data_source_creator.py @@ -50,7 +50,7 @@ def teardown(self): def create_offline_store_config(self): self.spark_offline_store_config = SparkOfflineStoreConfig() self.spark_offline_store_config.type = ( - "feast_spark_offline_store.spark.SparkOfflineStore" + "sdk.python.feast.infra.third_party.spark.SparkOfflineStore" ) self.spark_offline_store_config.spark_conf = self.spark_conf return 
self.spark_offline_store_config

From 7b84ac19c72e76621c74b858bd88c269a19ecd36 Mon Sep 17 00:00:00 2001
From: Kevin Zhang
Date: Wed, 2 Mar 2022 13:13:50 -0800
Subject: [PATCH 25/58] Make cli work for feast init with spark

Signed-off-by: Kevin Zhang
---
 sdk/python/feast/cli.py | 2 +-
 sdk/python/setup.py | 5 +++++
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/sdk/python/feast/cli.py b/sdk/python/feast/cli.py
index c23c3d104a..b835e239bd 100644
--- a/sdk/python/feast/cli.py
+++ b/sdk/python/feast/cli.py
@@ -477,7 +477,7 @@ def materialize_incremental_command(ctx: click.Context, end_ts: str, views: List
 @click.option(
     "--template",
     "-t",
-    type=click.Choice(["local", "gcp", "aws", "snowflake"], case_sensitive=False),
+    type=click.Choice(["local", "gcp", "aws", "snowflake", "spark"], case_sensitive=False),
     help="Specify a template for the created project",
     default="local",
 )
diff --git a/sdk/python/setup.py b/sdk/python/setup.py
index 80daff00eb..8029f9570d 100644
--- a/sdk/python/setup.py
+++ b/sdk/python/setup.py
@@ -92,6 +92,10 @@
     "snowflake-connector-python[pandas]>=2.7.3",
 ]

+SPARK_REQUIRED = [
+    "pyspark>=3.0",
+]
+
 GE_REQUIRED = [
     "great_expectations>=0.14.0,<0.15.0"
 ]
@@ -141,6 +145,7 @@
     + REDIS_REQUIRED
     + AWS_REQUIRED
     + SNOWFLAKE_REQUIRED
+    * SPARK_REQUIRED
     + GE_REQUIRED
 )

From b066a6f2caee11bc57dc85f9c6a7079944d514df Mon Sep 17 00:00:00 2001
From: Kevin Zhang
Date: Wed, 2 Mar 2022 13:37:53 -0800
Subject: [PATCH 26/58] Update the docs

Signed-off-by: Kevin Zhang
---
 docs/reference/data-sources/README.md | 2 +
 docs/reference/data-sources/spark.md | 48 +++++++++++++++++++
 docs/reference/offline-stores/README.md | 2 +
 docs/reference/offline-stores/spark.md | 40 ++++++++++++++++
 .../feast/templates/spark/feature_store.yaml | 1 -
 .../data_sources/spark_data_source_creator.py | 2 +-
 6 files changed, 93 insertions(+), 2 deletions(-)
 create mode 100644 docs/reference/data-sources/spark.md
 create mode 100644 docs/reference/offline-stores/spark.md

diff --git a/docs/reference/data-sources/README.md b/docs/reference/data-sources/README.md
index fc6e136a9c..cba652a91a 100644
--- a/docs/reference/data-sources/README.md
+++ b/docs/reference/data-sources/README.md
@@ -9,3 +9,5 @@ Please see [Data Source](../../getting-started/concepts/feature-view.md#data-sou
 {% page-ref page="bigquery.md" %}

 {% page-ref page="redshift.md" %}
+
+{% page-ref page="spark.md" %}
diff --git a/docs/reference/data-sources/spark.md b/docs/reference/data-sources/spark.md
new file mode 100644
index 0000000000..9499f9a310
--- /dev/null
+++ b/docs/reference/data-sources/spark.md
@@ -0,0 +1,48 @@
+# Spark
+
+## Description
+
+**NOTE**: The Spark data source API is currently in alpha development and is not completely stable. The API may change in the future.
+
+The Spark data source API allows for the retrieval of historical feature values from file and database sources, both for building training datasets and for materializing features into an online store.
+
+* Either a table name, a SQL query, or a file path can be provided.
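When a `path` is provided, the source is read through the active Spark session and registered as a temporary view at query time. A minimal sketch of that behavior, assuming a local Spark session and an example parquet path; the view name below is illustrative, Feast generates its own temporary name:

```python
from pyspark.sql import SparkSession

# Minimal sketch of how a path-based SparkSource is resolved at query time.
# The master URL, parquet path, and view name are example values.
spark = SparkSession.builder.master("local[*]").getOrCreate()

# file_format and path come from the SparkSource definition.
df = spark.read.format("parquet").load("data/driver_hourly_stats")
df.createOrReplaceTempView("tmp_driver_hourly_stats")

# The generated feature retrieval SQL can then reference the temporary view.
spark.sql("SELECT * FROM tmp_driver_hourly_stats LIMIT 5").show()
```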
+
+## Examples
+
+Using a table reference from SparkSession (for example, either in memory or a Hive Metastore)
+
+```python
+from feast import SparkSource
+
+my_spark_source = SparkSource(
+    table="FEATURE_TABLE",
+)
+```
+
+Using a query
+
+```python
+from feast import SparkSource
+
+my_spark_source = SparkSource(
+    query="SELECT timestamp as ts, created, f1, f2 "
+          "FROM spark_table",
+)
+```
+
+Using a file reference
+
+```python
+from feast import SparkSource
+
+my_spark_source = SparkSource(
+    path=f"{CURRENT_DIR}/data/driver_hourly_stats",
+    file_format="parquet",
+    event_timestamp_column="event_timestamp",
+    created_timestamp_column="created",
+)
+```
+
+
+Configuration options are available [here](https://rtd.feast.dev/en/latest/index.html#feast.data_source.SnowflakeSource).
diff --git a/docs/reference/offline-stores/README.md b/docs/reference/offline-stores/README.md
index 141a34d03b..087c6918f1 100644
--- a/docs/reference/offline-stores/README.md
+++ b/docs/reference/offline-stores/README.md
@@ -9,3 +9,5 @@ Please see [Offline Store](../../getting-started/architecture-and-components/off
 {% page-ref page="bigquery.md" %}

 {% page-ref page="redshift.md" %}
+
+{% page-ref page="snowflake.md" %}
diff --git a/docs/reference/offline-stores/spark.md b/docs/reference/offline-stores/spark.md
new file mode 100644
index 0000000000..33fd71347d
--- /dev/null
+++ b/docs/reference/offline-stores/spark.md
@@ -0,0 +1,40 @@
+# Spark
+
+## Description
+
+The Spark offline store is an offline store currently in alpha development that provides support for reading [SparkSources](../data-sources/spark.md).
+
+## Disclaimer
+
+This Spark offline store still does not achieve full test coverage and continues to fail some integration tests when integrating with the feast universal test suite. Please do NOT assume complete stability of the API.
+
+As of 3/1/2022, 159/194 integration tests pass.
+
+* Spark tables and views are allowed as sources that are loaded in from some Spark store(e.g in Hive or in memory).
+* Entity dataframes can be provided as a SQL query or can be provided as a Pandas dataframe. Pandas dataframes will be converted to a Spark dataframe and processed as a temporary view.
+* A `SparkRetrievalJob` is returned when calling `get_historical_features()`.
+    * This allows you to call
+        * `to_df` to retrieve the pandas dataframe.
+        * `to_arrow` to retrieve the dataframe as a pyarrow Table.
+        * `to_spark_df` to retrieve the dataframe as a Spark dataframe.
+
+## Example
+
+{% code title="feature_store.yaml" %}
+```yaml
+project: my_project
+registry: data/registry.db
+provider: local
+offline_store:
+    type: spark
+    spark_conf:
+        spark.master: "local[*]"
+        spark.ui.enabled: "false"
+        spark.eventLog.enabled: "false"
+        spark.sql.catalogImplementation: "hive"
+        spark.sql.parser.quotedRegexColumnNames: "true"
+        spark.sql.session.timeZone: "UTC"
+online_store:
+    path: data/online_store.db
+```
+{% endcode %}
diff --git a/sdk/python/feast/templates/spark/feature_store.yaml b/sdk/python/feast/templates/spark/feature_store.yaml
index 800cdf009e..2975cb2df1 100644
--- a/sdk/python/feast/templates/spark/feature_store.yaml
+++ b/sdk/python/feast/templates/spark/feature_store.yaml
@@ -2,7 +2,7 @@ project: my_project
 registry: data/registry.db
 provider: local
 offline_store:
-    type: feast_spark_offline_store.spark.SparkOfflineStore
+    type: spark
     spark_conf:
         spark.master: "local[*]"
         spark.ui.enabled: "false"
@@ -12,4 +12,3 @@ offline_store:
         spark.sql.session.timeZone: "UTC"
 online_store:
     path: data/online_store.db
-## etc: etc....
\ No newline at end of file diff --git a/sdk/python/tests/integration/feature_repos/universal/data_sources/spark_data_source_creator.py b/sdk/python/tests/integration/feature_repos/universal/data_sources/spark_data_source_creator.py index df55512eb7..0a4e52ed9c 100644 --- a/sdk/python/tests/integration/feature_repos/universal/data_sources/spark_data_source_creator.py +++ b/sdk/python/tests/integration/feature_repos/universal/data_sources/spark_data_source_creator.py @@ -50,7 +50,7 @@ def teardown(self): def create_offline_store_config(self): self.spark_offline_store_config = SparkOfflineStoreConfig() self.spark_offline_store_config.type = ( - "sdk.python.feast.infra.third_party.spark.SparkOfflineStore" + "spark" ) self.spark_offline_store_config.spark_conf = self.spark_conf return self.spark_offline_store_config From e0099ae61a1ac8d232e58330291a818c4f04edcd Mon Sep 17 00:00:00 2001 From: Kevin Zhang Date: Wed, 2 Mar 2022 14:07:00 -0800 Subject: [PATCH 27/58] Clean up code Signed-off-by: Kevin Zhang --- sdk/python/feast/__init__.py | 2 +- sdk/python/feast/cli.py | 4 +++- .../spark_offline_store/__init__.py | 0 .../{ => spark_offline_store}/spark.py | 4 ++-- .../{ => spark_offline_store}/spark_source.py | 22 +++++++++---------- .../third_party/spark_type_map.py | 2 -- sdk/python/feast/repo_config.py | 2 +- sdk/python/feast/templates/spark/__init__.py | 0 sdk/python/feast/templates/spark/example.py | 2 +- .../data_sources/spark_data_source_creator.py | 8 +++---- 10 files changed, 21 insertions(+), 25 deletions(-) create mode 100644 sdk/python/feast/infra/offline_stores/third_party/spark_offline_store/__init__.py rename sdk/python/feast/infra/offline_stores/third_party/{ => spark_offline_store}/spark.py (99%) rename sdk/python/feast/infra/offline_stores/third_party/{ => spark_offline_store}/spark_source.py (95%) delete mode 100644 sdk/python/feast/infra/offline_stores/third_party/spark_type_map.py create mode 100644 sdk/python/feast/templates/spark/__init__.py diff --git a/sdk/python/feast/__init__.py b/sdk/python/feast/__init__.py index 1974b49222..2c0b3da54e 100644 --- a/sdk/python/feast/__init__.py +++ b/sdk/python/feast/__init__.py @@ -6,7 +6,7 @@ from feast.infra.offline_stores.file_source import FileSource from feast.infra.offline_stores.redshift_source import RedshiftSource from feast.infra.offline_stores.snowflake_source import SnowflakeSource -from feast.infra.offline_stores.third_party.spark_source import SparkSource +from feast.infra.offline_stores.third_party.spark_offline_store.spark_source import SparkSource from .data_source import KafkaSource, KinesisSource, SourceType from .entity import Entity diff --git a/sdk/python/feast/cli.py b/sdk/python/feast/cli.py index b835e239bd..febac56fcc 100644 --- a/sdk/python/feast/cli.py +++ b/sdk/python/feast/cli.py @@ -477,7 +477,9 @@ def materialize_incremental_command(ctx: click.Context, end_ts: str, views: List @click.option( "--template", "-t", - type=click.Choice(["local", "gcp", "aws", "snowflake", "spark"], case_sensitive=False), + type=click.Choice( + ["local", "gcp", "aws", "snowflake", "spark"], case_sensitive=False + ), help="Specify a template for the created project", default="local", ) diff --git a/sdk/python/feast/infra/offline_stores/third_party/spark_offline_store/__init__.py b/sdk/python/feast/infra/offline_stores/third_party/spark_offline_store/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/sdk/python/feast/infra/offline_stores/third_party/spark.py 
b/sdk/python/feast/infra/offline_stores/third_party/spark_offline_store/spark.py similarity index 99% rename from sdk/python/feast/infra/offline_stores/third_party/spark.py rename to sdk/python/feast/infra/offline_stores/third_party/spark_offline_store/spark.py index 1edffa2aab..976510f6b0 100644 --- a/sdk/python/feast/infra/offline_stores/third_party/spark.py +++ b/sdk/python/feast/infra/offline_stores/third_party/spark_offline_store/spark.py @@ -24,7 +24,7 @@ RetrievalMetadata, ) from feast.infra.offline_stores.offline_utils import FeatureViewQueryContext -from feast.infra.offline_stores.third_party.spark_source import SparkSource +from feast.infra.offline_stores.third_party.spark_offline_store.spark_source import SparkSource from feast.registry import Registry from feast.repo_config import FeastConfigBaseModel, RepoConfig from feast.saved_dataset import SavedDatasetStorage @@ -228,7 +228,7 @@ def __init__( spark_session: SparkSession, query: str, full_feature_names: bool, - on_demand_feature_views: Optional[List[OnDemandFeatureView]], + on_demand_feature_views: Optional[List[OnDemandFeatureView]] = None, metadata: Optional[RetrievalMetadata] = None, ): super().__init__() diff --git a/sdk/python/feast/infra/offline_stores/third_party/spark_source.py b/sdk/python/feast/infra/offline_stores/third_party/spark_offline_store/spark_source.py similarity index 95% rename from sdk/python/feast/infra/offline_stores/third_party/spark_source.py rename to sdk/python/feast/infra/offline_stores/third_party/spark_offline_store/spark_source.py index 24614efdb6..2081e994b5 100644 --- a/sdk/python/feast/infra/offline_stores/third_party/spark_source.py +++ b/sdk/python/feast/infra/offline_stores/third_party/spark_offline_store/spark_source.py @@ -152,7 +152,7 @@ def source_datatype_to_feast_value_type() -> Callable[[str], ValueType]: def get_table_column_names_and_types( self, config: RepoConfig ) -> Iterable[Tuple[str, str]]: - from feast.infra.offline_stores.third_party.spark import ( + from feast.infra.offline_stores.third_party.spark_offline_store.spark import ( get_spark_session_or_start_new_with_repoconfig, ) @@ -160,14 +160,10 @@ def get_table_column_names_and_types( store_config=config.offline_store ) df = spark_session.sql(f"SELECT * FROM {self.get_table_query_string()}") - try: - return ( - (fields["name"], fields["type"]) - for fields in df.schema.jsonValue()["fields"] - ) - except AnalysisException: - raise DataSourceNotFoundException() # TODO: review error handling - + return ( + (fields["name"], fields["type"]) + for fields in df.schema.jsonValue()["fields"] + ) def get_table_query_string(self) -> str: """Returns a string that can directly be used to reference this table in SQL""" @@ -190,14 +186,17 @@ def get_table_query_string(self) -> str: class SparkOptions: def __init__( - self, table: Optional[str] = None, query: Optional[str] = None, path: Optional[str] = None, file_format: Optional[str] = None + self, + table: Optional[str] = None, + query: Optional[str] = None, + path: Optional[str] = None, + file_format: Optional[str] = None, ): self._table = table self._query = query self._path = path self._file_format = file_format - @property def table(self): """ @@ -254,7 +253,6 @@ def file_format(self, file_format): """ self._file_format = file_format - @classmethod def from_proto(cls, spark_options_proto: DataSourceProto.CustomSourceOptions): """ diff --git a/sdk/python/feast/infra/offline_stores/third_party/spark_type_map.py b/sdk/python/feast/infra/offline_stores/third_party/spark_type_map.py 
deleted file mode 100644 index 139597f9cb..0000000000 --- a/sdk/python/feast/infra/offline_stores/third_party/spark_type_map.py +++ /dev/null @@ -1,2 +0,0 @@ - - diff --git a/sdk/python/feast/repo_config.py b/sdk/python/feast/repo_config.py index 97e97e873b..9c67a6bedc 100644 --- a/sdk/python/feast/repo_config.py +++ b/sdk/python/feast/repo_config.py @@ -39,7 +39,7 @@ "bigquery": "feast.infra.offline_stores.bigquery.BigQueryOfflineStore", "redshift": "feast.infra.offline_stores.redshift.RedshiftOfflineStore", "snowflake.offline": "feast.infra.offline_stores.snowflake.SnowflakeOfflineStore", - "spark": "feast.infra.offline_stores.third_party.spark.SparkOfflineStore", + "spark": "feast.infra.offline_stores.third_party.spark_offline_store.spark.SparkOfflineStore", } FEATURE_SERVER_CONFIG_CLASS_FOR_TYPE = { diff --git a/sdk/python/feast/templates/spark/__init__.py b/sdk/python/feast/templates/spark/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/sdk/python/feast/templates/spark/example.py b/sdk/python/feast/templates/spark/example.py index c712e95c83..0bd91c4381 100644 --- a/sdk/python/feast/templates/spark/example.py +++ b/sdk/python/feast/templates/spark/example.py @@ -4,10 +4,10 @@ from pathlib import Path -from feast.infra.offline_stores.third_party.spark_source import SparkSource from google.protobuf.duration_pb2 import Duration from feast import Entity, Feature, FeatureView, ValueType +from feast.infra.offline_stores.third_party.spark_offline_store.spark_source import SparkSource # Constants related to the generated data sets CURRENT_DIR = Path(__file__).parent diff --git a/sdk/python/tests/integration/feature_repos/universal/data_sources/spark_data_source_creator.py b/sdk/python/tests/integration/feature_repos/universal/data_sources/spark_data_source_creator.py index 0a4e52ed9c..8e4b38614e 100644 --- a/sdk/python/tests/integration/feature_repos/universal/data_sources/spark_data_source_creator.py +++ b/sdk/python/tests/integration/feature_repos/universal/data_sources/spark_data_source_creator.py @@ -6,8 +6,8 @@ from pyspark.sql import SparkSession from feast.data_source import DataSource -from feast.infra.offline_stores.third_party.spark import SparkOfflineStoreConfig -from feast.infra.offline_stores.third_party.spark_source import ( +from feast.infra.offline_stores.third_party.spark_offline_store.spark import SparkOfflineStoreConfig +from feast.infra.offline_stores.third_party.spark_offline_store.spark_source import ( SavedDatasetSparkStorage, SparkSource, ) @@ -49,9 +49,7 @@ def teardown(self): def create_offline_store_config(self): self.spark_offline_store_config = SparkOfflineStoreConfig() - self.spark_offline_store_config.type = ( - "spark" - ) + self.spark_offline_store_config.type = "spark" self.spark_offline_store_config.spark_conf = self.spark_conf return self.spark_offline_store_config From 86e74c0d85e027c7edf1b4a7e6041662114d49fc Mon Sep 17 00:00:00 2001 From: Kevin Zhang Date: Wed, 2 Mar 2022 14:14:15 -0800 Subject: [PATCH 28/58] Clean up more code Signed-off-by: Kevin Zhang --- docs/reference/offline-stores/spark.md | 2 +- sdk/python/feast/__init__.py | 5 ++++- .../offline_stores/third_party/spark_offline_store/spark.py | 4 +++- .../third_party/spark_offline_store/spark_source.py | 4 +--- sdk/python/feast/templates/spark/example.py | 4 +++- .../universal/data_sources/spark_data_source_creator.py | 4 +++- 6 files changed, 15 insertions(+), 8 deletions(-) diff --git a/docs/reference/offline-stores/spark.md b/docs/reference/offline-stores/spark.md 
index 33fd71347d..3a37d0b185 100644 --- a/docs/reference/offline-stores/spark.md +++ b/docs/reference/offline-stores/spark.md @@ -8,7 +8,7 @@ The Spark offline store is an offline store currently in alpha development that This Spark offline store still does not achieve full test coverage and continues to fail some integration tests when integrating with the feast universal test suite. Please do NOT assume complete stability of the API. -As of 3/1/2022, 159/194 integration tests pass. +As of 3/1/2022, 179/194 integration tests pass. * Spark tables and views are allowed as sources that are loaded in from some Spark store(e.g in Hive or in memory). * Entity dataframes can be provided as a SQL query or can be provided as a Pandas dataframe. Pandas dataframes will be converted to a Spark dataframe and processed as a temporary view. diff --git a/sdk/python/feast/__init__.py b/sdk/python/feast/__init__.py index 2c0b3da54e..674cadc2a2 100644 --- a/sdk/python/feast/__init__.py +++ b/sdk/python/feast/__init__.py @@ -6,7 +6,9 @@ from feast.infra.offline_stores.file_source import FileSource from feast.infra.offline_stores.redshift_source import RedshiftSource from feast.infra.offline_stores.snowflake_source import SnowflakeSource -from feast.infra.offline_stores.third_party.spark_offline_store.spark_source import SparkSource +from feast.infra.offline_stores.third_party.spark_offline_store.spark_source import ( + SparkSource, +) from .data_source import KafkaSource, KinesisSource, SourceType from .entity import Entity @@ -48,4 +50,5 @@ "RedshiftSource", "RequestFeatureView", "SnowflakeSource", + "SparkSource", ] diff --git a/sdk/python/feast/infra/offline_stores/third_party/spark_offline_store/spark.py b/sdk/python/feast/infra/offline_stores/third_party/spark_offline_store/spark.py index 976510f6b0..5f142a2de8 100644 --- a/sdk/python/feast/infra/offline_stores/third_party/spark_offline_store/spark.py +++ b/sdk/python/feast/infra/offline_stores/third_party/spark_offline_store/spark.py @@ -24,7 +24,9 @@ RetrievalMetadata, ) from feast.infra.offline_stores.offline_utils import FeatureViewQueryContext -from feast.infra.offline_stores.third_party.spark_offline_store.spark_source import SparkSource +from feast.infra.offline_stores.third_party.spark_offline_store.spark_source import ( + SparkSource, +) from feast.registry import Registry from feast.repo_config import FeastConfigBaseModel, RepoConfig from feast.saved_dataset import SavedDatasetStorage diff --git a/sdk/python/feast/infra/offline_stores/third_party/spark_offline_store/spark_source.py b/sdk/python/feast/infra/offline_stores/third_party/spark_offline_store/spark_source.py index 2081e994b5..801be2838a 100644 --- a/sdk/python/feast/infra/offline_stores/third_party/spark_offline_store/spark_source.py +++ b/sdk/python/feast/infra/offline_stores/third_party/spark_offline_store/spark_source.py @@ -3,11 +3,9 @@ from enum import Enum from typing import Any, Callable, Dict, Iterable, Optional, Tuple -from pyspark.sql import DataFrame, SparkSession -from pyspark.sql.utils import AnalysisException +from pyspark.sql import SparkSession from feast.data_source import DataSource -from feast.errors import DataSourceNotFoundException from feast.infra.offline_stores.offline_utils import get_temp_entity_table_name from feast.protos.feast.core.DataSource_pb2 import DataSource as DataSourceProto from feast.protos.feast.core.SavedDataset_pb2 import ( diff --git a/sdk/python/feast/templates/spark/example.py b/sdk/python/feast/templates/spark/example.py index 
0bd91c4381..bbeff14c48 100644 --- a/sdk/python/feast/templates/spark/example.py +++ b/sdk/python/feast/templates/spark/example.py @@ -7,7 +7,9 @@ from google.protobuf.duration_pb2 import Duration from feast import Entity, Feature, FeatureView, ValueType -from feast.infra.offline_stores.third_party.spark_offline_store.spark_source import SparkSource +from feast.infra.offline_stores.third_party.spark_offline_store.spark_source import ( + SparkSource, +) # Constants related to the generated data sets CURRENT_DIR = Path(__file__).parent diff --git a/sdk/python/tests/integration/feature_repos/universal/data_sources/spark_data_source_creator.py b/sdk/python/tests/integration/feature_repos/universal/data_sources/spark_data_source_creator.py index 8e4b38614e..2bdaeb960e 100644 --- a/sdk/python/tests/integration/feature_repos/universal/data_sources/spark_data_source_creator.py +++ b/sdk/python/tests/integration/feature_repos/universal/data_sources/spark_data_source_creator.py @@ -6,7 +6,9 @@ from pyspark.sql import SparkSession from feast.data_source import DataSource -from feast.infra.offline_stores.third_party.spark_offline_store.spark import SparkOfflineStoreConfig +from feast.infra.offline_stores.third_party.spark_offline_store.spark import ( + SparkOfflineStoreConfig, +) from feast.infra.offline_stores.third_party.spark_offline_store.spark_source import ( SavedDatasetSparkStorage, SparkSource, From 6fe5b9eb8bcf6e2241dab64071305ce71cce3140 Mon Sep 17 00:00:00 2001 From: Kevin Zhang Date: Wed, 2 Mar 2022 14:18:27 -0800 Subject: [PATCH 29/58] Uncomment repo configs Signed-off-by: Kevin Zhang --- .../feature_repos/repo_configuration.py | 62 +++++++++---------- 1 file changed, 31 insertions(+), 31 deletions(-) diff --git a/sdk/python/tests/integration/feature_repos/repo_configuration.py b/sdk/python/tests/integration/feature_repos/repo_configuration.py index 3fc1b70f4e..ed342de47c 100644 --- a/sdk/python/tests/integration/feature_repos/repo_configuration.py +++ b/sdk/python/tests/integration/feature_repos/repo_configuration.py @@ -74,40 +74,40 @@ DEFAULT_FULL_REPO_CONFIGS.extend( [ # Redis configurations - # IntegrationTestRepoConfig(online_store=REDIS_CONFIG), - # IntegrationTestRepoConfig(online_store=REDIS_CLUSTER_CONFIG), + IntegrationTestRepoConfig(online_store=REDIS_CONFIG), + IntegrationTestRepoConfig(online_store=REDIS_CLUSTER_CONFIG), # GCP configurations IntegrationTestRepoConfig( provider="local", offline_store_creator=SparkDataSourceCreator, - ) - # IntegrationTestRepoConfig( - # provider="gcp", - # offline_store_creator=BigQueryDataSourceCreator, - # online_store="datastore", - # ), - # IntegrationTestRepoConfig( - # provider="gcp", - # offline_store_creator=BigQueryDataSourceCreator, - # online_store=REDIS_CONFIG, - # ), - # # AWS configurations - # IntegrationTestRepoConfig( - # provider="aws", - # offline_store_creator=RedshiftDataSourceCreator, - # online_store=DYNAMO_CONFIG, - # python_feature_server=True, - # ), - # IntegrationTestRepoConfig( - # provider="aws", - # offline_store_creator=RedshiftDataSourceCreator, - # online_store=REDIS_CONFIG, - # ), - # # Snowflake configurations - # IntegrationTestRepoConfig( - # provider="aws", # no list features, no feature server - # offline_store_creator=SnowflakeDataSourceCreator, - # online_store=REDIS_CONFIG, - # ), + ), + IntegrationTestRepoConfig( + provider="gcp", + offline_store_creator=BigQueryDataSourceCreator, + online_store="datastore", + ), + IntegrationTestRepoConfig( + provider="gcp", + 
offline_store_creator=BigQueryDataSourceCreator, + online_store=REDIS_CONFIG, + ), + # AWS configurations + IntegrationTestRepoConfig( + provider="aws", + offline_store_creator=RedshiftDataSourceCreator, + online_store=DYNAMO_CONFIG, + python_feature_server=True, + ), + IntegrationTestRepoConfig( + provider="aws", + offline_store_creator=RedshiftDataSourceCreator, + online_store=REDIS_CONFIG, + ), + # Snowflake configurations + IntegrationTestRepoConfig( + provider="aws", # no list features, no feature server + offline_store_creator=SnowflakeDataSourceCreator, + online_store=REDIS_CONFIG, + ), ] ) full_repo_configs_module = os.environ.get(FULL_REPO_CONFIGS_MODULE_ENV_NAME) From 92c4f87dd9880afebc1c66660ba3c5d6a3ebe1b6 Mon Sep 17 00:00:00 2001 From: Kevin Zhang Date: Wed, 2 Mar 2022 14:24:15 -0800 Subject: [PATCH 30/58] Fix setup.py Signed-off-by: Kevin Zhang --- sdk/python/setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdk/python/setup.py b/sdk/python/setup.py index 8029f9570d..3f545ac805 100644 --- a/sdk/python/setup.py +++ b/sdk/python/setup.py @@ -145,7 +145,7 @@ + REDIS_REQUIRED + AWS_REQUIRED + SNOWFLAKE_REQUIRED - * SPARK_REQUIRED + + SPARK_REQUIRED + GE_REQUIRED ) From 18a2892f35ecd7fc1c248b5d87d32b8d8854c59a Mon Sep 17 00:00:00 2001 From: Kevin Zhang Date: Wed, 2 Mar 2022 15:26:50 -0800 Subject: [PATCH 31/58] Update dependencies Signed-off-by: Kevin Zhang --- .../requirements/py3.7-ci-requirements.txt | 68 ++++++--- .../requirements/py3.8-ci-requirements.txt | 143 ++++++++++-------- sdk/python/setup.py | 3 +- 3 files changed, 125 insertions(+), 89 deletions(-) diff --git a/sdk/python/requirements/py3.7-ci-requirements.txt b/sdk/python/requirements/py3.7-ci-requirements.txt index d8f4eed9ce..b5d8b14073 100644 --- a/sdk/python/requirements/py3.7-ci-requirements.txt +++ b/sdk/python/requirements/py3.7-ci-requirements.txt @@ -42,6 +42,8 @@ asn1crypto==1.4.0 # snowflake-connector-python assertpy==1.1 # via feast (setup.py) +asttokens==2.0.5 + # via stack-data async-timeout==4.0.2 # via aiohttp attrs==21.4.0 @@ -59,7 +61,7 @@ azure-core==1.22.1 # azure-storage-blob azure-datalake-store==0.0.52 # via adlfs -azure-identity==1.7.1 +azure-identity==1.8.0 # via adlfs azure-storage-blob==12.9.0 # via adlfs @@ -67,7 +69,7 @@ babel==2.9.1 # via sphinx backcall==0.2.0 # via ipython -backports.zoneinfo==0.2.1 +backports-zoneinfo==0.2.1 # via # pytz-deprecation-shim # tzlocal @@ -75,11 +77,11 @@ black==19.10b0 # via feast (setup.py) bleach==4.1.0 # via nbconvert -boto3==1.21.8 +boto3==1.21.11 # via # feast (setup.py) # moto -botocore==1.24.8 +botocore==1.24.11 # via # boto3 # moto @@ -162,6 +164,8 @@ entrypoints==0.4 # nbconvert execnet==1.9.0 # via pytest-xdist +executing==0.8.3 + # via stack-data fastapi==0.74.1 # via feast (setup.py) fastavro==1.4.9 @@ -195,7 +199,7 @@ google-api-core[grpc]==1.31.5 # google-cloud-core # google-cloud-datastore # google-cloud-firestore -google-api-python-client==2.38.0 +google-api-python-client==2.39.0 # via firebase-admin google-auth==1.35.0 # via @@ -210,7 +214,7 @@ google-auth-httplib2==0.1.0 # via google-api-python-client google-auth-oauthlib==0.5.0 # via gcsfs -google-cloud-bigquery==2.34.0 +google-cloud-bigquery==2.34.1 # via feast (setup.py) google-cloud-bigquery-storage==2.12.0 # via feast (setup.py) @@ -221,7 +225,7 @@ google-cloud-core==1.7.2 # google-cloud-datastore # google-cloud-firestore # google-cloud-storage -google-cloud-datastore==2.4.0 +google-cloud-datastore==2.5.0 # via feast (setup.py) google-cloud-firestore==2.3.4 
# via firebase-admin @@ -277,7 +281,11 @@ idna==3.3 # yarl imagesize==1.3.0 # via sphinx +<<<<<<< HEAD importlib-metadata==4.2.0 +======= +importlib-metadata==4.11.2 +>>>>>>> 92472003 (Update dependencies) # via great-expectations importlib-resources==5.4.0 # via jsonschema @@ -287,7 +295,7 @@ ipykernel==6.9.1 # via # ipywidgets # notebook -ipython==7.32.0 +ipython==8.1.0 # via # ipykernel # ipywidgets @@ -342,10 +350,13 @@ jupyterlab-pygments==0.1.2 # via nbconvert jupyterlab-widgets==1.0.2 # via ipywidgets +<<<<<<< HEAD libcst==0.4.1 # via # google-cloud-bigquery-storage # google-cloud-datastore +======= +>>>>>>> 92472003 (Update dependencies) locket==0.2.1 # via partd markupsafe==2.0.1 @@ -391,9 +402,7 @@ multidict==6.0.2 mypy==0.931 # via feast (setup.py) mypy-extensions==0.4.3 - # via - # mypy - # typing-inspect + # via mypy mypy-protobuf==3.1.0 # via feast (setup.py) nbclient==0.5.11 @@ -416,7 +425,7 @@ nodeenv==1.6.0 # via pre-commit notebook==6.4.8 # via widgetsnbextension -numpy==1.21.5 +numpy==1.22.2 # via # altair # great-expectations @@ -500,12 +509,16 @@ ptyprocess==0.7.0 # via # pexpect # terminado +pure-eval==0.2.2 + # via stack-data py==1.11.0 # via # pytest # pytest-forked py-cpuinfo==8.0.0 # via pytest-benchmark +py4j==0.10.9.3 + # via pyspark pyarrow==6.0.1 # via # feast (setup.py) @@ -548,6 +561,8 @@ pyparsing==2.4.7 # packaging pyrsistent==0.18.1 # via jsonschema +pyspark==3.2.1 + # via feast (setup.py) pytest==7.0.1 # via # feast (setup.py) @@ -600,7 +615,6 @@ pyyaml==6.0 # via # dask # feast (setup.py) - # libcst # pre-commit # uvicorn pyzmq==22.3.0 @@ -613,7 +627,7 @@ redis==3.5.3 # redis-py-cluster redis-py-cluster==2.1.3 # via feast (setup.py) -regex==2022.1.18 +regex==2022.3.2 # via black requests==2.27.1 # via @@ -643,19 +657,20 @@ responses==0.18.0 # via moto rsa==4.8 # via google-auth -ruamel.yaml==0.17.17 +ruamel-yaml==0.17.17 # via great-expectations -ruamel.yaml.clib==0.2.6 - # via ruamel.yaml +ruamel-yaml-clib==0.2.6 + # via ruamel-yaml s3transfer==0.5.2 # via boto3 -scipy==1.7.3 +scipy==1.8.0 # via great-expectations send2trash==1.8.0 # via notebook six==1.16.0 # via # absl-py + # asttokens # azure-core # azure-identity # bleach @@ -697,13 +712,15 @@ sphinxcontrib-qthelp==1.0.3 # via sphinx sphinxcontrib-serializinghtml==1.1.5 # via sphinx +stack-data==0.2.0 + # via ipython starlette==0.17.1 # via fastapi tabulate==0.8.9 # via feast (setup.py) tenacity==8.0.1 # via feast (setup.py) -tensorflow-metadata==1.6.0 +tensorflow-metadata==1.7.0 # via feast (setup.py) termcolor==1.1.0 # via great-expectations @@ -735,7 +752,7 @@ tornado==6.1 # jupyter-client # notebook # terminado -tqdm==4.62.3 +tqdm==4.63.0 # via # feast (setup.py) # great-expectations @@ -753,9 +770,13 @@ traitlets==5.1.1 # notebook typed-ast==1.5.2 # via black +<<<<<<< HEAD types-futures==3.3.8 # via types-protobuf types-protobuf==3.19.2 +======= +types-protobuf==3.19.12 +>>>>>>> 92472003 (Update dependencies) # via # feast (setup.py) # mypy-protobuf @@ -773,17 +794,22 @@ types-setuptools==57.4.9 # via feast (setup.py) types-tabulate==0.8.5 # via feast (setup.py) -types-urllib3==1.26.9 +types-urllib3==1.26.10 # via types-requests typing-extensions==4.1.1 # via # great-expectations +<<<<<<< HEAD # libcst # mypy # pydantic # typing-inspect typing-inspect==0.7.1 # via libcst +======= + # mypy + # pydantic +>>>>>>> 92472003 (Update dependencies) tzdata==2021.5 # via pytz-deprecation-shim tzlocal==4.1 diff --git a/sdk/python/requirements/py3.8-ci-requirements.txt 
b/sdk/python/requirements/py3.8-ci-requirements.txt index 5a8aa35532..f0f530d8fa 100644 --- a/sdk/python/requirements/py3.8-ci-requirements.txt +++ b/sdk/python/requirements/py3.8-ci-requirements.txt @@ -42,6 +42,8 @@ asn1crypto==1.4.0 # snowflake-connector-python assertpy==1.1 # via feast (setup.py) +asttokens==2.0.5 + # via stack-data async-timeout==4.0.2 # via aiohttp attrs==21.4.0 @@ -52,14 +54,14 @@ attrs==21.4.0 # pytest avro==1.10.0 # via feast (setup.py) -azure-core==1.21.1 +azure-core==1.22.1 # via # adlfs # azure-identity # azure-storage-blob azure-datalake-store==0.0.52 # via adlfs -azure-identity==1.7.1 +azure-identity==1.8.0 # via adlfs azure-storage-blob==12.9.0 # via adlfs @@ -67,7 +69,7 @@ babel==2.9.1 # via sphinx backcall==0.2.0 # via ipython -backports.zoneinfo==0.2.1 +backports-zoneinfo==0.2.1 # via # pytz-deprecation-shim # tzlocal @@ -75,11 +77,11 @@ black==19.10b0 # via feast (setup.py) bleach==4.1.0 # via nbconvert -boto3==1.20.46 +boto3==1.21.11 # via # feast (setup.py) # moto -botocore==1.23.46 +botocore==1.24.11 # via # boto3 # moto @@ -102,12 +104,12 @@ cffi==1.15.0 # snowflake-connector-python cfgv==3.3.1 # via pre-commit -charset-normalizer==2.0.11 +charset-normalizer==2.0.12 # via # aiohttp # requests # snowflake-connector-python -click==8.0.3 +click==8.0.4 # via # black # feast (setup.py) @@ -118,7 +120,7 @@ cloudpickle==2.0.0 # via dask colorama==0.4.4 # via feast (setup.py) -coverage[toml]==6.3 +coverage[toml]==6.3.2 # via pytest-cov cryptography==3.3.2 # via @@ -155,20 +157,22 @@ docutils==0.17.1 # via # sphinx # sphinx-rtd-theme -entrypoints==0.3 +entrypoints==0.4 # via # altair # jupyter-client # nbconvert execnet==1.9.0 # via pytest-xdist -fastapi==0.73.0 +executing==0.8.3 + # via stack-data +fastapi==0.74.1 # via feast (setup.py) fastavro==1.4.9 # via # feast (setup.py) # pandavro -filelock==3.4.2 +filelock==3.6.0 # via virtualenv firebase-admin==4.5.2 # via feast (setup.py) @@ -178,12 +182,12 @@ frozenlist==1.3.0 # via # aiohttp # aiosignal -fsspec==2022.1.0 +fsspec==2022.2.0 # via # adlfs # dask # gcsfs -gcsfs==2022.1.0 +gcsfs==2022.2.0 # via feast (setup.py) google-api-core[grpc]==1.31.5 # via @@ -195,7 +199,7 @@ google-api-core[grpc]==1.31.5 # google-cloud-core # google-cloud-datastore # google-cloud-firestore -google-api-python-client==2.36.0 +google-api-python-client==2.39.0 # via firebase-admin google-auth==1.35.0 # via @@ -208,11 +212,11 @@ google-auth==1.35.0 # google-cloud-storage google-auth-httplib2==0.1.0 # via google-api-python-client -google-auth-oauthlib==0.4.6 +google-auth-oauthlib==0.5.0 # via gcsfs -google-cloud-bigquery==2.32.0 +google-cloud-bigquery==2.34.1 # via feast (setup.py) -google-cloud-bigquery-storage==2.11.0 +google-cloud-bigquery-storage==2.12.0 # via feast (setup.py) google-cloud-core==1.7.2 # via @@ -221,7 +225,7 @@ google-cloud-core==1.7.2 # google-cloud-datastore # google-cloud-firestore # google-cloud-storage -google-cloud-datastore==2.4.0 +google-cloud-datastore==2.5.0 # via feast (setup.py) google-cloud-firestore==2.3.4 # via firebase-admin @@ -241,9 +245,9 @@ googleapis-common-protos==1.52.0 # feast (setup.py) # google-api-core # tensorflow-metadata -great-expectations==0.14.4 +great-expectations==0.14.8 # via feast (setup.py) -grpcio==1.43.0 +grpcio==1.44.0 # via # feast (setup.py) # google-api-core @@ -251,7 +255,7 @@ grpcio==1.43.0 # grpcio-reflection # grpcio-testing # grpcio-tools -grpcio-reflection==1.43.0 +grpcio-reflection==1.44.0 # via feast (setup.py) grpcio-testing==1.34.0 # via feast (setup.py) @@ 
-261,13 +265,13 @@ h11==0.13.0 # via uvicorn hiredis==2.0.0 # via feast (setup.py) -httplib2==0.20.2 +httplib2==0.20.4 # via # google-api-python-client # google-auth-httplib2 httptools==0.3.0 # via uvicorn -identify==2.4.7 +identify==2.4.11 # via pre-commit idna==3.3 # via @@ -283,11 +287,11 @@ importlib-resources==5.4.0 # via jsonschema iniconfig==1.1.1 # via pytest -ipykernel==6.7.0 +ipykernel==6.9.1 # via # ipywidgets # notebook -ipython==7.31.1 +ipython==8.1.0 # via # ipykernel # ipywidgets @@ -332,7 +336,7 @@ jupyter-client==7.1.2 # ipykernel # nbclient # notebook -jupyter-core==4.9.1 +jupyter-core==4.9.2 # via # jupyter-client # nbconvert @@ -368,9 +372,9 @@ mmh3==3.0.0 # via feast (setup.py) mock==2.0.0 # via feast (setup.py) -moto==3.0.2 +moto==3.0.5 # via feast (setup.py) -msal==1.16.0 +msal==1.17.0 # via # azure-identity # msal-extensions @@ -391,14 +395,12 @@ multidict==6.0.2 mypy==0.931 # via feast (setup.py) mypy-extensions==0.4.3 - # via - # mypy - # typing-inspect + # via mypy mypy-protobuf==3.1.0 # via feast (setup.py) -nbclient==0.5.10 +nbclient==0.5.11 # via nbconvert -nbconvert==6.4.1 +nbconvert==6.4.2 # via notebook nbformat==5.1.3 # via @@ -416,7 +418,7 @@ nodeenv==1.6.0 # via pre-commit notebook==6.4.8 # via widgetsnbextension -numpy==1.21.5 +numpy==1.22.2 # via # altair # great-expectations @@ -455,7 +457,7 @@ partd==1.2.0 # via dask pathspec==0.9.0 # via black -pbr==5.8.0 +pbr==5.8.1 # via mock pep517==0.12.0 # via pip-tools @@ -463,19 +465,19 @@ pexpect==4.8.0 # via ipython pickleshare==0.7.5 # via ipython -pip-tools==6.4.0 +pip-tools==6.5.1 # via feast (setup.py) -platformdirs==2.4.1 +platformdirs==2.5.1 # via virtualenv pluggy==1.0.0 # via pytest -portalocker==2.3.2 +portalocker==2.4.0 # via msal-extensions pre-commit==2.17.0 # via feast (setup.py) prometheus-client==0.13.1 # via notebook -prompt-toolkit==3.0.26 +prompt-toolkit==3.0.28 # via ipython proto-plus==1.19.6 # via @@ -500,12 +502,16 @@ ptyprocess==0.7.0 # via # pexpect # terminado +pure-eval==0.2.2 + # via stack-data py==1.11.0 # via # pytest # pytest-forked py-cpuinfo==8.0.0 # via pytest-benchmark +py4j==0.10.9.3 + # via pyspark pyarrow==6.0.1 # via # feast (setup.py) @@ -520,7 +526,7 @@ pycodestyle==2.8.0 # via flake8 pycparser==2.21 # via cffi -pycryptodomex==3.14.0 +pycryptodomex==3.14.1 # via snowflake-connector-python pydantic==1.9.0 # via @@ -548,7 +554,9 @@ pyparsing==2.4.7 # packaging pyrsistent==0.18.1 # via jsonschema -pytest==6.2.5 +pyspark==3.2.1 + # via feast (setup.py) +pytest==7.0.1 # via # feast (setup.py) # pytest-benchmark @@ -600,7 +608,6 @@ pyyaml==6.0 # via # dask # feast (setup.py) - # libcst # pre-commit # uvicorn pyzmq==22.3.0 @@ -613,7 +620,7 @@ redis==3.5.3 # redis-py-cluster redis-py-cluster==2.1.3 # via feast (setup.py) -regex==2022.1.18 +regex==2022.3.2 # via black requests==2.27.1 # via @@ -639,23 +646,24 @@ requests-oauthlib==1.3.1 # via # google-auth-oauthlib # msrest -responses==0.17.0 +responses==0.18.0 # via moto rsa==4.8 # via google-auth -ruamel.yaml==0.17.17 +ruamel-yaml==0.17.17 # via great-expectations -ruamel.yaml.clib==0.2.6 - # via ruamel.yaml -s3transfer==0.5.0 +ruamel-yaml-clib==0.2.6 + # via ruamel-yaml +s3transfer==0.5.2 # via boto3 -scipy==1.7.3 +scipy==1.8.0 # via great-expectations send2trash==1.8.0 # via notebook six==1.16.0 # via # absl-py + # asttokens # azure-core # azure-identity # bleach @@ -672,13 +680,12 @@ six==1.16.0 # pandavro # pyopenssl # python-dateutil - # responses # virtualenv sniffio==1.2.0 # via anyio snowballstemmer==2.2.0 # via 
sphinx -snowflake-connector-python[pandas]==2.7.3 +snowflake-connector-python[pandas]==2.7.4 # via feast (setup.py) sphinx==4.3.2 # via @@ -698,13 +705,15 @@ sphinxcontrib-qthelp==1.0.3 # via sphinx sphinxcontrib-serializinghtml==1.1.5 # via sphinx +stack-data==0.2.0 + # via ipython starlette==0.17.1 # via fastapi tabulate==0.8.9 # via feast (setup.py) tenacity==8.0.1 # via feast (setup.py) -tensorflow-metadata==1.6.0 +tensorflow-metadata==1.7.0 # via feast (setup.py) termcolor==1.1.0 # via great-expectations @@ -712,19 +721,19 @@ terminado==0.13.1 # via notebook testcontainers==3.4.2 # via feast (setup.py) -testpath==0.5.0 +testpath==0.6.0 # via nbconvert toml==0.10.2 # via # black # feast (setup.py) # pre-commit - # pytest -tomli==2.0.0 +tomli==2.0.1 # via # coverage # mypy # pep517 + # pytest toolz==0.11.2 # via # altair @@ -736,7 +745,7 @@ tornado==6.1 # jupyter-client # notebook # terminado -tqdm==4.62.3 +tqdm==4.63.0 # via # feast (setup.py) # great-expectations @@ -762,21 +771,21 @@ types-protobuf==3.19.7 # mypy-protobuf types-python-dateutil==2.8.9 # via feast (setup.py) -types-pytz==2021.3.4 +types-pytz==2021.3.5 # via feast (setup.py) types-pyyaml==6.0.4 # via feast (setup.py) -types-redis==4.1.13 +types-redis==4.1.17 # via feast (setup.py) -types-requests==2.27.8 +types-requests==2.27.11 # via feast (setup.py) -types-setuptools==57.4.8 +types-setuptools==57.4.9 # via feast (setup.py) types-tabulate==0.8.5 # via feast (setup.py) -types-urllib3==1.26.8 +types-urllib3==1.26.10 # via types-requests -typing-extensions==4.0.1 +typing-extensions==4.1.1 # via # great-expectations # libcst @@ -798,11 +807,11 @@ urllib3==1.26.8 # minio # requests # responses -uvicorn[standard]==0.17.1 +uvicorn[standard]==0.17.5 # via feast (setup.py) uvloop==0.16.0 # via uvicorn -virtualenv==20.13.0 +virtualenv==20.13.2 # via pre-commit watchgod==0.7 # via uvicorn @@ -810,11 +819,11 @@ wcwidth==0.2.5 # via prompt-toolkit webencodings==0.5.1 # via bleach -websocket-client==1.2.3 +websocket-client==1.3.1 # via docker -websockets==10.1 +websockets==10.2 # via uvicorn -werkzeug==2.0.2 +werkzeug==2.0.3 # via moto wheel==0.37.1 # via pip-tools diff --git a/sdk/python/setup.py b/sdk/python/setup.py index 3f545ac805..7666b2b535 100644 --- a/sdk/python/setup.py +++ b/sdk/python/setup.py @@ -93,7 +93,7 @@ ] SPARK_REQUIRED = [ - "pyspark>=3.0", + "pyspark==3.2.1", ] GE_REQUIRED = [ @@ -249,6 +249,7 @@ def run(self): "aws": AWS_REQUIRED, "redis": REDIS_REQUIRED, "snowflake": SNOWFLAKE_REQUIRED, + "spark": SPARK_REQUIRED, "ge": GE_REQUIRED, }, include_package_data=True, From c6443880f9302a1c9e7d736d0586e487658e7a12 Mon Sep 17 00:00:00 2001 From: Kevin Zhang Date: Wed, 2 Mar 2022 16:21:17 -0800 Subject: [PATCH 32/58] Fix ci dependencies Signed-off-by: Kevin Zhang --- .../requirements/py3.7-ci-requirements.txt | 37 ++++- .../requirements/py3.9-ci-requirements.txt | 133 ++++++++++-------- 2 files changed, 101 insertions(+), 69 deletions(-) diff --git a/sdk/python/requirements/py3.7-ci-requirements.txt b/sdk/python/requirements/py3.7-ci-requirements.txt index b5d8b14073..c8f0bdd2b3 100644 --- a/sdk/python/requirements/py3.7-ci-requirements.txt +++ b/sdk/python/requirements/py3.7-ci-requirements.txt @@ -46,6 +46,8 @@ asttokens==2.0.5 # via stack-data async-timeout==4.0.2 # via aiohttp +asynctest==0.13.0 + # via aiohttp attrs==21.4.0 # via # aiohttp @@ -69,7 +71,7 @@ babel==2.9.1 # via sphinx backcall==0.2.0 # via ipython -backports-zoneinfo==0.2.1 +backports.zoneinfo==0.2.1 # via # pytz-deprecation-shim # tzlocal @@ -77,11 
+79,11 @@ black==19.10b0 # via feast (setup.py) bleach==4.1.0 # via nbconvert -boto3==1.21.11 +boto3==1.21.8 # via # feast (setup.py) # moto -botocore==1.24.11 +botocore==1.24.8 # via # boto3 # moto @@ -214,7 +216,7 @@ google-auth-httplib2==0.1.0 # via google-api-python-client google-auth-oauthlib==0.5.0 # via gcsfs -google-cloud-bigquery==2.34.1 +google-cloud-bigquery==2.34.0 # via feast (setup.py) google-cloud-bigquery-storage==2.12.0 # via feast (setup.py) @@ -225,7 +227,7 @@ google-cloud-core==1.7.2 # google-cloud-datastore # google-cloud-firestore # google-cloud-storage -google-cloud-datastore==2.5.0 +google-cloud-datastore==2.4.0 # via feast (setup.py) google-cloud-firestore==2.3.4 # via firebase-admin @@ -282,11 +284,27 @@ idna==3.3 imagesize==1.3.0 # via sphinx <<<<<<< HEAD +<<<<<<< HEAD importlib-metadata==4.2.0 ======= importlib-metadata==4.11.2 >>>>>>> 92472003 (Update dependencies) # via great-expectations +======= +importlib-metadata==4.2.0 + # via + # click + # flake8 + # great-expectations + # jsonschema + # moto + # pep517 + # pluggy + # pre-commit + # pytest + # redis + # virtualenv +>>>>>>> 3279f1fa (Fix ci dependencies) importlib-resources==5.4.0 # via jsonschema iniconfig==1.1.1 @@ -295,7 +313,7 @@ ipykernel==6.9.1 # via # ipywidgets # notebook -ipython==8.1.0 +ipython==7.32.0 # via # ipykernel # ipywidgets @@ -351,12 +369,17 @@ jupyterlab-pygments==0.1.2 jupyterlab-widgets==1.0.2 # via ipywidgets <<<<<<< HEAD +<<<<<<< HEAD libcst==0.4.1 # via # google-cloud-bigquery-storage # google-cloud-datastore ======= >>>>>>> 92472003 (Update dependencies) +======= +libcst==0.4.1 + # via google-cloud-datastore +>>>>>>> 3279f1fa (Fix ci dependencies) locket==0.2.1 # via partd markupsafe==2.0.1 @@ -425,7 +448,7 @@ nodeenv==1.6.0 # via pre-commit notebook==6.4.8 # via widgetsnbextension -numpy==1.22.2 +numpy==1.22.5 # via # altair # great-expectations diff --git a/sdk/python/requirements/py3.9-ci-requirements.txt b/sdk/python/requirements/py3.9-ci-requirements.txt index 403aab70aa..ec6f876c72 100644 --- a/sdk/python/requirements/py3.9-ci-requirements.txt +++ b/sdk/python/requirements/py3.9-ci-requirements.txt @@ -42,6 +42,8 @@ asn1crypto==1.4.0 # snowflake-connector-python assertpy==1.1 # via feast (setup.py) +asttokens==2.0.5 + # via stack-data async-timeout==4.0.2 # via aiohttp attrs==21.4.0 @@ -52,14 +54,14 @@ attrs==21.4.0 # pytest avro==1.10.0 # via feast (setup.py) -azure-core==1.21.1 +azure-core==1.22.1 # via # adlfs # azure-identity # azure-storage-blob azure-datalake-store==0.0.52 # via adlfs -azure-identity==1.7.1 +azure-identity==1.8.0 # via adlfs azure-storage-blob==12.9.0 # via adlfs @@ -71,11 +73,11 @@ black==19.10b0 # via feast (setup.py) bleach==4.1.0 # via nbconvert -boto3==1.20.46 +boto3==1.21.11 # via # feast (setup.py) # moto -botocore==1.23.46 +botocore==1.24.11 # via # boto3 # moto @@ -98,12 +100,12 @@ cffi==1.15.0 # snowflake-connector-python cfgv==3.3.1 # via pre-commit -charset-normalizer==2.0.11 +charset-normalizer==2.0.12 # via # aiohttp # requests # snowflake-connector-python -click==8.0.3 +click==8.0.4 # via # black # feast (setup.py) @@ -114,7 +116,7 @@ cloudpickle==2.0.0 # via dask colorama==0.4.4 # via feast (setup.py) -coverage[toml]==6.3 +coverage[toml]==6.3.2 # via pytest-cov cryptography==3.3.2 # via @@ -151,20 +153,22 @@ docutils==0.17.1 # via # sphinx # sphinx-rtd-theme -entrypoints==0.3 +entrypoints==0.4 # via # altair # jupyter-client # nbconvert execnet==1.9.0 # via pytest-xdist -fastapi==0.73.0 +executing==0.8.3 + # via stack-data 
+fastapi==0.74.1 # via feast (setup.py) fastavro==1.4.9 # via # feast (setup.py) # pandavro -filelock==3.4.2 +filelock==3.6.0 # via virtualenv firebase-admin==4.5.2 # via feast (setup.py) @@ -174,12 +178,12 @@ frozenlist==1.3.0 # via # aiohttp # aiosignal -fsspec==2022.1.0 +fsspec==2022.2.0 # via # adlfs # dask # gcsfs -gcsfs==2022.1.0 +gcsfs==2022.2.0 # via feast (setup.py) google-api-core[grpc]==1.31.5 # via @@ -191,7 +195,7 @@ google-api-core[grpc]==1.31.5 # google-cloud-core # google-cloud-datastore # google-cloud-firestore -google-api-python-client==2.36.0 +google-api-python-client==2.39.0 # via firebase-admin google-auth==1.35.0 # via @@ -204,11 +208,11 @@ google-auth==1.35.0 # google-cloud-storage google-auth-httplib2==0.1.0 # via google-api-python-client -google-auth-oauthlib==0.4.6 +google-auth-oauthlib==0.5.0 # via gcsfs -google-cloud-bigquery==2.32.0 +google-cloud-bigquery==2.34.1 # via feast (setup.py) -google-cloud-bigquery-storage==2.11.0 +google-cloud-bigquery-storage==2.12.0 # via feast (setup.py) google-cloud-core==1.7.2 # via @@ -217,7 +221,7 @@ google-cloud-core==1.7.2 # google-cloud-datastore # google-cloud-firestore # google-cloud-storage -google-cloud-datastore==2.4.0 +google-cloud-datastore==2.5.0 # via feast (setup.py) google-cloud-firestore==2.3.4 # via firebase-admin @@ -237,9 +241,9 @@ googleapis-common-protos==1.52.0 # feast (setup.py) # google-api-core # tensorflow-metadata -great-expectations==0.14.4 +great-expectations==0.14.8 # via feast (setup.py) -grpcio==1.43.0 +grpcio==1.44.0 # via # feast (setup.py) # google-api-core @@ -247,7 +251,7 @@ grpcio==1.43.0 # grpcio-reflection # grpcio-testing # grpcio-tools -grpcio-reflection==1.43.0 +grpcio-reflection==1.44.0 # via feast (setup.py) grpcio-testing==1.34.0 # via feast (setup.py) @@ -257,13 +261,13 @@ h11==0.13.0 # via uvicorn hiredis==2.0.0 # via feast (setup.py) -httplib2==0.20.2 +httplib2==0.20.4 # via # google-api-python-client # google-auth-httplib2 httptools==0.3.0 # via uvicorn -identify==2.4.7 +identify==2.4.11 # via pre-commit idna==3.3 # via @@ -277,11 +281,11 @@ importlib-metadata==4.2.0 # via great-expectations iniconfig==1.1.1 # via pytest -ipykernel==6.7.0 +ipykernel==6.9.1 # via # ipywidgets # notebook -ipython==7.31.1 +ipython==8.1.0 # via # ipykernel # ipywidgets @@ -326,7 +330,7 @@ jupyter-client==7.1.2 # ipykernel # nbclient # notebook -jupyter-core==4.9.1 +jupyter-core==4.9.2 # via # jupyter-client # nbconvert @@ -362,9 +366,9 @@ mmh3==3.0.0 # via feast (setup.py) mock==2.0.0 # via feast (setup.py) -moto==3.0.2 +moto==3.0.5 # via feast (setup.py) -msal==1.16.0 +msal==1.17.0 # via # azure-identity # msal-extensions @@ -385,14 +389,12 @@ multidict==6.0.2 mypy==0.931 # via feast (setup.py) mypy-extensions==0.4.3 - # via - # mypy - # typing-inspect + # via mypy mypy-protobuf==3.1.0 # via feast (setup.py) -nbclient==0.5.10 +nbclient==0.5.11 # via nbconvert -nbconvert==6.4.1 +nbconvert==6.4.2 # via notebook nbformat==5.1.3 # via @@ -410,7 +412,7 @@ nodeenv==1.6.0 # via pre-commit notebook==6.4.8 # via widgetsnbextension -numpy==1.21.5 +numpy==1.22.2 # via # altair # great-expectations @@ -449,7 +451,7 @@ partd==1.2.0 # via dask pathspec==0.9.0 # via black -pbr==5.8.0 +pbr==5.8.1 # via mock pep517==0.12.0 # via pip-tools @@ -457,19 +459,19 @@ pexpect==4.8.0 # via ipython pickleshare==0.7.5 # via ipython -pip-tools==6.4.0 +pip-tools==6.5.1 # via feast (setup.py) -platformdirs==2.4.1 +platformdirs==2.5.1 # via virtualenv pluggy==1.0.0 # via pytest -portalocker==2.3.2 +portalocker==2.4.0 # via 
msal-extensions pre-commit==2.17.0 # via feast (setup.py) prometheus-client==0.13.1 # via notebook -prompt-toolkit==3.0.26 +prompt-toolkit==3.0.28 # via ipython proto-plus==1.19.6 # via @@ -494,12 +496,16 @@ ptyprocess==0.7.0 # via # pexpect # terminado +pure-eval==0.2.2 + # via stack-data py==1.11.0 # via # pytest # pytest-forked py-cpuinfo==8.0.0 # via pytest-benchmark +py4j==0.10.9.3 + # via pyspark pyarrow==6.0.1 # via # feast (setup.py) @@ -514,7 +520,7 @@ pycodestyle==2.8.0 # via flake8 pycparser==2.21 # via cffi -pycryptodomex==3.14.0 +pycryptodomex==3.14.1 # via snowflake-connector-python pydantic==1.9.0 # via @@ -542,7 +548,9 @@ pyparsing==2.4.7 # packaging pyrsistent==0.18.1 # via jsonschema -pytest==6.2.5 +pyspark==3.2.1 + # via feast (setup.py) +pytest==7.0.1 # via # feast (setup.py) # pytest-benchmark @@ -594,7 +602,6 @@ pyyaml==6.0 # via # dask # feast (setup.py) - # libcst # pre-commit # uvicorn pyzmq==22.3.0 @@ -607,7 +614,7 @@ redis==3.5.3 # redis-py-cluster redis-py-cluster==2.1.3 # via feast (setup.py) -regex==2022.1.18 +regex==2022.3.2 # via black requests==2.27.1 # via @@ -633,7 +640,7 @@ requests-oauthlib==1.3.1 # via # google-auth-oauthlib # msrest -responses==0.17.0 +responses==0.18.0 # via moto rsa==4.8 # via google-auth @@ -643,13 +650,14 @@ ruamel-yaml-clib==0.2.6 # via ruamel-yaml s3transfer==0.5.0 # via boto3 -scipy==1.7.3 +scipy==1.8.0 # via great-expectations send2trash==1.8.0 # via notebook six==1.16.0 # via # absl-py + # asttokens # azure-core # azure-identity # bleach @@ -666,13 +674,12 @@ six==1.16.0 # pandavro # pyopenssl # python-dateutil - # responses # virtualenv sniffio==1.2.0 # via anyio snowballstemmer==2.2.0 # via sphinx -snowflake-connector-python[pandas]==2.7.3 +snowflake-connector-python[pandas]==2.7.4 # via feast (setup.py) sphinx==4.3.2 # via @@ -692,13 +699,15 @@ sphinxcontrib-qthelp==1.0.3 # via sphinx sphinxcontrib-serializinghtml==1.1.5 # via sphinx +stack-data==0.2.0 + # via ipython starlette==0.17.1 # via fastapi tabulate==0.8.9 # via feast (setup.py) tenacity==8.0.1 # via feast (setup.py) -tensorflow-metadata==1.6.0 +tensorflow-metadata==1.7.0 # via feast (setup.py) termcolor==1.1.0 # via great-expectations @@ -706,19 +715,19 @@ terminado==0.13.1 # via notebook testcontainers==3.4.2 # via feast (setup.py) -testpath==0.5.0 +testpath==0.6.0 # via nbconvert toml==0.10.2 # via # black # feast (setup.py) # pre-commit - # pytest -tomli==2.0.0 +tomli==2.0.1 # via # coverage # mypy # pep517 + # pytest toolz==0.11.2 # via # altair @@ -730,7 +739,7 @@ tornado==6.1 # jupyter-client # notebook # terminado -tqdm==4.62.3 +tqdm==4.63.0 # via # feast (setup.py) # great-expectations @@ -756,21 +765,21 @@ types-protobuf==3.19.7 # mypy-protobuf types-python-dateutil==2.8.9 # via feast (setup.py) -types-pytz==2021.3.4 +types-pytz==2021.3.5 # via feast (setup.py) types-pyyaml==6.0.4 # via feast (setup.py) -types-redis==4.1.13 +types-redis==4.1.17 # via feast (setup.py) -types-requests==2.27.8 +types-requests==2.27.11 # via feast (setup.py) -types-setuptools==57.4.8 +types-setuptools==57.4.9 # via feast (setup.py) types-tabulate==0.8.5 # via feast (setup.py) -types-urllib3==1.26.8 +types-urllib3==1.26.10 # via types-requests -typing-extensions==4.0.1 +typing-extensions==4.1.1 # via # great-expectations # libcst @@ -792,11 +801,11 @@ urllib3==1.26.8 # minio # requests # responses -uvicorn[standard]==0.17.1 +uvicorn[standard]==0.17.5 # via feast (setup.py) uvloop==0.16.0 # via uvicorn -virtualenv==20.13.0 +virtualenv==20.13.2 # via pre-commit watchgod==0.7 # 
via uvicorn @@ -804,11 +813,11 @@ wcwidth==0.2.5 # via prompt-toolkit webencodings==0.5.1 # via bleach -websocket-client==1.2.3 +websocket-client==1.3.1 # via docker -websockets==10.1 +websockets==10.2 # via uvicorn -werkzeug==2.0.2 +werkzeug==2.0.3 # via moto wheel==0.37.1 # via pip-tools From 933313031cdcefd914f6ef9c15cb0b8c0a09437f Mon Sep 17 00:00:00 2001 From: Kevin Zhang Date: Wed, 2 Mar 2022 16:31:35 -0800 Subject: [PATCH 33/58] Screwed up rebase Signed-off-by: Kevin Zhang --- .../requirements/py3.7-ci-requirements.txt | 28 ------------------- 1 file changed, 28 deletions(-) diff --git a/sdk/python/requirements/py3.7-ci-requirements.txt b/sdk/python/requirements/py3.7-ci-requirements.txt index c8f0bdd2b3..f04a7d39d2 100644 --- a/sdk/python/requirements/py3.7-ci-requirements.txt +++ b/sdk/python/requirements/py3.7-ci-requirements.txt @@ -283,12 +283,7 @@ idna==3.3 # yarl imagesize==1.3.0 # via sphinx -<<<<<<< HEAD -<<<<<<< HEAD importlib-metadata==4.2.0 -======= -importlib-metadata==4.11.2 ->>>>>>> 92472003 (Update dependencies) # via great-expectations ======= importlib-metadata==4.2.0 @@ -368,18 +363,10 @@ jupyterlab-pygments==0.1.2 # via nbconvert jupyterlab-widgets==1.0.2 # via ipywidgets -<<<<<<< HEAD -<<<<<<< HEAD libcst==0.4.1 # via # google-cloud-bigquery-storage # google-cloud-datastore -======= ->>>>>>> 92472003 (Update dependencies) -======= -libcst==0.4.1 - # via google-cloud-datastore ->>>>>>> 3279f1fa (Fix ci dependencies) locket==0.2.1 # via partd markupsafe==2.0.1 @@ -793,13 +780,7 @@ traitlets==5.1.1 # notebook typed-ast==1.5.2 # via black -<<<<<<< HEAD -types-futures==3.3.8 - # via types-protobuf -types-protobuf==3.19.2 -======= types-protobuf==3.19.12 ->>>>>>> 92472003 (Update dependencies) # via # feast (setup.py) # mypy-protobuf @@ -822,17 +803,8 @@ types-urllib3==1.26.10 typing-extensions==4.1.1 # via # great-expectations -<<<<<<< HEAD - # libcst - # mypy - # pydantic - # typing-inspect -typing-inspect==0.7.1 - # via libcst -======= # mypy # pydantic ->>>>>>> 92472003 (Update dependencies) tzdata==2021.5 # via pytz-deprecation-shim tzlocal==4.1 From 6272f0533341bdbdb65600545148af414b5afe71 Mon Sep 17 00:00:00 2001 From: Kevin Zhang Date: Wed, 2 Mar 2022 16:38:25 -0800 Subject: [PATCH 34/58] Screwed up rebase Signed-off-by: Kevin Zhang --- sdk/python/requirements/py3.7-ci-requirements.txt | 4 ---- 1 file changed, 4 deletions(-) diff --git a/sdk/python/requirements/py3.7-ci-requirements.txt b/sdk/python/requirements/py3.7-ci-requirements.txt index f04a7d39d2..627cb9efd0 100644 --- a/sdk/python/requirements/py3.7-ci-requirements.txt +++ b/sdk/python/requirements/py3.7-ci-requirements.txt @@ -283,9 +283,6 @@ idna==3.3 # yarl imagesize==1.3.0 # via sphinx -importlib-metadata==4.2.0 - # via great-expectations -======= importlib-metadata==4.2.0 # via # click @@ -299,7 +296,6 @@ importlib-metadata==4.2.0 # pytest # redis # virtualenv ->>>>>>> 3279f1fa (Fix ci dependencies) importlib-resources==5.4.0 # via jsonschema iniconfig==1.1.1 From cf6bae1a327ed6d4c730cda503ee96706c920c12 Mon Sep 17 00:00:00 2001 From: Kevin Zhang Date: Wed, 2 Mar 2022 16:44:18 -0800 Subject: [PATCH 35/58] Screwed up rebase Signed-off-by: Kevin Zhang --- sdk/python/requirements/py3.7-ci-requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdk/python/requirements/py3.7-ci-requirements.txt b/sdk/python/requirements/py3.7-ci-requirements.txt index 627cb9efd0..5132b835e0 100644 --- a/sdk/python/requirements/py3.7-ci-requirements.txt +++ 
b/sdk/python/requirements/py3.7-ci-requirements.txt @@ -431,7 +431,7 @@ nodeenv==1.6.0 # via pre-commit notebook==6.4.8 # via widgetsnbextension -numpy==1.22.5 +numpy==1.21.5 # via # altair # great-expectations From 0569b6d8747713f03e3a81cba9c7ac66cc670a63 Mon Sep 17 00:00:00 2001 From: Kevin Zhang Date: Wed, 2 Mar 2022 16:50:20 -0800 Subject: [PATCH 36/58] Realign with master Signed-off-by: Kevin Zhang --- java/datatypes/src/main/proto/feast | 2 + .../main/proto/feast/core/DataFormat.proto | 56 ------ .../main/proto/feast/core/DataSource.proto | 177 ------------------ .../proto/feast/core/DatastoreTable.proto | 39 ---- .../main/proto/feast/core/DynamoDBTable.proto | 31 --- .../src/main/proto/feast/core/Entity.proto | 60 ------ .../src/main/proto/feast/core/Feature.proto | 36 ---- .../proto/feast/core/FeatureService.proto | 48 ----- .../main/proto/feast/core/FeatureTable.proto | 86 --------- .../main/proto/feast/core/FeatureView.proto | 85 --------- .../feast/core/FeatureViewProjection.proto | 25 --- .../main/proto/feast/core/InfraObject.proto | 51 ----- .../feast/core/OnDemandFeatureView.proto | 78 -------- .../src/main/proto/feast/core/Registry.proto | 48 ----- .../proto/feast/core/RequestFeatureView.proto | 43 ----- .../main/proto/feast/core/SavedDataset.proto | 77 -------- .../main/proto/feast/core/SqliteTable.proto | 31 --- .../src/main/proto/feast/core/Store.proto | 130 ------------- .../proto/feast/core/ValidationProfile.proto | 48 ----- .../proto/feast/serving/ServingService.proto | 134 ------------- .../feast/serving/TransformationService.proto | 67 ------- .../src/main/proto/feast/storage/Redis.proto | 34 ---- .../grpc/health/v1/HealthService.proto | 24 --- .../main/proto/feast/types/EntityKey.proto | 30 --- .../src/main/proto/feast/types/Field.proto | 30 --- .../src/main/proto/feast/types/Value.proto | 109 ----------- 26 files changed, 2 insertions(+), 1577 deletions(-) create mode 100644 java/datatypes/src/main/proto/feast delete mode 100644 java/datatypes/src/main/proto/feast/core/DataFormat.proto delete mode 100644 java/datatypes/src/main/proto/feast/core/DataSource.proto delete mode 100644 java/datatypes/src/main/proto/feast/core/DatastoreTable.proto delete mode 100644 java/datatypes/src/main/proto/feast/core/DynamoDBTable.proto delete mode 100644 java/datatypes/src/main/proto/feast/core/Entity.proto delete mode 100644 java/datatypes/src/main/proto/feast/core/Feature.proto delete mode 100644 java/datatypes/src/main/proto/feast/core/FeatureService.proto delete mode 100644 java/datatypes/src/main/proto/feast/core/FeatureTable.proto delete mode 100644 java/datatypes/src/main/proto/feast/core/FeatureView.proto delete mode 100644 java/datatypes/src/main/proto/feast/core/FeatureViewProjection.proto delete mode 100644 java/datatypes/src/main/proto/feast/core/InfraObject.proto delete mode 100644 java/datatypes/src/main/proto/feast/core/OnDemandFeatureView.proto delete mode 100644 java/datatypes/src/main/proto/feast/core/Registry.proto delete mode 100644 java/datatypes/src/main/proto/feast/core/RequestFeatureView.proto delete mode 100644 java/datatypes/src/main/proto/feast/core/SavedDataset.proto delete mode 100644 java/datatypes/src/main/proto/feast/core/SqliteTable.proto delete mode 100644 java/datatypes/src/main/proto/feast/core/Store.proto delete mode 100644 java/datatypes/src/main/proto/feast/core/ValidationProfile.proto delete mode 100644 java/datatypes/src/main/proto/feast/serving/ServingService.proto delete mode 100644 
java/datatypes/src/main/proto/feast/serving/TransformationService.proto delete mode 100644 java/datatypes/src/main/proto/feast/storage/Redis.proto delete mode 100644 java/datatypes/src/main/proto/feast/third_party/grpc/health/v1/HealthService.proto delete mode 100644 java/datatypes/src/main/proto/feast/types/EntityKey.proto delete mode 100644 java/datatypes/src/main/proto/feast/types/Field.proto delete mode 100644 java/datatypes/src/main/proto/feast/types/Value.proto diff --git a/java/datatypes/src/main/proto/feast b/java/datatypes/src/main/proto/feast new file mode 100644 index 0000000000..4501c0df4b --- /dev/null +++ b/java/datatypes/src/main/proto/feast @@ -0,0 +1,2 @@ +../../../../../protos/feast + diff --git a/java/datatypes/src/main/proto/feast/core/DataFormat.proto b/java/datatypes/src/main/proto/feast/core/DataFormat.proto deleted file mode 100644 index 2926e08c63..0000000000 --- a/java/datatypes/src/main/proto/feast/core/DataFormat.proto +++ /dev/null @@ -1,56 +0,0 @@ -// -// Copyright 2020 The Feast Authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// - - -syntax = "proto3"; -package feast.core; - -option go_package = "github.com/feast-dev/feast/sdk/go/protos/feast/core"; -option java_outer_classname = "DataFormatProto"; -option java_package = "feast.proto.core"; - -// Defines the file format encoding the features/entity data in files -message FileFormat { - // Defines options for the Parquet data format - message ParquetFormat {} - - oneof format { - ParquetFormat parquet_format = 1; - } -} - -// Defines the data format encoding features/entity data in data streams -message StreamFormat { - // Defines options for the protobuf data format - message ProtoFormat { - // Classpath to the generated Java Protobuf class that can be used to decode - // Feature data from the obtained stream message - string class_path = 1; - } - - // Defines options for the avro data format - message AvroFormat { - // Optional if used in a File DataSource as schema is embedded in avro file. - // Specifies the schema of the Avro message as JSON string. - string schema_json = 1; - } - - // Specifies the data format and format specific options - oneof format { - AvroFormat avro_format = 1; - ProtoFormat proto_format = 2; - } -} diff --git a/java/datatypes/src/main/proto/feast/core/DataSource.proto b/java/datatypes/src/main/proto/feast/core/DataSource.proto deleted file mode 100644 index 41bba6443f..0000000000 --- a/java/datatypes/src/main/proto/feast/core/DataSource.proto +++ /dev/null @@ -1,177 +0,0 @@ -// -// Copyright 2020 The Feast Authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// - - -syntax = "proto3"; -package feast.core; - -option go_package = "github.com/feast-dev/feast/sdk/go/protos/feast/core"; -option java_outer_classname = "DataSourceProto"; -option java_package = "feast.proto.core"; - -import "feast/core/DataFormat.proto"; -import "feast/types/Value.proto"; - -// Defines a Data Source that can be used source Feature data -message DataSource { - // Field indexes should *not* be reused. Not sure if fields 6-10 were used previously or not, - // but they are going to be reserved for backwards compatibility. - reserved 6 to 10; - - // Type of Data Source. - // Next available id: 9 - enum SourceType { - INVALID = 0; - BATCH_FILE = 1; - BATCH_SNOWFLAKE = 8; - BATCH_BIGQUERY = 2; - BATCH_REDSHIFT = 5; - STREAM_KAFKA = 3; - STREAM_KINESIS = 4; - CUSTOM_SOURCE = 6; - REQUEST_SOURCE = 7; - - } - SourceType type = 1; - - // Defines mapping between fields in the sourced data - // and fields in parent FeatureTable. - map field_mapping = 2; - - // Must specify event timestamp column name - string event_timestamp_column = 3; - - // (Optional) Specify partition column - // useful for file sources - string date_partition_column = 4; - - // Must specify creation timestamp column name - string created_timestamp_column = 5; - - // This is an internal field that is represents the python class for the data source object a proto object represents. - // This should be set by feast, and not by users. - string data_source_class_type = 17; - - // Defines options for DataSource that sources features from a file - message FileOptions { - FileFormat file_format = 1; - - // Target URL of file to retrieve and source features from. - // s3://path/to/file for AWS S3 storage - // gs://path/to/file for GCP GCS storage - // file:///path/to/file for local storage - string file_url = 2; - - // override AWS S3 storage endpoint with custom S3 endpoint - string s3_endpoint_override = 3; - } - - // Defines options for DataSource that sources features from a BigQuery Query - message BigQueryOptions { - // Full table reference in the form of [project:dataset.table] - string table_ref = 1; - - // SQL query that returns a table containing feature data. Must contain an event_timestamp column, and respective - // entity columns - string query = 2; - } - - // Defines options for DataSource that sources features from Kafka messages. - // Each message should be a Protobuf that can be decoded with the generated - // Java Protobuf class at the given class path - message KafkaOptions { - // Comma separated list of Kafka bootstrap servers. Used for feature tables without a defined source host[:port]] - string bootstrap_servers = 1; - - // Kafka topic to collect feature data from. - string topic = 2; - - // Defines the stream data format encoding feature/entity data in Kafka messages. - StreamFormat message_format = 3; - } - - // Defines options for DataSource that sources features from Kinesis records. 
- // Each record should be a Protobuf that can be decoded with the generated - // Java Protobuf class at the given class path - message KinesisOptions { - // AWS region of the Kinesis stream - string region = 1; - - // Name of the Kinesis stream to obtain feature data from. - string stream_name = 2; - - // Defines the data format encoding the feature/entity data in Kinesis records. - // Kinesis Data Sources support Avro and Proto as data formats. - StreamFormat record_format = 3; - } - - // Defines options for DataSource that sources features from a Redshift Query - message RedshiftOptions { - // Redshift table name - string table = 1; - - // SQL query that returns a table containing feature data. Must contain an event_timestamp column, and respective - // entity columns - string query = 2; - - // Redshift schema name - string schema = 3; - } - - // Defines options for DataSource that sources features from a Snowflake Query - message SnowflakeOptions { - // Snowflake table name - string table = 1; - - // SQL query that returns a table containing feature data. Must contain an event_timestamp column, and respective - // entity columns - string query = 2; - - // Snowflake schema name - string schema = 3; - - // Snowflake schema name - string database = 4; - } - - // Defines configuration for custom third-party data sources. - message CustomSourceOptions { - // Serialized configuration information for the data source. The implementer of the custom data source is - // responsible for serializing and deserializing data from bytes - bytes configuration = 1; - } - - // Defines options for DataSource that sources features from request data - message RequestDataOptions { - // Name of the request data source - string name = 1; - - // Mapping of feature name to type - map schema = 2; - } - - // DataSource options. - oneof options { - FileOptions file_options = 11; - BigQueryOptions bigquery_options = 12; - KafkaOptions kafka_options = 13; - KinesisOptions kinesis_options = 14; - RedshiftOptions redshift_options = 15; - RequestDataOptions request_data_options = 18; - CustomSourceOptions custom_options = 16; - SnowflakeOptions snowflake_options = 19; - } -} diff --git a/java/datatypes/src/main/proto/feast/core/DatastoreTable.proto b/java/datatypes/src/main/proto/feast/core/DatastoreTable.proto deleted file mode 100644 index 15720ad809..0000000000 --- a/java/datatypes/src/main/proto/feast/core/DatastoreTable.proto +++ /dev/null @@ -1,39 +0,0 @@ -// -// * Copyright 2021 The Feast Authors -// * -// * Licensed under the Apache License, Version 2.0 (the "License"); -// * you may not use this file except in compliance with the License. -// * You may obtain a copy of the License at -// * -// * https://www.apache.org/licenses/LICENSE-2.0 -// * -// * Unless required by applicable law or agreed to in writing, software -// * distributed under the License is distributed on an "AS IS" BASIS, -// * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// * See the License for the specific language governing permissions and -// * limitations under the License. 
-// - -syntax = "proto3"; - -package feast.core; -option java_package = "feast.proto.core"; -option java_outer_classname = "DatastoreTableProto"; -option go_package = "github.com/feast-dev/feast/sdk/go/protos/feast/core"; - -import "google/protobuf/wrappers.proto"; - -// Represents a Datastore table -message DatastoreTable { - // Feast project of the table - string project = 1; - - // Name of the table - string name = 2; - - // GCP project id - google.protobuf.StringValue project_id = 3; - - // Datastore namespace - google.protobuf.StringValue namespace = 4; -} \ No newline at end of file diff --git a/java/datatypes/src/main/proto/feast/core/DynamoDBTable.proto b/java/datatypes/src/main/proto/feast/core/DynamoDBTable.proto deleted file mode 100644 index 1ab77febbd..0000000000 --- a/java/datatypes/src/main/proto/feast/core/DynamoDBTable.proto +++ /dev/null @@ -1,31 +0,0 @@ -// -// * Copyright 2021 The Feast Authors -// * -// * Licensed under the Apache License, Version 2.0 (the "License"); -// * you may not use this file except in compliance with the License. -// * You may obtain a copy of the License at -// * -// * https://www.apache.org/licenses/LICENSE-2.0 -// * -// * Unless required by applicable law or agreed to in writing, software -// * distributed under the License is distributed on an "AS IS" BASIS, -// * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// * See the License for the specific language governing permissions and -// * limitations under the License. -// - -syntax = "proto3"; - -package feast.core; -option java_package = "feast.proto.core"; -option java_outer_classname = "DynamoDBTableProto"; -option go_package = "github.com/feast-dev/feast/sdk/go/protos/feast/core"; - -// Represents a DynamoDB table -message DynamoDBTable { - // Name of the table - string name = 1; - - // Region of the table - string region = 2; -} \ No newline at end of file diff --git a/java/datatypes/src/main/proto/feast/core/Entity.proto b/java/datatypes/src/main/proto/feast/core/Entity.proto deleted file mode 100644 index cd54c64922..0000000000 --- a/java/datatypes/src/main/proto/feast/core/Entity.proto +++ /dev/null @@ -1,60 +0,0 @@ -// -// * Copyright 2020 The Feast Authors -// * -// * Licensed under the Apache License, Version 2.0 (the "License"); -// * you may not use this file except in compliance with the License. -// * You may obtain a copy of the License at -// * -// * https://www.apache.org/licenses/LICENSE-2.0 -// * -// * Unless required by applicable law or agreed to in writing, software -// * distributed under the License is distributed on an "AS IS" BASIS, -// * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// * See the License for the specific language governing permissions and -// * limitations under the License. -// - -syntax = "proto3"; - -package feast.core; -option java_package = "feast.proto.core"; -option java_outer_classname = "EntityProto"; -option go_package = "github.com/feast-dev/feast/sdk/go/protos/feast/core"; - -import "feast/types/Value.proto"; -import "google/protobuf/timestamp.proto"; - -message Entity { - // User-specified specifications of this entity. - EntitySpecV2 spec = 1; - // System-populated metadata for this entity. - EntityMeta meta = 2; -} - -message EntitySpecV2 { - // Name of the entity. - string name = 1; - - // Name of Feast project that this feature table belongs to. - string project = 9; - - // Type of the entity. - feast.types.ValueType.Enum value_type = 2; - - // Description of the entity. 
- string description = 3; - - // Join key for the entity (i.e. name of the column the entity maps to). - string join_key = 4; - - // User defined metadata - map tags = 8; - - // Owner of the entity. - string owner = 10; -} - -message EntityMeta { - google.protobuf.Timestamp created_timestamp = 1; - google.protobuf.Timestamp last_updated_timestamp = 2; -} diff --git a/java/datatypes/src/main/proto/feast/core/Feature.proto b/java/datatypes/src/main/proto/feast/core/Feature.proto deleted file mode 100644 index ea0d340a00..0000000000 --- a/java/datatypes/src/main/proto/feast/core/Feature.proto +++ /dev/null @@ -1,36 +0,0 @@ -// -// Copyright 2020 The Feast Authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// - -syntax = "proto3"; -package feast.core; - - -option go_package = "github.com/feast-dev/feast/sdk/go/protos/feast/core"; -option java_outer_classname = "FeatureProto"; -option java_package = "feast.proto.core"; - -import "feast/types/Value.proto"; - -message FeatureSpecV2 { - // Name of the feature. Not updatable. - string name = 1; - - // Value type of the feature. Not updatable. - feast.types.ValueType.Enum value_type = 2; - - // Labels for user defined metadata on a feature - map labels = 3; -} diff --git a/java/datatypes/src/main/proto/feast/core/FeatureService.proto b/java/datatypes/src/main/proto/feast/core/FeatureService.proto deleted file mode 100644 index 4aaa0d5f06..0000000000 --- a/java/datatypes/src/main/proto/feast/core/FeatureService.proto +++ /dev/null @@ -1,48 +0,0 @@ -syntax = "proto3"; -package feast.core; - -option go_package = "github.com/feast-dev/feast/sdk/go/protos/feast/core"; -option java_outer_classname = "FeatureServiceProto"; -option java_package = "feast.proto.core"; - -import "google/protobuf/timestamp.proto"; -import "feast/core/FeatureViewProjection.proto"; - -message FeatureService { - // User-specified specifications of this feature service. - FeatureServiceSpec spec = 1; - - // System-populated metadata for this feature service. - FeatureServiceMeta meta = 2; -} - -message FeatureServiceSpec { - // Name of the Feature Service. Must be unique. Not updated. - string name = 1; - - // Name of Feast project that this Feature Service belongs to. - string project = 2; - - // Represents a projection that's to be applied on top of the FeatureView. - // Contains data such as the features to use from a FeatureView. - repeated FeatureViewProjection features = 3; - - // User defined metadata - map tags = 4; - - // Description of the feature service. - string description = 5; - - // Owner of the feature service. 
- string owner = 6; -} - - -message FeatureServiceMeta { - // Time where this Feature Service is created - google.protobuf.Timestamp created_timestamp = 1; - - // Time where this Feature Service is last updated - google.protobuf.Timestamp last_updated_timestamp = 2; - -} diff --git a/java/datatypes/src/main/proto/feast/core/FeatureTable.proto b/java/datatypes/src/main/proto/feast/core/FeatureTable.proto deleted file mode 100644 index 661f4eecfc..0000000000 --- a/java/datatypes/src/main/proto/feast/core/FeatureTable.proto +++ /dev/null @@ -1,86 +0,0 @@ -// -// Copyright 2020 The Feast Authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// - - -syntax = "proto3"; -package feast.core; - - -option go_package = "github.com/feast-dev/feast/sdk/go/protos/feast/core"; -option java_outer_classname = "FeatureTableProto"; -option java_package = "feast.proto.core"; - -import "google/protobuf/duration.proto"; -import "google/protobuf/timestamp.proto"; -import "feast/core/DataSource.proto"; -import "feast/core/Feature.proto"; - -message FeatureTable { - // User-specified specifications of this feature table. - FeatureTableSpec spec = 1; - - // System-populated metadata for this feature table. - FeatureTableMeta meta = 2; -} - -message FeatureTableSpec { - // Name of the feature table. Must be unique. Not updated. - string name = 1; - - // Name of Feast project that this feature table belongs to. - string project = 9; - - // List names of entities to associate with the Features defined in this - // Feature Table. Not updatable. - repeated string entities = 3; - - // List of features specifications for each feature defined with this feature table. - repeated FeatureSpecV2 features = 4; - - // User defined metadata - map labels = 5; - - // Features in this feature table can only be retrieved from online serving - // younger than max age. Age is measured as the duration of time between - // the feature's event timestamp and when the feature is retrieved - // Feature values outside max age will be returned as unset values and indicated to end user - google.protobuf.Duration max_age = 6; - - // Batch/Offline DataSource to source batch/offline feature data. - // Only batch DataSource can be specified - // (ie source type should start with 'BATCH_') - DataSource batch_source = 7; - - // Stream/Online DataSource to source stream/online feature data. - // Only stream DataSource can be specified - // (ie source type should start with 'STREAM_') - DataSource stream_source = 8; -} - -message FeatureTableMeta { - // Time where this Feature Table is created - google.protobuf.Timestamp created_timestamp = 1; - - // Time where this Feature Table is last updated - google.protobuf.Timestamp last_updated_timestamp = 2; - - // Auto incrementing revision no. 
of this Feature Table - int64 revision = 3; - - // Hash entities, features, batch_source and stream_source to inform JobService if - // jobs should be restarted should hash change - string hash = 4; -} diff --git a/java/datatypes/src/main/proto/feast/core/FeatureView.proto b/java/datatypes/src/main/proto/feast/core/FeatureView.proto deleted file mode 100644 index 6edba9f7fe..0000000000 --- a/java/datatypes/src/main/proto/feast/core/FeatureView.proto +++ /dev/null @@ -1,85 +0,0 @@ -// -// Copyright 2020 The Feast Authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// - - -syntax = "proto3"; -package feast.core; - -option go_package = "github.com/feast-dev/feast/sdk/go/protos/feast/core"; -option java_outer_classname = "FeatureViewProto"; -option java_package = "feast.proto.core"; - -import "google/protobuf/duration.proto"; -import "google/protobuf/timestamp.proto"; -import "feast/core/DataSource.proto"; -import "feast/core/Feature.proto"; - -message FeatureView { - // User-specified specifications of this feature view. - FeatureViewSpec spec = 1; - - // System-populated metadata for this feature view. - FeatureViewMeta meta = 2; -} - -// TODO(adchia): refactor common fields from this and ODFV into separate metadata proto -message FeatureViewSpec { - // Name of the feature view. Must be unique. Not updated. - string name = 1; - - // Name of Feast project that this feature view belongs to. - string project = 2; - - // List names of entities to associate with the Features defined in this - // Feature View. Not updatable. - repeated string entities = 3; - - // List of features specifications for each feature defined with this feature view. - repeated FeatureSpecV2 features = 4; - - // User defined metadata - map tags = 5; - - // Features in this feature view can only be retrieved from online serving - // younger than ttl. Ttl is measured as the duration of time between - // the feature's event timestamp and when the feature is retrieved - // Feature values outside ttl will be returned as unset values and indicated to end user - google.protobuf.Duration ttl = 6; - - // Batch/Offline DataSource where this view can retrieve offline feature data. - DataSource batch_source = 7; - // Streaming DataSource from where this view can consume "online" feature data. - DataSource stream_source = 9; - - // Whether these features should be served online or not - bool online = 8; -} - -message FeatureViewMeta { - // Time where this Feature View is created - google.protobuf.Timestamp created_timestamp = 1; - - // Time where this Feature View is last updated - google.protobuf.Timestamp last_updated_timestamp = 2; - - // List of pairs (start_time, end_time) for which this feature view has been materialized. 
- repeated MaterializationInterval materialization_intervals = 3; -} - -message MaterializationInterval { - google.protobuf.Timestamp start_time = 1; - google.protobuf.Timestamp end_time = 2; -} diff --git a/java/datatypes/src/main/proto/feast/core/FeatureViewProjection.proto b/java/datatypes/src/main/proto/feast/core/FeatureViewProjection.proto deleted file mode 100644 index e81d8dad01..0000000000 --- a/java/datatypes/src/main/proto/feast/core/FeatureViewProjection.proto +++ /dev/null @@ -1,25 +0,0 @@ -syntax = "proto3"; -package feast.core; - -option go_package = "github.com/feast-dev/feast/sdk/go/protos/feast/core"; -option java_outer_classname = "FeatureReferenceProto"; -option java_package = "feast.proto.core"; - -import "feast/core/Feature.proto"; - - -// A projection to be applied on top of a FeatureView. -// Contains the modifications to a FeatureView such as the features subset to use. -message FeatureViewProjection { - // The feature view name - string feature_view_name = 1; - - // Alias for feature view name - string feature_view_name_alias = 3; - - // The features of the feature view that are a part of the feature reference. - repeated FeatureSpecV2 feature_columns = 2; - - // Map for entity join_key overrides of feature data entity join_key to entity data join_key - map join_key_map = 4; -} diff --git a/java/datatypes/src/main/proto/feast/core/InfraObject.proto b/java/datatypes/src/main/proto/feast/core/InfraObject.proto deleted file mode 100644 index 863f1b64da..0000000000 --- a/java/datatypes/src/main/proto/feast/core/InfraObject.proto +++ /dev/null @@ -1,51 +0,0 @@ -// -// * Copyright 2021 The Feast Authors -// * -// * Licensed under the Apache License, Version 2.0 (the "License"); -// * you may not use this file except in compliance with the License. -// * You may obtain a copy of the License at -// * -// * https://www.apache.org/licenses/LICENSE-2.0 -// * -// * Unless required by applicable law or agreed to in writing, software -// * distributed under the License is distributed on an "AS IS" BASIS, -// * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// * See the License for the specific language governing permissions and -// * limitations under the License. 
-// - -syntax = "proto3"; - -package feast.core; -option java_package = "feast.proto.core"; -option java_outer_classname = "InfraObjectProto"; -option go_package = "github.com/feast-dev/feast/sdk/go/protos/feast/core"; - -import "feast/core/DatastoreTable.proto"; -import "feast/core/DynamoDBTable.proto"; -import "feast/core/SqliteTable.proto"; - -// Represents a set of infrastructure objects managed by Feast -message Infra { - // List of infrastructure objects managed by Feast - repeated InfraObject infra_objects = 1; -} - -// Represents a single infrastructure object managed by Feast -message InfraObject { - // Represents the Python class for the infrastructure object - string infra_object_class_type = 1; - - // The infrastructure object - oneof infra_object { - DynamoDBTable dynamodb_table = 2; - DatastoreTable datastore_table = 3; - SqliteTable sqlite_table = 4; - CustomInfra custom_infra = 100; - } - - // Allows for custom infra objects to be added - message CustomInfra { - bytes field = 1; - } -} \ No newline at end of file diff --git a/java/datatypes/src/main/proto/feast/core/OnDemandFeatureView.proto b/java/datatypes/src/main/proto/feast/core/OnDemandFeatureView.proto deleted file mode 100644 index 58feff5bfd..0000000000 --- a/java/datatypes/src/main/proto/feast/core/OnDemandFeatureView.proto +++ /dev/null @@ -1,78 +0,0 @@ -// -// Copyright 2020 The Feast Authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// - - -syntax = "proto3"; -package feast.core; - -option go_package = "github.com/feast-dev/feast/sdk/go/protos/feast/core"; -option java_outer_classname = "OnDemandFeatureViewProto"; -option java_package = "feast.proto.core"; - -import "google/protobuf/timestamp.proto"; -import "feast/core/FeatureView.proto"; -import "feast/core/FeatureViewProjection.proto"; -import "feast/core/Feature.proto"; -import "feast/core/DataSource.proto"; - -message OnDemandFeatureView { - // User-specified specifications of this feature view. - OnDemandFeatureViewSpec spec = 1; - OnDemandFeatureViewMeta meta = 2; -} - -message OnDemandFeatureViewSpec { - // Name of the feature view. Must be unique. Not updated. - string name = 1; - - // Name of Feast project that this feature view belongs to. - string project = 2; - - // List of features specifications for each feature defined with this feature view. - repeated FeatureSpecV2 features = 3; - - // Map of inputs for this feature view. - map inputs = 4; - - UserDefinedFunction user_defined_function = 5; - - -} - -message OnDemandFeatureViewMeta { - // Time where this Feature View is created - google.protobuf.Timestamp created_timestamp = 1; - - // Time where this Feature View is last updated - google.protobuf.Timestamp last_updated_timestamp = 2; -} - -message OnDemandInput { - oneof input { - FeatureView feature_view = 1; - FeatureViewProjection feature_view_projection = 3; - DataSource request_data_source = 2; - } -} - -// Serialized representation of python function. 
-message UserDefinedFunction { - // The function name - string name = 1; - - // The python-syntax function body (serialized by dill) - bytes body = 2; -} diff --git a/java/datatypes/src/main/proto/feast/core/Registry.proto b/java/datatypes/src/main/proto/feast/core/Registry.proto deleted file mode 100644 index 3deeb97238..0000000000 --- a/java/datatypes/src/main/proto/feast/core/Registry.proto +++ /dev/null @@ -1,48 +0,0 @@ -// -// * Copyright 2020 The Feast Authors -// * -// * Licensed under the Apache License, Version 2.0 (the "License"); -// * you may not use this file except in compliance with the License. -// * You may obtain a copy of the License at -// * -// * https://www.apache.org/licenses/LICENSE-2.0 -// * -// * Unless required by applicable law or agreed to in writing, software -// * distributed under the License is distributed on an "AS IS" BASIS, -// * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// * See the License for the specific language governing permissions and -// * limitations under the License. -// - -syntax = "proto3"; - -package feast.core; -option java_package = "feast.proto.core"; -option java_outer_classname = "RegistryProto"; -option go_package = "github.com/feast-dev/feast/sdk/go/protos/feast/core"; - -import "feast/core/Entity.proto"; -import "feast/core/FeatureService.proto"; -import "feast/core/FeatureTable.proto"; -import "feast/core/FeatureView.proto"; -import "feast/core/InfraObject.proto"; -import "feast/core/OnDemandFeatureView.proto"; -import "feast/core/RequestFeatureView.proto"; -import "feast/core/SavedDataset.proto"; -import "google/protobuf/timestamp.proto"; - -message Registry { - repeated Entity entities = 1; - repeated FeatureTable feature_tables = 2; - repeated FeatureView feature_views = 6; - repeated OnDemandFeatureView on_demand_feature_views = 8; - repeated RequestFeatureView request_feature_views = 9; - repeated FeatureService feature_services = 7; - repeated SavedDataset saved_datasets = 11; - Infra infra = 10; - - string registry_schema_version = 3; // to support migrations; incremented when schema is changed - string version_id = 4; // version id, random string generated on each update of the data; now used only for debugging purposes - google.protobuf.Timestamp last_updated = 5; - -} diff --git a/java/datatypes/src/main/proto/feast/core/RequestFeatureView.proto b/java/datatypes/src/main/proto/feast/core/RequestFeatureView.proto deleted file mode 100644 index c9ee540e6f..0000000000 --- a/java/datatypes/src/main/proto/feast/core/RequestFeatureView.proto +++ /dev/null @@ -1,43 +0,0 @@ -// -// Copyright 2021 The Feast Authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// - - -syntax = "proto3"; -package feast.core; - -option go_package = "github.com/feast-dev/feast/sdk/go/protos/feast/core"; -option java_outer_classname = "RequestFeatureViewProto"; -option java_package = "feast.proto.core"; - -import "feast/core/FeatureView.proto"; -import "feast/core/Feature.proto"; -import "feast/core/DataSource.proto"; - -message RequestFeatureView { - // User-specified specifications of this feature view. - RequestFeatureViewSpec spec = 1; -} - -message RequestFeatureViewSpec { - // Name of the feature view. Must be unique. Not updated. - string name = 1; - - // Name of Feast project that this feature view belongs to. - string project = 2; - - // Request data which contains the underlying data schema and list of associated features - DataSource request_data_source = 3; -} diff --git a/java/datatypes/src/main/proto/feast/core/SavedDataset.proto b/java/datatypes/src/main/proto/feast/core/SavedDataset.proto deleted file mode 100644 index ebd2e56d35..0000000000 --- a/java/datatypes/src/main/proto/feast/core/SavedDataset.proto +++ /dev/null @@ -1,77 +0,0 @@ -// -// Copyright 2021 The Feast Authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// - - -syntax = "proto3"; - -package feast.core; -option java_package = "feast.proto.core"; -option java_outer_classname = "SavedDatasetProto"; -option go_package = "github.com/feast-dev/feast/sdk/go/protos/feast/core"; - -import "google/protobuf/timestamp.proto"; -import "feast/core/FeatureViewProjection.proto"; -import "feast/core/DataSource.proto"; - -message SavedDatasetSpec { - // Name of the dataset. Must be unique since it's possible to overwrite dataset by name - string name = 1; - - // Name of Feast project that this Dataset belongs to. 
- string project = 2; - - // list of feature references with format ":" - repeated string features = 3; - - // entity columns + request columns from all feature views used during retrieval - repeated string join_keys = 4; - - // Whether full feature names are used in stored data - bool full_feature_names = 5; - - SavedDatasetStorage storage = 6; - - // User defined metadata - map tags = 7; -} - -message SavedDatasetStorage { - oneof kind { - DataSource.FileOptions file_storage = 4; - DataSource.BigQueryOptions bigquery_storage = 5; - DataSource.RedshiftOptions redshift_storage = 6; - DataSource.SnowflakeOptions snowflake_storage = 7; - } -} - -message SavedDatasetMeta { - // Time when this saved dataset is created - google.protobuf.Timestamp created_timestamp = 1; - - // Time when this saved dataset is last updated - google.protobuf.Timestamp last_updated_timestamp = 2; - - // Min timestamp in the dataset (needed for retrieval) - google.protobuf.Timestamp min_event_timestamp = 3; - - // Max timestamp in the dataset (needed for retrieval) - google.protobuf.Timestamp max_event_timestamp = 4; -} - -message SavedDataset { - SavedDatasetSpec spec = 1; - SavedDatasetMeta meta = 2; -} diff --git a/java/datatypes/src/main/proto/feast/core/SqliteTable.proto b/java/datatypes/src/main/proto/feast/core/SqliteTable.proto deleted file mode 100644 index 1732931b8f..0000000000 --- a/java/datatypes/src/main/proto/feast/core/SqliteTable.proto +++ /dev/null @@ -1,31 +0,0 @@ -// -// * Copyright 2021 The Feast Authors -// * -// * Licensed under the Apache License, Version 2.0 (the "License"); -// * you may not use this file except in compliance with the License. -// * You may obtain a copy of the License at -// * -// * https://www.apache.org/licenses/LICENSE-2.0 -// * -// * Unless required by applicable law or agreed to in writing, software -// * distributed under the License is distributed on an "AS IS" BASIS, -// * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// * See the License for the specific language governing permissions and -// * limitations under the License. -// - -syntax = "proto3"; - -package feast.core; -option java_package = "feast.proto.core"; -option java_outer_classname = "SqliteTableProto"; -option go_package = "github.com/feast-dev/feast/sdk/go/protos/feast/core"; - -// Represents a Sqlite table -message SqliteTable { - // Absolute path of the table - string path = 1; - - // Name of the table - string name = 2; -} \ No newline at end of file diff --git a/java/datatypes/src/main/proto/feast/core/Store.proto b/java/datatypes/src/main/proto/feast/core/Store.proto deleted file mode 100644 index 41a76a11c2..0000000000 --- a/java/datatypes/src/main/proto/feast/core/Store.proto +++ /dev/null @@ -1,130 +0,0 @@ -// -// * Copyright 2019 The Feast Authors -// * -// * Licensed under the Apache License, Version 2.0 (the "License"); -// * you may not use this file except in compliance with the License. -// * You may obtain a copy of the License at -// * -// * https://www.apache.org/licenses/LICENSE-2.0 -// * -// * Unless required by applicable law or agreed to in writing, software -// * distributed under the License is distributed on an "AS IS" BASIS, -// * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// * See the License for the specific language governing permissions and -// * limitations under the License. 
-// - -syntax = "proto3"; -package feast.core; - -option java_package = "feast.proto.core"; -option java_outer_classname = "StoreProto"; -option go_package = "github.com/feast-dev/feast/sdk/go/protos/feast/core"; - -// Store provides a location where Feast reads and writes feature values. -// Feature values will be written to the Store in the form of FeatureRow elements. -// The way FeatureRow is encoded and decoded when it is written to and read from -// the Store depends on the type of the Store. -// -message Store { - - enum StoreType { - // These positions should not be reused. - reserved 2, 3, 12, 13; - - INVALID = 0; - - // Redis stores a FeatureRow element as a key, value pair. - // - // The Redis data types used (https://redis.io/topics/data-types): - // - key: STRING - // - value: STRING - // - // Encodings: - // - key: byte array of RedisKey (refer to feast.storage.RedisKeyV2) - // - value: Redis hashmap - // - REDIS = 1; - - REDIS_CLUSTER = 4; - } - - message RedisConfig { - string host = 1; - int32 port = 2; - // Optional. The number of milliseconds to wait before retrying failed Redis connection. - // By default, Feast uses exponential backoff policy and "initial_backoff_ms" sets the initial wait duration. - int32 initial_backoff_ms = 3; - // Optional. Maximum total number of retries for connecting to Redis. Default to zero retries. - int32 max_retries = 4; - // Optional. How often flush data to redis - int32 flush_frequency_seconds = 5; - // Optional. Connect over SSL. - bool ssl = 6; - } - - message RedisClusterConfig { - // List of Redis Uri for all the nodes in Redis Cluster, comma separated. Eg. host1:6379, host2:6379 - string connection_string = 1; - int32 initial_backoff_ms = 2; - int32 max_retries = 3; - // Optional. How often flush data to redis - int32 flush_frequency_seconds = 4; - // Optional. Append a prefix to the Redis Key - string key_prefix = 5; - // Optional. Enable fallback to another key prefix if the original key is not present. - // Useful for migrating key prefix without re-ingestion. Disabled by default. - bool enable_fallback = 6; - // Optional. This would be the fallback prefix to use if enable_fallback is true. - string fallback_prefix = 7; - - // Optional. Priority of nodes when reading from cluster - enum ReadFrom { - MASTER = 0; - MASTER_PREFERRED = 1; - REPLICA = 2; - REPLICA_PREFERRED = 3; - } - ReadFrom read_from = 8; - } - - message Subscription { - // Name of project that the feature sets belongs to. This can be one of - // - [project_name] - // - * - // If an asterisk is provided, filtering on projects will be disabled. All projects will - // be matched. It is NOT possible to provide an asterisk with a string in order to do - // pattern matching. - string project = 3; - - // Name of the desired feature set. Asterisks can be used as wildcards in the name. - // Matching on names is only permitted if a specific project is defined. It is disallowed - // If the project name is set to "*" - // e.g. - // - * can be used to match all feature sets - // - my-feature-set* can be used to match all features prefixed by "my-feature-set" - // - my-feature-set-6 can be used to select a single feature set - string name = 1; - - // All matches with exclude enabled will be filtered out instead of added - bool exclude = 4; - - // Feature set version was removed in v0.5.0. - reserved 2; - } - - // Name of the store. - string name = 1; - - // Type of store. - StoreType type = 2; - - // Feature sets to subscribe to. 
- repeated Subscription subscriptions = 4; - - // Configuration to connect to the store. Required. - oneof config { - RedisConfig redis_config = 11; - RedisClusterConfig redis_cluster_config = 14; - } -} diff --git a/java/datatypes/src/main/proto/feast/core/ValidationProfile.proto b/java/datatypes/src/main/proto/feast/core/ValidationProfile.proto deleted file mode 100644 index 31c4e150a0..0000000000 --- a/java/datatypes/src/main/proto/feast/core/ValidationProfile.proto +++ /dev/null @@ -1,48 +0,0 @@ -// -// Copyright 2021 The Feast Authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// - - -syntax = "proto3"; - -package feast.core; -option java_package = "feast.proto.core"; -option java_outer_classname = "ValidationProfile"; -option go_package = "github.com/feast-dev/feast/sdk/go/protos/feast/core"; - -import "google/protobuf/timestamp.proto"; -import "feast/core/SavedDataset.proto"; - -message GEValidationProfiler { - message UserDefinedProfiler { - // The python-syntax function body (serialized by dill) - bytes body = 1; - } - - UserDefinedProfiler profiler = 1; -} - -message GEValidationProfile { - // JSON-serialized ExpectationSuite object - bytes expectation_suite = 1; -} - -message ValidationReference { - SavedDataset dataset = 1; - - oneof profiler { - GEValidationProfiler ge_profiler = 2; - } -} diff --git a/java/datatypes/src/main/proto/feast/serving/ServingService.proto b/java/datatypes/src/main/proto/feast/serving/ServingService.proto deleted file mode 100644 index 6c551a97ba..0000000000 --- a/java/datatypes/src/main/proto/feast/serving/ServingService.proto +++ /dev/null @@ -1,134 +0,0 @@ -/* - * Copyright 2018 The Feast Authors - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * https://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -syntax = "proto3"; - -package feast.serving; - -import "google/protobuf/timestamp.proto"; -import "feast/types/Value.proto"; - -option java_package = "feast.proto.serving"; -option java_outer_classname = "ServingAPIProto"; -option go_package = "github.com/feast-dev/feast/sdk/go/protos/feast/serving"; - -service ServingService { - // Get information about this Feast serving. - rpc GetFeastServingInfo (GetFeastServingInfoRequest) returns (GetFeastServingInfoResponse); - - // Get online features synchronously. - rpc GetOnlineFeatures (GetOnlineFeaturesRequest) returns (GetOnlineFeaturesResponse); -} - -message GetFeastServingInfoRequest {} - -message GetFeastServingInfoResponse { - // Feast version of this serving deployment. 
- string version = 1; -} - -message FeatureReferenceV2 { - // Name of the Feature View to retrieve the feature from. - string feature_view_name = 1; - - // Name of the Feature to retrieve the feature from. - string feature_name = 2; -} - -// ToDo (oleksii): remove this message (since it's not used) and move EntityRow on package level -message GetOnlineFeaturesRequestV2 { - // List of features that are being retrieved - repeated FeatureReferenceV2 features = 4; - - // List of entity rows, containing entity id and timestamp data. - // Used during retrieval of feature rows and for joining feature - // rows into a final dataset - repeated EntityRow entity_rows = 2; - - // Optional field to specify project name override. If specified, uses the - // given project for retrieval. Overrides the projects specified in - // Feature References if both are specified. - string project = 5; - - message EntityRow { - // Request timestamp of this row. This value will be used, - // together with maxAge, to determine feature staleness. - google.protobuf.Timestamp timestamp = 1; - - // Map containing mapping of entity name to entity value. - map fields = 2; - } -} - -// In JSON "val" field can be omitted -message FeatureList { - repeated string val = 1; -} - -message GetOnlineFeaturesRequest { - oneof kind { - string feature_service = 1; - FeatureList features = 2; - } - // The entity data is specified in a columnar format - // A map of entity name -> list of values - map entities = 3; - bool full_feature_names = 4; - - // Context for OnDemand Feature Transformation - // (was moved to dedicated parameter to avoid unnecessary separation logic on serving side) - // A map of variable name -> list of values - map request_context = 5; -} - -message GetOnlineFeaturesResponse { - GetOnlineFeaturesResponseMetadata metadata = 1; - - // Length of "results" array should match length of requested features. - // We also preserve the same order of features here as in metadata.feature_names - repeated FeatureVector results = 2; - - message FeatureVector { - repeated feast.types.Value values = 1; - repeated FieldStatus statuses = 2; - repeated google.protobuf.Timestamp event_timestamps = 3; - } -} - -message GetOnlineFeaturesResponseMetadata { - FeatureList feature_names = 1; -} - -enum FieldStatus { - // Status is unset for this field. - INVALID = 0; - - // Field value is present for this field and age is within max age. - PRESENT = 1; - - // Values could be found for entity key and age is within max age, but - // this field value is assigned a value on ingestion into feast. - NULL_VALUE = 2; - - // Entity key did not return any values as they do not exist in Feast. - // This could suggest that the feature values have not yet been ingested - // into feast or the ingestion failed. - NOT_FOUND = 3; - - // Values could be found for entity key, but field values are outside the maximum - // allowable range. - OUTSIDE_MAX_AGE = 4; -} diff --git a/java/datatypes/src/main/proto/feast/serving/TransformationService.proto b/java/datatypes/src/main/proto/feast/serving/TransformationService.proto deleted file mode 100644 index 113bd120c8..0000000000 --- a/java/datatypes/src/main/proto/feast/serving/TransformationService.proto +++ /dev/null @@ -1,67 +0,0 @@ -/* - * Copyright 2021 The Feast Authors - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * https://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -syntax = "proto3"; - -package feast.serving; - -option java_package = "feast.proto.serving"; -option java_outer_classname = "TransformationServiceAPIProto"; -option go_package = "github.com/feast-dev/feast/sdk/go/protos/feast/serving"; - -service TransformationService { - rpc GetTransformationServiceInfo (GetTransformationServiceInfoRequest) returns (GetTransformationServiceInfoResponse); - - rpc TransformFeatures (TransformFeaturesRequest) returns (TransformFeaturesResponse); -} - -message ValueType { - oneof value { - // Having a oneOf provides forward compatibility if we need to support compound types - // that are not supported by arrow natively. - bytes arrow_value = 1; - } -} - -message GetTransformationServiceInfoRequest {} - -message GetTransformationServiceInfoResponse { - // Feast version of this transformation service deployment. - string version = 1; - - // Type of transformation service deployment. This is either Python, or custom - TransformationServiceType type = 2; - - string transformation_service_type_details = 3; -} - -message TransformFeaturesRequest { - string on_demand_feature_view_name = 1; - string project = 2; - - ValueType transformation_input = 3; -} - -message TransformFeaturesResponse { - ValueType transformation_output = 3; -} - -enum TransformationServiceType { - TRANSFORMATION_SERVICE_TYPE_INVALID = 0; - TRANSFORMATION_SERVICE_TYPE_PYTHON = 1; - - TRANSFORMATION_SERVICE_TYPE_CUSTOM = 100; -} diff --git a/java/datatypes/src/main/proto/feast/storage/Redis.proto b/java/datatypes/src/main/proto/feast/storage/Redis.proto deleted file mode 100644 index a662e352f4..0000000000 --- a/java/datatypes/src/main/proto/feast/storage/Redis.proto +++ /dev/null @@ -1,34 +0,0 @@ -/* - * Copyright 2019 The Feast Authors - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * https://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -syntax = "proto3"; - -import "feast/types/Field.proto"; -import "feast/types/Value.proto"; - -package feast.storage; - -option java_outer_classname = "RedisProto"; -option java_package = "feast.proto.storage"; -option go_package = "github.com/feast-dev/feast/sdk/go/protos/feast/storage"; - -message RedisKeyV2 { - string project = 1; - - repeated string entity_names = 2; - - repeated feast.types.Value entity_values = 3; -} diff --git a/java/datatypes/src/main/proto/feast/third_party/grpc/health/v1/HealthService.proto b/java/datatypes/src/main/proto/feast/third_party/grpc/health/v1/HealthService.proto deleted file mode 100644 index 342db35d4c..0000000000 --- a/java/datatypes/src/main/proto/feast/third_party/grpc/health/v1/HealthService.proto +++ /dev/null @@ -1,24 +0,0 @@ -syntax = "proto3"; - -package grpc.health.v1; - -option java_package = "io.grpc.health.v1"; -option java_outer_classname = "HealthProto"; - -message HealthCheckRequest { - string service = 1; -} - -enum ServingStatus { - UNKNOWN = 0; - SERVING = 1; - NOT_SERVING = 2; -} - -message HealthCheckResponse { - ServingStatus status = 1; -} - -service Health { - rpc Check(HealthCheckRequest) returns (HealthCheckResponse); -} \ No newline at end of file diff --git a/java/datatypes/src/main/proto/feast/types/EntityKey.proto b/java/datatypes/src/main/proto/feast/types/EntityKey.proto deleted file mode 100644 index cbc3c55442..0000000000 --- a/java/datatypes/src/main/proto/feast/types/EntityKey.proto +++ /dev/null @@ -1,30 +0,0 @@ -/* - * Copyright 2018 The Feast Authors - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * https://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -syntax = "proto3"; - -import "feast/types/Value.proto"; - -package feast.types; - -option java_package = "feast.proto.types"; -option java_outer_classname = "EntityKeyProto"; -option go_package = "github.com/feast-dev/feast/sdk/go/protos/feast/types"; - -message EntityKey { - repeated string join_keys = 1; - repeated feast.types.Value entity_values = 2; -} diff --git a/java/datatypes/src/main/proto/feast/types/Field.proto b/java/datatypes/src/main/proto/feast/types/Field.proto deleted file mode 100644 index 3b8416c253..0000000000 --- a/java/datatypes/src/main/proto/feast/types/Field.proto +++ /dev/null @@ -1,30 +0,0 @@ -/* - * Copyright 2018 The Feast Authors - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * https://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -syntax = "proto3"; - -import "feast/types/Value.proto"; - -package feast.types; - -option java_package = "feast.proto.types"; -option java_outer_classname = "FieldProto"; -option go_package = "github.com/feast-dev/feast/sdk/go/protos/feast/types"; - -message Field { - string name = 1; - feast.types.Value value = 2; -} diff --git a/java/datatypes/src/main/proto/feast/types/Value.proto b/java/datatypes/src/main/proto/feast/types/Value.proto deleted file mode 100644 index b00d4d9b84..0000000000 --- a/java/datatypes/src/main/proto/feast/types/Value.proto +++ /dev/null @@ -1,109 +0,0 @@ -/* - * Copyright 2018 The Feast Authors - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * https://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -syntax = "proto3"; - -package feast.types; - -option java_package = "feast.proto.types"; -option java_outer_classname = "ValueProto"; -option go_package = "github.com/feast-dev/feast/sdk/go/protos/feast/types"; - -message ValueType { - enum Enum { - INVALID = 0; - BYTES = 1; - STRING = 2; - INT32 = 3; - INT64 = 4; - DOUBLE = 5; - FLOAT = 6; - BOOL = 7; - UNIX_TIMESTAMP = 8; - BYTES_LIST = 11; - STRING_LIST = 12; - INT32_LIST = 13; - INT64_LIST = 14; - DOUBLE_LIST = 15; - FLOAT_LIST = 16; - BOOL_LIST = 17; - UNIX_TIMESTAMP_LIST = 18; - NULL = 19; - } -} - -message Value { - // ValueType is referenced by the metadata types, FeatureInfo and EntityInfo. - // The enum values do not have to match the oneof val field ids, but they should. 
- // In JSON "*_val" field can be omitted - oneof val { - bytes bytes_val = 1; - string string_val = 2; - int32 int32_val = 3; - int64 int64_val = 4; - double double_val = 5; - float float_val = 6; - bool bool_val = 7; - int64 unix_timestamp_val = 8; - BytesList bytes_list_val = 11; - StringList string_list_val = 12; - Int32List int32_list_val = 13; - Int64List int64_list_val = 14; - DoubleList double_list_val = 15; - FloatList float_list_val = 16; - BoolList bool_list_val = 17; - Int64List unix_timestamp_list_val = 18; - Null null_val = 19; - } -} - -enum Null { - NULL = 0; -} - -message BytesList { - repeated bytes val = 1; -} - -message StringList { - repeated string val = 1; -} - -message Int32List { - repeated int32 val = 1; -} - -message Int64List { - repeated int64 val = 1; -} - -message DoubleList { - repeated double val = 1; -} - -message FloatList { - repeated float val = 1; -} - -message BoolList { - repeated bool val = 1; -} - -// This is to avoid an issue of being unable to specify `repeated value` in oneofs or maps -// In JSON "val" field can be omitted -message RepeatedValue { - repeated Value val = 1; -} \ No newline at end of file From b02e51e1f5f7125ba7bba62dfa6598a7e550c518 Mon Sep 17 00:00:00 2001 From: Kevin Zhang Date: Wed, 2 Mar 2022 16:54:43 -0800 Subject: [PATCH 37/58] Fix accidental changes Signed-off-by: Kevin Zhang --- java/datatypes/src/main/proto/feast | 3 +-- .../tests/integration/registration/test_universal_types.py | 1 + 2 files changed, 2 insertions(+), 2 deletions(-) mode change 100644 => 120000 java/datatypes/src/main/proto/feast diff --git a/java/datatypes/src/main/proto/feast b/java/datatypes/src/main/proto/feast deleted file mode 100644 index 4501c0df4b..0000000000 --- a/java/datatypes/src/main/proto/feast +++ /dev/null @@ -1,2 +0,0 @@ -../../../../../protos/feast - diff --git a/java/datatypes/src/main/proto/feast b/java/datatypes/src/main/proto/feast new file mode 120000 index 0000000000..463e4045de --- /dev/null +++ b/java/datatypes/src/main/proto/feast @@ -0,0 +1 @@ +../../../../../protos/feast \ No newline at end of file diff --git a/sdk/python/tests/integration/registration/test_universal_types.py b/sdk/python/tests/integration/registration/test_universal_types.py index e2f2e9df56..59ca119f98 100644 --- a/sdk/python/tests/integration/registration/test_universal_types.py +++ b/sdk/python/tests/integration/registration/test_universal_types.py @@ -187,6 +187,7 @@ def test_feature_get_historical_features_types_match(offline_types_test_fixtures # Note: Pandas doesn't play well with nan values in ints. BQ will also coerce to floats if there are NaNs historical_features_df = historical_features.to_df() print(historical_features_df) + if config.feature_is_list: assert_feature_list_types( environment.test_repo_config.provider, From a161fad341ddda31eaa8a6f27d9820d20cd9d84e Mon Sep 17 00:00:00 2001 From: Kevin Zhang Date: Wed, 2 Mar 2022 17:40:44 -0800 Subject: [PATCH 38/58] Make type map change cleaner Signed-off-by: Kevin Zhang --- sdk/python/feast/type_map.py | 49 ++++++++++++++++++++++-------------- 1 file changed, 30 insertions(+), 19 deletions(-) diff --git a/sdk/python/feast/type_map.py b/sdk/python/feast/type_map.py index 64d9c485b0..c5af8251fb 100644 --- a/sdk/python/feast/type_map.py +++ b/sdk/python/feast/type_map.py @@ -328,17 +328,20 @@ def _python_value_to_proto_value( ProtoValue(unix_timestamp_list_val=Int64List(val=ts)) # type: ignore for ts in int_timestamps_lists ] - # TODO: Make this better. 
- val_list = [] - for value in values: - if value.dtype == "bool": - value = [bool(e) for e in value] - val_list.append(ProtoValue(**{field_name: proto_type(val=value)})) # type: ignore - elif value is not None: - val_list.append(ProtoValue(**{field_name: proto_type(val=value)})) # type: ignore - else: - val_list.append(ProtoValue()) # type: ignore - return val_list + if feast_value_type == ValueType.BOOL_LIST: + # ProtoValue no longer supports conversion of np.bool_ so we need it convert it. + return [ + ProtoValue(**{field_name: proto_type(val=[bool(e) for e in value])}) # type: ignore + if value is not None + else ProtoValue() + for value in values + ] + return [ + ProtoValue(**{field_name: proto_type(val=value)}) # type: ignore + if value is not None + else ProtoValue() + for value in values + ] # Handle scalar types below else: @@ -351,14 +354,15 @@ def _python_value_to_proto_value( # ProtoValue does actually accept `np.int_` but the typing complains. return [ProtoValue(unix_timestamp_val=ts) for ts in int_timestamps] # type: ignore - if feast_value_type in PYTHON_SCALAR_VALUE_TYPE_TO_PROTO_VALUE: - ( - field_name, - func, - valid_scalar_types, - ) = PYTHON_SCALAR_VALUE_TYPE_TO_PROTO_VALUE[feast_value_type] - if valid_scalar_types: - assert type(sample) in valid_scalar_types + ( + field_name, + func, + valid_scalar_types, + ) = PYTHON_SCALAR_VALUE_TYPE_TO_PROTO_VALUE[feast_value_type] + if valid_scalar_types: + assert type(sample) in valid_scalar_types + if feast_value_type == ValueType.BOOL: + # ProtoValue no longer supports conversion of np.bool_ so we need it convert it. return [ ProtoValue( **{ @@ -371,6 +375,13 @@ def _python_value_to_proto_value( else ProtoValue() for value in values ] + if feast_value_type in PYTHON_SCALAR_VALUE_TYPE_TO_PROTO_VALUE: + return [ + ProtoValue(**{field_name: func(value)}) + if not pd.isnull(value) + else ProtoValue() + for value in values + ] raise Exception(f"Unsupported data type: ${str(type(values[0]))}") From f7c618a1bddbbea3c971351800d501d34d8b156b Mon Sep 17 00:00:00 2001 From: Kevin Zhang Date: Thu, 3 Mar 2022 10:36:13 -0800 Subject: [PATCH 39/58] Address review comments Signed-off-by: Kevin Zhang --- docs/reference/data-sources/spark.md | 5 +- docs/reference/offline-stores/README.md | 2 +- docs/reference/offline-stores/spark.md | 4 +- .../third_party/spark_offline_store/spark.py | 61 ++-------- .../spark_offline_store/spark_source.py | 105 ++++-------------- .../feature_repos/repo_configuration.py | 8 +- .../data_sources/spark_data_source_creator.py | 1 - 7 files changed, 31 insertions(+), 155 deletions(-) diff --git a/docs/reference/data-sources/spark.md b/docs/reference/data-sources/spark.md index 9499f9a310..25b69c7355 100644 --- a/docs/reference/data-sources/spark.md +++ b/docs/reference/data-sources/spark.md @@ -1,4 +1,4 @@ -# Snowflake +# Spark ## Description @@ -43,6 +43,3 @@ my_spark_source = SparkSource( created_timestamp_column="created", ) ``` - - -Configuration options are available [here](https://rtd.feast.dev/en/latest/index.html#feast.data_source.SnowflakeSource). 
diff --git a/docs/reference/offline-stores/README.md b/docs/reference/offline-stores/README.md index 087c6918f1..b3c85470b9 100644 --- a/docs/reference/offline-stores/README.md +++ b/docs/reference/offline-stores/README.md @@ -10,4 +10,4 @@ Please see [Offline Store](../../getting-started/architecture-and-components/off {% page-ref page="redshift.md" %} -{% page-ref page="snowflake.md" %} +{% page-ref page="spark.md" %} diff --git a/docs/reference/offline-stores/spark.md b/docs/reference/offline-stores/spark.md index 3a37d0b185..48ddf46d17 100644 --- a/docs/reference/offline-stores/spark.md +++ b/docs/reference/offline-stores/spark.md @@ -8,15 +8,13 @@ The Spark offline store is an offline store currently in alpha development that This Spark offline store still does not achieve full test coverage and continues to fail some integration tests when integrating with the feast universal test suite. Please do NOT assume complete stability of the API. -As of 3/1/2022, 179/194 integration tests pass. - * Spark tables and views are allowed as sources that are loaded in from some Spark store(e.g in Hive or in memory). * Entity dataframes can be provided as a SQL query or can be provided as a Pandas dataframe. Pandas dataframes will be converted to a Spark dataframe and processed as a temporary view. * A `SparkRetrievalJob` is returned when calling `get_historical_features()`. * This allows you to call * `to_df` to retrieve the pandas dataframe. * `to_arrow` to retrieve the dataframe as a pyarrow Table. - * `to_spark-df` to retrieve the dataframe the spark. + * `to_spark_df` to retrieve the dataframe the spark. ## Example diff --git a/sdk/python/feast/infra/offline_stores/third_party/spark_offline_store/spark.py b/sdk/python/feast/infra/offline_stores/third_party/spark_offline_store/spark.py index 5f142a2de8..59e54ef807 100644 --- a/sdk/python/feast/infra/offline_stores/third_party/spark_offline_store/spark.py +++ b/sdk/python/feast/infra/offline_stores/third_party/spark_offline_store/spark.py @@ -62,7 +62,7 @@ def pull_latest_from_table_or_query( warnings.warn( "The spark offline store is an experimental feature in alpha development. " - "This API is unstable and it could and most probably will be changed in the future.", + "Some functionality may still be unstable so functionality can change in the future.", RuntimeWarning, ) @@ -116,7 +116,7 @@ def get_historical_features( assert isinstance(config.offline_store, SparkOfflineStoreConfig) warnings.warn( "The spark offline store is an experimental feature in alpha development. 
" - "This API is unstable and it could and most probably will be changed in the future.", + "Some functionality may still be unstable so functionality can change in the future.", RuntimeWarning, ) spark_session = get_spark_session_or_start_new_with_repoconfig( @@ -223,7 +223,6 @@ def pull_all_from_table_or_query( ) -# TODO fix internal abstract methods _to_df_internal _to_arrow_internal class SparkRetrievalJob(RetrievalJob): def __init__( self, @@ -236,17 +235,9 @@ def __init__( super().__init__() self.spark_session = spark_session self.query = query - self._full_feature_names = full_feature_names - self._on_demand_feature_views = on_demand_feature_views - self._metadata = metadata - - @property - def full_feature_names(self) -> bool: - return self._full_feature_names - - @property - def on_demand_feature_views(self) -> Optional[List[OnDemandFeatureView]]: - return self._on_demand_feature_views + self.full_feature_names = full_feature_names + self.on_demand_feature_views = on_demand_feature_views + self.metadata = metadata def to_spark_df(self) -> pyspark.sql.DataFrame: statements = self.query.split( @@ -270,14 +261,6 @@ def persist(self, storage: SavedDatasetStorage): """ pass - @property - def metadata(self) -> Optional[RetrievalMetadata]: - """ - Return metadata information about retrieval. - Should be available even before materializing the dataset itself. - """ - return self._metadata - def get_spark_session_or_start_new_with_repoconfig( store_config: SparkOfflineStoreConfig, @@ -289,12 +272,12 @@ def get_spark_session_or_start_new_with_repoconfig( if spark_conf: spark_builder = spark_builder.config( conf=SparkConf().setAll([(k, v) for k, v in spark_conf.items()]) - ) # noqa + ) spark_session = spark_builder.getOrCreate() spark_session.conf.set( "spark.sql.parser.quotedRegexColumnNames", "true" - ) # important! + ) return spark_session @@ -319,6 +302,7 @@ def _get_entity_df_event_timestamp_range( # If the entity_df is a string (SQL query), determine range # from table df = spark_session.sql(entity_df).select(entity_df_event_timestamp_col) + # TODO(kzhang132): need utc conversion here. 
entity_df_event_timestamp_range = ( df.agg({entity_df_event_timestamp_col: "max"}).collect()[0][0], df.agg({entity_df_event_timestamp_col: "min"}).collect()[0][0], @@ -358,35 +342,6 @@ def _format_datetime(t: datetime) -> str: return dt -def _get_feature_view_query_context( - entity_df: Union[pd.DataFrame, str], - entity_df_event_timestamp_col: str, - feature_refs: List[str], - feature_views: List[FeatureView], - spark_session: SparkSession, - table_name: str, - registry: Registry, - project: str, -) -> List[FeatureViewQueryContext]: - # interface of offline_utils.get_feature_view_query_context changed in feast==0.17 - arg_spec = inspect.getfullargspec(func=offline_utils.get_feature_view_query_context) - if "entity_df_timestamp_range" in arg_spec.args: - # for feast>=0.17 - entity_df_timestamp_range = _get_entity_df_event_timestamp_range( - entity_df=entity_df, - entity_df_event_timestamp_col=entity_df_event_timestamp_col, - spark_session=spark_session, - ) - query_context = offline_utils.get_feature_view_query_context( - feature_refs=feature_refs, - feature_views=feature_views, - registry=registry, - project=project, - entity_df_timestamp_range=entity_df_timestamp_range, - ) - return query_context - - MULTIPLE_FEATURE_VIEW_POINT_IN_TIME_JOIN = """/* Compute a deterministic hash for the `left_table_query_string` that will be used throughout all the logic as the field to GROUP BY the data diff --git a/sdk/python/feast/infra/offline_stores/third_party/spark_offline_store/spark_source.py b/sdk/python/feast/infra/offline_stores/third_party/spark_offline_store/spark_source.py index 801be2838a..45abb560f4 100644 --- a/sdk/python/feast/infra/offline_stores/third_party/spark_offline_store/spark_source.py +++ b/sdk/python/feast/infra/offline_stores/third_party/spark_offline_store/spark_source.py @@ -1,3 +1,5 @@ +import logging +import traceback import pickle import warnings from enum import Enum @@ -16,6 +18,7 @@ from feast.type_map import spark_to_feast_value_type from feast.value_type import ValueType +logger = logging.getLogger(__name__) class SparkSourceFormat(Enum): csv = "csv" @@ -64,55 +67,40 @@ def __init__( f"'file_format' should be one of {self.allowed_formats}" ) - self._spark_options = SparkOptions( + self.spark_options = SparkOptions( table=table, query=query, path=path, file_format=file_format, ) - @property - def spark_options(self): - """ - Returns the spark options of this data source - """ - return self._spark_options - - @spark_options.setter - def spark_options(self, spark_options): - """ - Sets the spark options of this data source - """ - self._spark_options = spark_options - @property def table(self): """ Returns the table of this feature data source """ - return self._spark_options.table + return self.spark_options.table @property def query(self): """ Returns the query of this feature data source """ - return self._spark_options.query + return self.spark_options.query @property def path(self): """ Returns the path of the spark data source file. """ - return self._spark_options.path + return self.spark_options.path @property def file_format(self): """ Returns the file format of this feature data source. 
""" - return self._spark_options.file_format + return self.spark_options.file_format @staticmethod def from_proto(data_source: DataSourceProto) -> Any: - assert data_source.HasField("custom_options") spark_options = SparkOptions.from_proto(data_source.custom_options) @@ -166,6 +154,7 @@ def get_table_column_names_and_types( def get_table_query_string(self) -> str: """Returns a string that can directly be used to reference this table in SQL""" if self.table: + # Backticks make sure that spark sql knows this a table reference. return f"`{self.table}`" if self.query: return f"({self.query})" @@ -174,8 +163,11 @@ def get_table_query_string(self) -> str: spark_session = SparkSession.getActiveSession() if spark_session is None: raise AssertionError("Could not find an active spark session.") - df = spark_session.read.format(self.file_format).load(self.path) - + try: + df = spark_session.read.format(self.file_format).load(self.path) + except Exception as e: + logger.log("Spark read of file source failed.") + logger.exception(traceback.format_exc()) tmp_table_name = get_temp_entity_table_name() df.createOrReplaceTempView(tmp_table_name) @@ -190,66 +182,10 @@ def __init__( path: Optional[str] = None, file_format: Optional[str] = None, ): - self._table = table - self._query = query - self._path = path - self._file_format = file_format - - @property - def table(self): - """ - Returns the table - """ - return self._table - - @table.setter - def table(self, table): - """ - Sets the table - """ - self._table = table - - @property - def query(self): - """ - Returns the query - """ - return self._query - - @query.setter - def query(self, query): - """ - Sets the query - """ - self._query = query - - @property - def path(self): - """ - Returns the path - """ - return self._path - - @path.setter - def path(self, path): - """ - Sets the path - """ - self._path = path - - @property - def file_format(self): - """ - Returns the file_format - """ - return self._file_format - - @file_format.setter - def file_format(self, file_format): - """ - Sets the file_format - """ - self._file_format = file_format + self.table = table + self.query = query + self.path = path + self.file_format = file_format @classmethod def from_proto(cls, spark_options_proto: DataSourceProto.CustomSourceOptions): @@ -294,10 +230,7 @@ def __init__(self, table_ref: Optional[str] = None, query: Optional[str] = None) @staticmethod def from_proto(storage_proto: SavedDatasetStorageProto) -> SavedDatasetStorage: - # options = SparkOptions.from_proto( - # storage_proto - # ) - # spark_options = SparkOptions(table=options.table, query=options.query) + # TODO: implementation is not correct. Needs fix and update to protos. 
return SavedDatasetSparkStorage(table_ref="", query=None) def to_proto(self) -> SavedDatasetStorageProto: diff --git a/sdk/python/tests/integration/feature_repos/repo_configuration.py b/sdk/python/tests/integration/feature_repos/repo_configuration.py index ed342de47c..89aea727a6 100644 --- a/sdk/python/tests/integration/feature_repos/repo_configuration.py +++ b/sdk/python/tests/integration/feature_repos/repo_configuration.py @@ -33,9 +33,6 @@ from tests.integration.feature_repos.universal.data_sources.snowflake import ( SnowflakeDataSourceCreator, ) -from tests.integration.feature_repos.universal.data_sources.spark_data_source_creator import ( - SparkDataSourceCreator, -) from tests.integration.feature_repos.universal.feature_views import ( conv_rate_plus_100_feature_view, create_conv_rate_request_data_source, @@ -55,7 +52,7 @@ "type": "redis", "redis_type": "redis_cluster", # Redis Cluster Port Forwarding is setup in "pr_integration_tests.yaml" under "Setup Redis Cluster". - "connection_string": "127.0.0.1:32001,127.0.0.1:32002,127.0.0.1:32003", + "connection_string": "127.0.0.1:6001,127.0.0.1:6002,127.0.0.1:6003", } # FULL_REPO_CONFIGS contains the repo configurations (e.g. provider, offline store, @@ -77,9 +74,6 @@ IntegrationTestRepoConfig(online_store=REDIS_CONFIG), IntegrationTestRepoConfig(online_store=REDIS_CLUSTER_CONFIG), # GCP configurations - IntegrationTestRepoConfig( - provider="local", offline_store_creator=SparkDataSourceCreator, - ), IntegrationTestRepoConfig( provider="gcp", offline_store_creator=BigQueryDataSourceCreator, diff --git a/sdk/python/tests/integration/feature_repos/universal/data_sources/spark_data_source_creator.py b/sdk/python/tests/integration/feature_repos/universal/data_sources/spark_data_source_creator.py index 2bdaeb960e..9d27e5dd56 100644 --- a/sdk/python/tests/integration/feature_repos/universal/data_sources/spark_data_source_creator.py +++ b/sdk/python/tests/integration/feature_repos/universal/data_sources/spark_data_source_creator.py @@ -55,7 +55,6 @@ def create_offline_store_config(self): self.spark_offline_store_config.spark_conf = self.spark_conf return self.spark_offline_store_config - # abstract def create_data_source( self, df: pd.DataFrame, From c81fe319aa4fd236d61e9c947887c76837b3e158 Mon Sep 17 00:00:00 2001 From: Kevin Zhang Date: Thu, 3 Mar 2022 10:51:01 -0800 Subject: [PATCH 40/58] Fix tests accidentally broken Signed-off-by: Kevin Zhang --- .../third_party/spark_offline_store/spark.py | 22 ++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/sdk/python/feast/infra/offline_stores/third_party/spark_offline_store/spark.py b/sdk/python/feast/infra/offline_stores/third_party/spark_offline_store/spark.py index 59e54ef807..1386392cab 100644 --- a/sdk/python/feast/infra/offline_stores/third_party/spark_offline_store/spark.py +++ b/sdk/python/feast/infra/offline_stores/third_party/spark_offline_store/spark.py @@ -235,9 +235,17 @@ def __init__( super().__init__() self.spark_session = spark_session self.query = query - self.full_feature_names = full_feature_names - self.on_demand_feature_views = on_demand_feature_views - self.metadata = metadata + self._full_feature_names = full_feature_names + self._on_demand_feature_views = on_demand_feature_views + self._metadata = metadata + + @property + def full_feature_names(self) -> bool: + return self._full_feature_names + + @property + def on_demand_feature_views(self) -> Optional[List[OnDemandFeatureView]]: + return self._on_demand_feature_views def to_spark_df(self) -> 
pyspark.sql.DataFrame: statements = self.query.split( @@ -261,6 +269,14 @@ def persist(self, storage: SavedDatasetStorage): """ pass + @property + def metadata(self) -> Optional[RetrievalMetadata]: + """ + Return metadata information about retrieval. + Should be available even before materializing the dataset itself. + """ + return self._metadata + def get_spark_session_or_start_new_with_repoconfig( store_config: SparkOfflineStoreConfig, From d790a1cee272c1976d56ee8ec7d44d0ec00d48c8 Mon Sep 17 00:00:00 2001 From: Kevin Zhang Date: Thu, 3 Mar 2022 11:10:47 -0800 Subject: [PATCH 41/58] Add comments Signed-off-by: Kevin Zhang --- sdk/python/feast/type_map.py | 4 +++- .../universal/data_sources/spark_data_source_creator.py | 6 ------ 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/sdk/python/feast/type_map.py b/sdk/python/feast/type_map.py index c5af8251fb..0e30d68496 100644 --- a/sdk/python/feast/type_map.py +++ b/sdk/python/feast/type_map.py @@ -592,11 +592,13 @@ def _non_empty_value(value: Any) -> bool: def spark_to_feast_value_type(spark_type_as_str: str) -> ValueType: # TODO not all spark types are convertible + # Current non-convertible types: interval, map, struct, structfield, decimal, binary type_map: Dict[str, ValueType] = { "null": ValueType.UNKNOWN, "byte": ValueType.BYTES, "string": ValueType.STRING, "int": ValueType.INT32, + "short": ValueType.INT32, "bigint": ValueType.INT64, "long": ValueType.INT64, "double": ValueType.DOUBLE, @@ -612,7 +614,7 @@ def spark_to_feast_value_type(spark_type_as_str: str) -> ValueType: "array": ValueType.BOOL_LIST, "array": ValueType.UNIX_TIMESTAMP_LIST, } - # TODO: this is just incorrect fix + # TODO: Find better way of doing this. if type(spark_type_as_str) != str or spark_type_as_str not in type_map: return ValueType.NULL return type_map[spark_type_as_str.lower()] diff --git a/sdk/python/tests/integration/feature_repos/universal/data_sources/spark_data_source_creator.py b/sdk/python/tests/integration/feature_repos/universal/data_sources/spark_data_source_creator.py index 9d27e5dd56..c06cb6a1d3 100644 --- a/sdk/python/tests/integration/feature_repos/universal/data_sources/spark_data_source_creator.py +++ b/sdk/python/tests/integration/feature_repos/universal/data_sources/spark_data_source_creator.py @@ -78,8 +78,6 @@ def create_data_source( col = timestamp_mapping[event_timestamp_column] df[col] = pd.to_datetime(df[col], utc=True) - # https://stackoverflow.com/questions/51871200/analysisexception-it-is-not-allowed-to-add-database-prefix - # destination_name = self.get_prefixed_table_name(destination_name) if not self.spark_session: self.spark_session = ( SparkSession.builder.config( @@ -98,7 +96,6 @@ def create_data_source( event_timestamp_column=event_timestamp_column, created_timestamp_column=created_timestamp_column, date_partition_column="", - # feature_view => datasource accompanied # maps certain column names to other names field_mapping=field_mapping or {"ts_1": "ts"}, ) @@ -106,6 +103,3 @@ def create_data_source( def create_saved_dataset_destination(self) -> SavedDatasetSparkStorage: table = f"persisted_{str(uuid.uuid4()).replace('-', '_')}" return SavedDatasetSparkStorage(table_ref=table, query="") - - def get_prefixed_table_name(self, suffix: str) -> str: - return f"{self.project_name}.{suffix}" From 1408b8fc8a68971861a8c126753ee1a55980064e Mon Sep 17 00:00:00 2001 From: Kevin Zhang Date: Thu, 3 Mar 2022 11:11:17 -0800 Subject: [PATCH 42/58] Reformat Signed-off-by: Kevin Zhang --- 
.../offline_stores/third_party/spark_offline_store/spark.py | 4 +--- .../third_party/spark_offline_store/spark_source.py | 3 ++- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/sdk/python/feast/infra/offline_stores/third_party/spark_offline_store/spark.py b/sdk/python/feast/infra/offline_stores/third_party/spark_offline_store/spark.py index 1386392cab..92aea96306 100644 --- a/sdk/python/feast/infra/offline_stores/third_party/spark_offline_store/spark.py +++ b/sdk/python/feast/infra/offline_stores/third_party/spark_offline_store/spark.py @@ -291,9 +291,7 @@ def get_spark_session_or_start_new_with_repoconfig( ) spark_session = spark_builder.getOrCreate() - spark_session.conf.set( - "spark.sql.parser.quotedRegexColumnNames", "true" - ) + spark_session.conf.set("spark.sql.parser.quotedRegexColumnNames", "true") return spark_session diff --git a/sdk/python/feast/infra/offline_stores/third_party/spark_offline_store/spark_source.py b/sdk/python/feast/infra/offline_stores/third_party/spark_offline_store/spark_source.py index 45abb560f4..1b2aecf4df 100644 --- a/sdk/python/feast/infra/offline_stores/third_party/spark_offline_store/spark_source.py +++ b/sdk/python/feast/infra/offline_stores/third_party/spark_offline_store/spark_source.py @@ -1,6 +1,6 @@ import logging -import traceback import pickle +import traceback import warnings from enum import Enum from typing import Any, Callable, Dict, Iterable, Optional, Tuple @@ -20,6 +20,7 @@ logger = logging.getLogger(__name__) + class SparkSourceFormat(Enum): csv = "csv" json = "json" From bf071b373490197ffbcabcc1ffb5939721d636a8 Mon Sep 17 00:00:00 2001 From: Kevin Zhang Date: Thu, 3 Mar 2022 11:15:31 -0800 Subject: [PATCH 43/58] Fix logger Signed-off-by: Kevin Zhang --- .../third_party/spark_offline_store/spark_source.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sdk/python/feast/infra/offline_stores/third_party/spark_offline_store/spark_source.py b/sdk/python/feast/infra/offline_stores/third_party/spark_offline_store/spark_source.py index 1b2aecf4df..139257ae8d 100644 --- a/sdk/python/feast/infra/offline_stores/third_party/spark_offline_store/spark_source.py +++ b/sdk/python/feast/infra/offline_stores/third_party/spark_offline_store/spark_source.py @@ -167,8 +167,7 @@ def get_table_query_string(self) -> str: try: df = spark_session.read.format(self.file_format).load(self.path) except Exception as e: - logger.log("Spark read of file source failed.") - logger.exception(traceback.format_exc()) + logger.exception("Spark read of file source failed.\n" + traceback.format_exc()) tmp_table_name = get_temp_entity_table_name() df.createOrReplaceTempView(tmp_table_name) From 62a92acb9f91e5912d80d5cc3405dd6b3a085764 Mon Sep 17 00:00:00 2001 From: Kevin Zhang Date: Thu, 3 Mar 2022 11:17:21 -0800 Subject: [PATCH 44/58] Remove unused imports Signed-off-by: Kevin Zhang --- .../third_party/spark_offline_store/spark_source.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/sdk/python/feast/infra/offline_stores/third_party/spark_offline_store/spark_source.py b/sdk/python/feast/infra/offline_stores/third_party/spark_offline_store/spark_source.py index 139257ae8d..b1a3484902 100644 --- a/sdk/python/feast/infra/offline_stores/third_party/spark_offline_store/spark_source.py +++ b/sdk/python/feast/infra/offline_stores/third_party/spark_offline_store/spark_source.py @@ -166,8 +166,10 @@ def get_table_query_string(self) -> str: raise AssertionError("Could not find an active spark session.") try: df = 
spark_session.read.format(self.file_format).load(self.path) - except Exception as e: - logger.exception("Spark read of file source failed.\n" + traceback.format_exc()) + except Exception: + logger.exception( + "Spark read of file source failed.\n" + traceback.format_exc() + ) tmp_table_name = get_temp_entity_table_name() df.createOrReplaceTempView(tmp_table_name) From 3ec6d223f26c85411825ac43be284a380a7101a7 Mon Sep 17 00:00:00 2001 From: Kevin Zhang Date: Thu, 3 Mar 2022 11:18:14 -0800 Subject: [PATCH 45/58] Fix imports Signed-off-by: Kevin Zhang --- .../offline_stores/third_party/spark_offline_store/spark.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/sdk/python/feast/infra/offline_stores/third_party/spark_offline_store/spark.py b/sdk/python/feast/infra/offline_stores/third_party/spark_offline_store/spark.py index 92aea96306..097c24da0c 100644 --- a/sdk/python/feast/infra/offline_stores/third_party/spark_offline_store/spark.py +++ b/sdk/python/feast/infra/offline_stores/third_party/spark_offline_store/spark.py @@ -1,4 +1,3 @@ -import inspect import warnings from datetime import datetime from typing import Dict, List, Optional, Tuple, Union @@ -23,7 +22,6 @@ RetrievalJob, RetrievalMetadata, ) -from feast.infra.offline_stores.offline_utils import FeatureViewQueryContext from feast.infra.offline_stores.third_party.spark_offline_store.spark_source import ( SparkSource, ) From 62ff1852906657097b5a4c0cbdbbdda53a7baad1 Mon Sep 17 00:00:00 2001 From: Danny Chiao Date: Thu, 3 Mar 2022 14:53:35 -0500 Subject: [PATCH 46/58] Fix CI dependencies Signed-off-by: Danny Chiao --- .../requirements/py3.7-ci-requirements.txt | 51 ++++++++++--------- .../requirements/py3.8-ci-requirements.txt | 33 +++++------- .../requirements/py3.9-ci-requirements.txt | 33 +++++------- sdk/python/setup.py | 2 +- 4 files changed, 51 insertions(+), 68 deletions(-) diff --git a/sdk/python/requirements/py3.7-ci-requirements.txt b/sdk/python/requirements/py3.7-ci-requirements.txt index 5132b835e0..149aadf92c 100644 --- a/sdk/python/requirements/py3.7-ci-requirements.txt +++ b/sdk/python/requirements/py3.7-ci-requirements.txt @@ -2,7 +2,7 @@ # This file is autogenerated by pip-compile with python 3.7 # To update, run: # -# pip-compile --extra=ci --output-file=requirements/py3.7-ci-requirements.txt setup.py +# pip-compile --extra=ci --output-file=requirements/py3.7-ci-requirements.txt # absl-py==1.0.0 # via tensorflow-metadata @@ -42,8 +42,6 @@ asn1crypto==1.4.0 # snowflake-connector-python assertpy==1.1 # via feast (setup.py) -asttokens==2.0.5 - # via stack-data async-timeout==4.0.2 # via aiohttp asynctest==0.13.0 @@ -56,7 +54,7 @@ attrs==21.4.0 # pytest avro==1.10.0 # via feast (setup.py) -azure-core==1.22.1 +azure-core==1.23.0 # via # adlfs # azure-identity @@ -71,7 +69,7 @@ babel==2.9.1 # via sphinx backcall==0.2.0 # via ipython -backports.zoneinfo==0.2.1 +backports-zoneinfo==0.2.1 # via # pytz-deprecation-shim # tzlocal @@ -79,11 +77,11 @@ black==19.10b0 # via feast (setup.py) bleach==4.1.0 # via nbconvert -boto3==1.21.8 +boto3==1.21.11 # via # feast (setup.py) # moto -botocore==1.24.8 +botocore==1.24.11 # via # boto3 # moto @@ -166,8 +164,6 @@ entrypoints==0.4 # nbconvert execnet==1.9.0 # via pytest-xdist -executing==0.8.3 - # via stack-data fastapi==0.74.1 # via feast (setup.py) fastavro==1.4.9 @@ -216,7 +212,7 @@ google-auth-httplib2==0.1.0 # via google-api-python-client google-auth-oauthlib==0.5.0 # via gcsfs -google-cloud-bigquery==2.34.0 +google-cloud-bigquery==2.34.1 # via feast (setup.py) 
google-cloud-bigquery-storage==2.12.0 # via feast (setup.py) @@ -227,7 +223,7 @@ google-cloud-core==1.7.2 # google-cloud-datastore # google-cloud-firestore # google-cloud-storage -google-cloud-datastore==2.4.0 +google-cloud-datastore==2.5.0 # via feast (setup.py) google-cloud-firestore==2.3.4 # via firebase-admin @@ -294,7 +290,6 @@ importlib-metadata==4.2.0 # pluggy # pre-commit # pytest - # redis # virtualenv importlib-resources==5.4.0 # via jsonschema @@ -359,13 +354,9 @@ jupyterlab-pygments==0.1.2 # via nbconvert jupyterlab-widgets==1.0.2 # via ipywidgets -libcst==0.4.1 - # via - # google-cloud-bigquery-storage - # google-cloud-datastore locket==0.2.1 # via partd -markupsafe==2.0.1 +markupsafe==2.1.0 # via # jinja2 # moto @@ -515,8 +506,6 @@ ptyprocess==0.7.0 # via # pexpect # terminado -pure-eval==0.2.2 - # via stack-data py==1.11.0 # via # pytest @@ -669,14 +658,13 @@ ruamel-yaml-clib==0.2.6 # via ruamel-yaml s3transfer==0.5.2 # via boto3 -scipy==1.8.0 +scipy==1.7.3 # via great-expectations send2trash==1.8.0 # via notebook six==1.16.0 # via # absl-py - # asttokens # azure-core # azure-identity # bleach @@ -718,8 +706,6 @@ sphinxcontrib-qthelp==1.0.3 # via sphinx sphinxcontrib-serializinghtml==1.1.5 # via sphinx -stack-data==0.2.0 - # via ipython starlette==0.17.1 # via fastapi tabulate==0.8.9 @@ -730,7 +716,7 @@ tensorflow-metadata==1.7.0 # via feast (setup.py) termcolor==1.1.0 # via great-expectations -terminado==0.13.1 +terminado==0.13.2 # via notebook testcontainers==3.4.2 # via feast (setup.py) @@ -775,7 +761,9 @@ traitlets==5.1.1 # nbformat # notebook typed-ast==1.5.2 - # via black + # via + # black + # mypy types-protobuf==3.19.12 # via # feast (setup.py) @@ -798,9 +786,21 @@ types-urllib3==1.26.10 # via types-requests typing-extensions==4.1.1 # via + # aiohttp + # anyio + # argon2-cffi + # asgiref + # async-timeout + # azure-core # great-expectations + # h11 + # importlib-metadata + # jsonschema # mypy # pydantic + # starlette + # uvicorn + # yarl tzdata==2021.5 # via pytz-deprecation-shim tzlocal==4.1 @@ -846,6 +846,7 @@ zipp==3.7.0 # via # importlib-metadata # importlib-resources + # pep517 # The following packages are considered to be unsafe in a requirements file: # pip diff --git a/sdk/python/requirements/py3.8-ci-requirements.txt b/sdk/python/requirements/py3.8-ci-requirements.txt index f0f530d8fa..7829d22c6f 100644 --- a/sdk/python/requirements/py3.8-ci-requirements.txt +++ b/sdk/python/requirements/py3.8-ci-requirements.txt @@ -2,7 +2,7 @@ # This file is autogenerated by pip-compile with python 3.8 # To update, run: # -# pip-compile --extra=ci --output-file=requirements/py3.8-ci-requirements.txt setup.py +# pip-compile --extra=ci --output-file=requirements/py3.8-ci-requirements.txt # absl-py==1.0.0 # via tensorflow-metadata @@ -54,7 +54,7 @@ attrs==21.4.0 # pytest avro==1.10.0 # via feast (setup.py) -azure-core==1.22.1 +azure-core==1.23.0 # via # adlfs # azure-identity @@ -69,7 +69,7 @@ babel==2.9.1 # via sphinx backcall==0.2.0 # via ipython -backports-zoneinfo==0.2.1 +backports.zoneinfo==0.2.1 # via # pytz-deprecation-shim # tzlocal @@ -281,7 +281,7 @@ idna==3.3 # yarl imagesize==1.3.0 # via sphinx -importlib-metadata==4.2.0 +importlib-metadata==4.11.2 # via great-expectations importlib-resources==5.4.0 # via jsonschema @@ -291,7 +291,7 @@ ipykernel==6.9.1 # via # ipywidgets # notebook -ipython==8.1.0 +ipython==8.1.1 # via # ipykernel # ipywidgets @@ -346,13 +346,9 @@ jupyterlab-pygments==0.1.2 # via nbconvert jupyterlab-widgets==1.0.2 # via ipywidgets -libcst==0.4.1 
- # via - # google-cloud-bigquery-storage - # google-cloud-datastore locket==0.2.1 # via partd -markupsafe==2.0.1 +markupsafe==2.1.0 # via # jinja2 # moto @@ -650,10 +646,10 @@ responses==0.18.0 # via moto rsa==4.8 # via google-auth -ruamel-yaml==0.17.17 +ruamel.yaml==0.17.17 # via great-expectations -ruamel-yaml-clib==0.2.6 - # via ruamel-yaml +ruamel.yaml.clib==0.2.6 + # via ruamel.yaml s3transfer==0.5.2 # via boto3 scipy==1.8.0 @@ -717,7 +713,7 @@ tensorflow-metadata==1.7.0 # via feast (setup.py) termcolor==1.1.0 # via great-expectations -terminado==0.13.1 +terminado==0.13.2 # via notebook testcontainers==3.4.2 # via feast (setup.py) @@ -763,9 +759,7 @@ traitlets==5.1.1 # notebook typed-ast==1.5.2 # via black -types-futures==3.3.8 - # via types-protobuf -types-protobuf==3.19.7 +types-protobuf==3.19.12 # via # feast (setup.py) # mypy-protobuf @@ -787,13 +781,10 @@ types-urllib3==1.26.10 # via types-requests typing-extensions==4.1.1 # via + # azure-core # great-expectations - # libcst # mypy # pydantic - # typing-inspect -typing-inspect==0.7.1 - # via libcst tzdata==2021.5 # via pytz-deprecation-shim tzlocal==4.1 diff --git a/sdk/python/requirements/py3.9-ci-requirements.txt b/sdk/python/requirements/py3.9-ci-requirements.txt index ec6f876c72..0e7d6defeb 100644 --- a/sdk/python/requirements/py3.9-ci-requirements.txt +++ b/sdk/python/requirements/py3.9-ci-requirements.txt @@ -2,7 +2,7 @@ # This file is autogenerated by pip-compile with python 3.9 # To update, run: # -# pip-compile --extra=ci --output-file=requirements/py3.9-ci-requirements.txt setup.py +# pip-compile --extra=ci --output-file=requirements/py3.9-ci-requirements.txt # absl-py==1.0.0 # via tensorflow-metadata @@ -54,7 +54,7 @@ attrs==21.4.0 # pytest avro==1.10.0 # via feast (setup.py) -azure-core==1.22.1 +azure-core==1.23.0 # via # adlfs # azure-identity @@ -277,7 +277,7 @@ idna==3.3 # yarl imagesize==1.3.0 # via sphinx -importlib-metadata==4.2.0 +importlib-metadata==4.11.2 # via great-expectations iniconfig==1.1.1 # via pytest @@ -285,7 +285,7 @@ ipykernel==6.9.1 # via # ipywidgets # notebook -ipython==8.1.0 +ipython==8.1.1 # via # ipykernel # ipywidgets @@ -340,13 +340,9 @@ jupyterlab-pygments==0.1.2 # via nbconvert jupyterlab-widgets==1.0.2 # via ipywidgets -libcst==0.4.1 - # via - # google-cloud-bigquery-storage - # google-cloud-datastore locket==0.2.1 # via partd -markupsafe==2.0.1 +markupsafe==2.1.0 # via # jinja2 # moto @@ -644,11 +640,11 @@ responses==0.18.0 # via moto rsa==4.8 # via google-auth -ruamel-yaml==0.17.17 +ruamel.yaml==0.17.17 # via great-expectations -ruamel-yaml-clib==0.2.6 - # via ruamel-yaml -s3transfer==0.5.0 +ruamel.yaml.clib==0.2.6 + # via ruamel.yaml +s3transfer==0.5.2 # via boto3 scipy==1.8.0 # via great-expectations @@ -711,7 +707,7 @@ tensorflow-metadata==1.7.0 # via feast (setup.py) termcolor==1.1.0 # via great-expectations -terminado==0.13.1 +terminado==0.13.2 # via notebook testcontainers==3.4.2 # via feast (setup.py) @@ -757,9 +753,7 @@ traitlets==5.1.1 # notebook typed-ast==1.5.2 # via black -types-futures==3.3.8 - # via types-protobuf -types-protobuf==3.19.7 +types-protobuf==3.19.12 # via # feast (setup.py) # mypy-protobuf @@ -781,13 +775,10 @@ types-urllib3==1.26.10 # via types-requests typing-extensions==4.1.1 # via + # azure-core # great-expectations - # libcst # mypy # pydantic - # typing-inspect -typing-inspect==0.7.1 - # via libcst tzdata==2021.5 # via pytz-deprecation-shim tzlocal==4.1 diff --git a/sdk/python/setup.py b/sdk/python/setup.py index 7666b2b535..f95dd2b806 100644 --- 
a/sdk/python/setup.py +++ b/sdk/python/setup.py @@ -93,7 +93,7 @@ ] SPARK_REQUIRED = [ - "pyspark==3.2.1", + "pyspark>=3.0.0", ] GE_REQUIRED = [ From 0dbc4e707448c66d980bdd314f9d6450a4a275da Mon Sep 17 00:00:00 2001 From: Kevin Zhang Date: Thu, 3 Mar 2022 12:02:56 -0800 Subject: [PATCH 47/58] Prefix destinations with project name Signed-off-by: Kevin Zhang --- .../universal/data_sources/spark_data_source_creator.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/sdk/python/tests/integration/feature_repos/universal/data_sources/spark_data_source_creator.py b/sdk/python/tests/integration/feature_repos/universal/data_sources/spark_data_source_creator.py index c06cb6a1d3..13eaa30116 100644 --- a/sdk/python/tests/integration/feature_repos/universal/data_sources/spark_data_source_creator.py +++ b/sdk/python/tests/integration/feature_repos/universal/data_sources/spark_data_source_creator.py @@ -77,7 +77,7 @@ def create_data_source( ): col = timestamp_mapping[event_timestamp_column] df[col] = pd.to_datetime(df[col], utc=True) - + destination_name = self.get_prefixed_table_name(destination_name) if not self.spark_session: self.spark_session = ( SparkSession.builder.config( @@ -103,3 +103,6 @@ def create_data_source( def create_saved_dataset_destination(self) -> SavedDatasetSparkStorage: table = f"persisted_{str(uuid.uuid4()).replace('-', '_')}" return SavedDatasetSparkStorage(table_ref=table, query="") + + def get_prefixed_table_name(self, suffix: str) -> str: + return f"{self.project_name}_{suffix}" From 40cb4f8fb6ac08f2a8436a34696ddf6b5561d806 Mon Sep 17 00:00:00 2001 From: Kevin Zhang Date: Thu, 3 Mar 2022 12:05:59 -0800 Subject: [PATCH 48/58] Update comment Signed-off-by: Kevin Zhang --- sdk/python/feast/type_map.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdk/python/feast/type_map.py b/sdk/python/feast/type_map.py index 0e30d68496..b1d3dfc706 100644 --- a/sdk/python/feast/type_map.py +++ b/sdk/python/feast/type_map.py @@ -329,7 +329,7 @@ def _python_value_to_proto_value( for ts in int_timestamps_lists ] if feast_value_type == ValueType.BOOL_LIST: - # ProtoValue no longer supports conversion of np.bool_ so we need it convert it. + # ProtoValue does not support conversion of np.bool_ so we need to convert it to support np.bool_. 
return [ ProtoValue(**{field_name: proto_type(val=[bool(e) for e in value])}) # type: ignore if value is not None From 4f5359aaf505631724d35cb150634d5d3a39e2fa Mon Sep 17 00:00:00 2001 From: Kevin Zhang Date: Thu, 3 Mar 2022 12:52:59 -0800 Subject: [PATCH 49/58] Fix 3.8 Signed-off-by: Kevin Zhang --- sdk/python/requirements/py3.8-ci-requirements.txt | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/sdk/python/requirements/py3.8-ci-requirements.txt b/sdk/python/requirements/py3.8-ci-requirements.txt index 7829d22c6f..a146335e1a 100644 --- a/sdk/python/requirements/py3.8-ci-requirements.txt +++ b/sdk/python/requirements/py3.8-ci-requirements.txt @@ -69,7 +69,7 @@ babel==2.9.1 # via sphinx backcall==0.2.0 # via ipython -backports.zoneinfo==0.2.1 +backports-zoneinfo==0.2.1 # via # pytz-deprecation-shim # tzlocal @@ -77,11 +77,11 @@ black==19.10b0 # via feast (setup.py) bleach==4.1.0 # via nbconvert -boto3==1.21.11 +boto3==1.21.12 # via # feast (setup.py) # moto -botocore==1.24.11 +botocore==1.24.12 # via # boto3 # moto @@ -646,10 +646,10 @@ responses==0.18.0 # via moto rsa==4.8 # via google-auth -ruamel.yaml==0.17.17 +ruamel-yaml==0.17.17 # via great-expectations -ruamel.yaml.clib==0.2.6 - # via ruamel.yaml +ruamel-yaml-clib==0.2.6 + # via ruamel-yaml s3transfer==0.5.2 # via boto3 scipy==1.8.0 From 513d5bc72d2096cb47806dbaaab0d87a0c518325 Mon Sep 17 00:00:00 2001 From: Kevin Zhang Date: Thu, 3 Mar 2022 15:54:36 -0800 Subject: [PATCH 50/58] temporary fix Signed-off-by: Kevin Zhang --- sdk/python/feast/type_map.py | 44 ++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/sdk/python/feast/type_map.py b/sdk/python/feast/type_map.py index b1d3dfc706..90b48fe14c 100644 --- a/sdk/python/feast/type_map.py +++ b/sdk/python/feast/type_map.py @@ -328,14 +328,14 @@ def _python_value_to_proto_value( ProtoValue(unix_timestamp_list_val=Int64List(val=ts)) # type: ignore for ts in int_timestamps_lists ] - if feast_value_type == ValueType.BOOL_LIST: - # ProtoValue does not support conversion of np.bool_ so we need to convert it to support np.bool_. - return [ - ProtoValue(**{field_name: proto_type(val=[bool(e) for e in value])}) # type: ignore - if value is not None - else ProtoValue() - for value in values - ] + # if feast_value_type == ValueType.BOOL_LIST: + # # ProtoValue does not support conversion of np.bool_ so we need to convert it to support np.bool_. + # return [ + # ProtoValue(**{field_name: proto_type(val=[bool(e) for e in value])}) # type: ignore + # if value is not None + # else ProtoValue() + # for value in values + # ] return [ ProtoValue(**{field_name: proto_type(val=value)}) # type: ignore if value is not None @@ -361,20 +361,20 @@ def _python_value_to_proto_value( ) = PYTHON_SCALAR_VALUE_TYPE_TO_PROTO_VALUE[feast_value_type] if valid_scalar_types: assert type(sample) in valid_scalar_types - if feast_value_type == ValueType.BOOL: - # ProtoValue no longer supports conversion of np.bool_ so we need it convert it. - return [ - ProtoValue( - **{ - field_name: func( - bool(value) if type(value) is np.bool_ else value # type: ignore - ) - } - ) - if not pd.isnull(value) - else ProtoValue() - for value in values - ] + # if feast_value_type == ValueType.BOOL: + # # ProtoValue no longer supports conversion of np.bool_ so we need it convert it. 
+ # return [ + # ProtoValue( + # **{ + # field_name: func( + # bool(value) if type(value) is np.bool_ else value # type: ignore + # ) + # } + # ) + # if not pd.isnull(value) + # else ProtoValue() + # for value in values + # ] if feast_value_type in PYTHON_SCALAR_VALUE_TYPE_TO_PROTO_VALUE: return [ ProtoValue(**{field_name: func(value)}) From 880588466126cc4161c1d00b9e1f64139cf95b41 Mon Sep 17 00:00:00 2001 From: Kevin Zhang Date: Thu, 3 Mar 2022 16:30:45 -0800 Subject: [PATCH 51/58] rollback Signed-off-by: Kevin Zhang --- sdk/python/feast/type_map.py | 44 ++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/sdk/python/feast/type_map.py b/sdk/python/feast/type_map.py index 90b48fe14c..b1d3dfc706 100644 --- a/sdk/python/feast/type_map.py +++ b/sdk/python/feast/type_map.py @@ -328,14 +328,14 @@ def _python_value_to_proto_value( ProtoValue(unix_timestamp_list_val=Int64List(val=ts)) # type: ignore for ts in int_timestamps_lists ] - # if feast_value_type == ValueType.BOOL_LIST: - # # ProtoValue does not support conversion of np.bool_ so we need to convert it to support np.bool_. - # return [ - # ProtoValue(**{field_name: proto_type(val=[bool(e) for e in value])}) # type: ignore - # if value is not None - # else ProtoValue() - # for value in values - # ] + if feast_value_type == ValueType.BOOL_LIST: + # ProtoValue does not support conversion of np.bool_ so we need to convert it to support np.bool_. + return [ + ProtoValue(**{field_name: proto_type(val=[bool(e) for e in value])}) # type: ignore + if value is not None + else ProtoValue() + for value in values + ] return [ ProtoValue(**{field_name: proto_type(val=value)}) # type: ignore if value is not None @@ -361,20 +361,20 @@ def _python_value_to_proto_value( ) = PYTHON_SCALAR_VALUE_TYPE_TO_PROTO_VALUE[feast_value_type] if valid_scalar_types: assert type(sample) in valid_scalar_types - # if feast_value_type == ValueType.BOOL: - # # ProtoValue no longer supports conversion of np.bool_ so we need it convert it. - # return [ - # ProtoValue( - # **{ - # field_name: func( - # bool(value) if type(value) is np.bool_ else value # type: ignore - # ) - # } - # ) - # if not pd.isnull(value) - # else ProtoValue() - # for value in values - # ] + if feast_value_type == ValueType.BOOL: + # ProtoValue no longer supports conversion of np.bool_ so we need it convert it. 
+ return [ + ProtoValue( + **{ + field_name: func( + bool(value) if type(value) is np.bool_ else value # type: ignore + ) + } + ) + if not pd.isnull(value) + else ProtoValue() + for value in values + ] if feast_value_type in PYTHON_SCALAR_VALUE_TYPE_TO_PROTO_VALUE: return [ ProtoValue(**{field_name: func(value)}) From 3ac3b715d5ed0f97f134e2fd21c0518e29a967ab Mon Sep 17 00:00:00 2001 From: Kevin Zhang Date: Thu, 3 Mar 2022 16:54:08 -0800 Subject: [PATCH 52/58] update Signed-off-by: Kevin Zhang --- sdk/python/requirements/py3.7-ci-requirements.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sdk/python/requirements/py3.7-ci-requirements.txt b/sdk/python/requirements/py3.7-ci-requirements.txt index 149aadf92c..ab2457debf 100644 --- a/sdk/python/requirements/py3.7-ci-requirements.txt +++ b/sdk/python/requirements/py3.7-ci-requirements.txt @@ -77,11 +77,11 @@ black==19.10b0 # via feast (setup.py) bleach==4.1.0 # via nbconvert -boto3==1.21.11 +boto3==1.21.8 # via # feast (setup.py) # moto -botocore==1.24.11 +botocore==1.24.8 # via # boto3 # moto From cfbaef5770a503c44e85da479297369b49793ad9 Mon Sep 17 00:00:00 2001 From: Kevin Zhang Date: Thu, 3 Mar 2022 17:11:21 -0800 Subject: [PATCH 53/58] Update ci? Signed-off-by: Kevin Zhang --- .../requirements/py3.7-ci-requirements.txt | 82 +- .../requirements/py3.8-requirements.txt | 736 +++++++++++++++++- 2 files changed, 745 insertions(+), 73 deletions(-) diff --git a/sdk/python/requirements/py3.7-ci-requirements.txt b/sdk/python/requirements/py3.7-ci-requirements.txt index ab2457debf..93ab57002b 100644 --- a/sdk/python/requirements/py3.7-ci-requirements.txt +++ b/sdk/python/requirements/py3.7-ci-requirements.txt @@ -2,7 +2,7 @@ # This file is autogenerated by pip-compile with python 3.7 # To update, run: # -# pip-compile --extra=ci --output-file=requirements/py3.7-ci-requirements.txt +# pip-compile --extra=ci --output-file=requirements/py3.7-ci-requirements.txt setup.py # absl-py==1.0.0 # via tensorflow-metadata @@ -44,8 +44,6 @@ assertpy==1.1 # via feast (setup.py) async-timeout==4.0.2 # via aiohttp -asynctest==0.13.0 - # via aiohttp attrs==21.4.0 # via # aiohttp @@ -54,14 +52,14 @@ attrs==21.4.0 # pytest avro==1.10.0 # via feast (setup.py) -azure-core==1.23.0 +azure-core==1.22.1 # via # adlfs # azure-identity # azure-storage-blob azure-datalake-store==0.0.52 # via adlfs -azure-identity==1.8.0 +azure-identity==1.7.1 # via adlfs azure-storage-blob==12.9.0 # via adlfs @@ -69,7 +67,7 @@ babel==2.9.1 # via sphinx backcall==0.2.0 # via ipython -backports-zoneinfo==0.2.1 +backports.zoneinfo==0.2.1 # via # pytz-deprecation-shim # tzlocal @@ -197,7 +195,7 @@ google-api-core[grpc]==1.31.5 # google-cloud-core # google-cloud-datastore # google-cloud-firestore -google-api-python-client==2.39.0 +google-api-python-client==2.38.0 # via firebase-admin google-auth==1.35.0 # via @@ -212,7 +210,7 @@ google-auth-httplib2==0.1.0 # via google-api-python-client google-auth-oauthlib==0.5.0 # via gcsfs -google-cloud-bigquery==2.34.1 +google-cloud-bigquery==2.34.0 # via feast (setup.py) google-cloud-bigquery-storage==2.12.0 # via feast (setup.py) @@ -223,7 +221,7 @@ google-cloud-core==1.7.2 # google-cloud-datastore # google-cloud-firestore # google-cloud-storage -google-cloud-datastore==2.5.0 +google-cloud-datastore==2.4.0 # via feast (setup.py) google-cloud-firestore==2.3.4 # via firebase-admin @@ -280,17 +278,7 @@ idna==3.3 imagesize==1.3.0 # via sphinx importlib-metadata==4.2.0 - # via - # click - # flake8 - # great-expectations - # jsonschema - # 
moto - # pep517 - # pluggy - # pre-commit - # pytest - # virtualenv + # via great-expectations importlib-resources==5.4.0 # via jsonschema iniconfig==1.1.1 @@ -354,9 +342,13 @@ jupyterlab-pygments==0.1.2 # via nbconvert jupyterlab-widgets==1.0.2 # via ipywidgets +libcst==0.4.1 + # via + # google-cloud-bigquery-storage + # google-cloud-datastore locket==0.2.1 # via partd -markupsafe==2.1.0 +markupsafe==2.0.1 # via # jinja2 # moto @@ -399,7 +391,9 @@ multidict==6.0.2 mypy==0.931 # via feast (setup.py) mypy-extensions==0.4.3 - # via mypy + # via + # mypy + # typing-inspect mypy-protobuf==3.1.0 # via feast (setup.py) nbclient==0.5.11 @@ -610,6 +604,7 @@ pyyaml==6.0 # via # dask # feast (setup.py) + # libcst # pre-commit # uvicorn pyzmq==22.3.0 @@ -622,7 +617,7 @@ redis==3.5.3 # redis-py-cluster redis-py-cluster==2.1.3 # via feast (setup.py) -regex==2022.3.2 +regex==2022.1.18 # via black requests==2.27.1 # via @@ -652,10 +647,10 @@ responses==0.18.0 # via moto rsa==4.8 # via google-auth -ruamel-yaml==0.17.17 +ruamel.yaml==0.17.17 # via great-expectations -ruamel-yaml-clib==0.2.6 - # via ruamel-yaml +ruamel.yaml.clib==0.2.6 + # via ruamel.yaml s3transfer==0.5.2 # via boto3 scipy==1.7.3 @@ -712,11 +707,11 @@ tabulate==0.8.9 # via feast (setup.py) tenacity==8.0.1 # via feast (setup.py) -tensorflow-metadata==1.7.0 +tensorflow-metadata==1.6.0 # via feast (setup.py) termcolor==1.1.0 # via great-expectations -terminado==0.13.2 +terminado==0.13.1 # via notebook testcontainers==3.4.2 # via feast (setup.py) @@ -744,7 +739,7 @@ tornado==6.1 # jupyter-client # notebook # terminado -tqdm==4.63.0 +tqdm==4.62.3 # via # feast (setup.py) # great-expectations @@ -761,10 +756,10 @@ traitlets==5.1.1 # nbformat # notebook typed-ast==1.5.2 - # via - # black - # mypy -types-protobuf==3.19.12 + # via black +types-futures==3.3.8 + # via types-protobuf +types-protobuf==3.19.2 # via # feast (setup.py) # mypy-protobuf @@ -782,25 +777,17 @@ types-setuptools==57.4.9 # via feast (setup.py) types-tabulate==0.8.5 # via feast (setup.py) -types-urllib3==1.26.10 +types-urllib3==1.26.9 # via types-requests typing-extensions==4.1.1 # via - # aiohttp - # anyio - # argon2-cffi - # asgiref - # async-timeout - # azure-core # great-expectations - # h11 - # importlib-metadata - # jsonschema + # libcst # mypy # pydantic - # starlette - # uvicorn - # yarl + # typing-inspect +typing-inspect==0.7.1 + # via libcst tzdata==2021.5 # via pytz-deprecation-shim tzlocal==4.1 @@ -846,8 +833,7 @@ zipp==3.7.0 # via # importlib-metadata # importlib-resources - # pep517 # The following packages are considered to be unsafe in a requirements file: # pip -# setuptools +# setuptools \ No newline at end of file diff --git a/sdk/python/requirements/py3.8-requirements.txt b/sdk/python/requirements/py3.8-requirements.txt index 90b4276013..6b2a9661dd 100644 --- a/sdk/python/requirements/py3.8-requirements.txt +++ b/sdk/python/requirements/py3.8-requirements.txt @@ -2,127 +2,706 @@ # This file is autogenerated by pip-compile with python 3.8 # To update, run: # -# pip-compile --output-file=requirements/py3.8-requirements.txt +# pip-compile --extra=ci --output-file=requirements/py3.8-ci-requirements.txt setup.py # absl-py==1.0.0 # via tensorflow-metadata +adal==1.2.7 + # via + # azure-datalake-store + # msrestazure +adlfs==0.5.9 + # via feast (setup.py) +aiohttp==3.8.1 + # via + # adlfs + # gcsfs +aiosignal==1.2.0 + # via aiohttp +alabaster==0.7.12 + # via sphinx +altair==4.2.0 + # via great-expectations anyio==3.5.0 # via starlette -asgiref==3.4.1 +appdirs==1.4.4 
+ # via black +appnope==0.1.2 + # via + # ipykernel + # ipython +argon2-cffi==21.3.0 + # via notebook +argon2-cffi-bindings==21.2.0 + # via argon2-cffi +asgiref==3.5.0 # via uvicorn +asn1crypto==1.4.0 + # via + # oscrypto + # snowflake-connector-python +assertpy==1.1 + # via feast (setup.py) +async-timeout==4.0.2 + # via aiohttp attrs==21.4.0 - # via jsonschema + # via + # aiohttp + # black + # jsonschema + # pytest +avro==1.10.0 + # via feast (setup.py) +azure-core==1.21.1 + # via + # adlfs + # azure-identity + # azure-storage-blob +azure-datalake-store==0.0.52 + # via adlfs +azure-identity==1.7.1 + # via adlfs +azure-storage-blob==12.9.0 + # via adlfs +babel==2.9.1 + # via sphinx +backcall==0.2.0 + # via ipython +backports.zoneinfo==0.2.1 + # via + # pytz-deprecation-shim + # tzlocal +black==19.10b0 + # via feast (setup.py) +bleach==4.1.0 + # via nbconvert +boto3==1.20.46 + # via + # feast (setup.py) + # moto +botocore==1.23.46 + # via + # boto3 + # moto + # s3transfer +cachecontrol==0.12.10 + # via firebase-admin cachetools==4.2.4 # via google-auth certifi==2021.10.8 - # via requests -charset-normalizer==2.0.10 - # via requests + # via + # minio + # msrest + # requests + # snowflake-connector-python +cffi==1.15.0 + # via + # argon2-cffi-bindings + # azure-datalake-store + # cryptography + # snowflake-connector-python +cfgv==3.3.1 + # via pre-commit +charset-normalizer==2.0.11 + # via + # aiohttp + # requests + # snowflake-connector-python click==8.0.3 # via + # black # feast (setup.py) + # great-expectations + # pip-tools # uvicorn +cloudpickle==2.0.0 + # via dask colorama==0.4.4 # via feast (setup.py) +coverage[toml]==6.3 + # via pytest-cov +cryptography==3.3.2 + # via + # adal + # azure-identity + # azure-storage-blob + # feast (setup.py) + # moto + # msal + # pyjwt + # pyopenssl + # snowflake-connector-python +dask==2022.1.1 + # via feast (setup.py) +debugpy==1.5.1 + # via ipykernel +decorator==5.1.1 + # via + # gcsfs + # ipython +defusedxml==0.7.1 + # via nbconvert +deprecation==2.1.0 + # via testcontainers dill==0.3.4 # via feast (setup.py) -fastapi==0.72.0 +distlib==0.3.4 + # via virtualenv +docker==5.0.3 + # via + # feast (setup.py) + # testcontainers +docutils==0.17.1 + # via + # sphinx + # sphinx-rtd-theme +entrypoints==0.3 + # via + # altair + # jupyter-client + # nbconvert +execnet==1.9.0 + # via pytest-xdist +fastapi==0.73.0 # via feast (setup.py) fastavro==1.4.9 # via # feast (setup.py) # pandavro -google-api-core==2.4.0 +filelock==3.4.2 + # via virtualenv +firebase-admin==4.5.2 + # via feast (setup.py) +flake8==4.0.1 + # via feast (setup.py) +frozenlist==1.3.0 + # via + # aiohttp + # aiosignal +fsspec==2022.1.0 + # via + # adlfs + # dask + # gcsfs +gcsfs==2022.1.0 + # via feast (setup.py) +google-api-core[grpc]==1.31.5 + # via + # feast (setup.py) + # firebase-admin + # google-api-python-client + # google-cloud-bigquery + # google-cloud-bigquery-storage + # google-cloud-core + # google-cloud-datastore + # google-cloud-firestore +google-api-python-client==2.36.0 + # via firebase-admin +google-auth==1.35.0 + # via + # gcsfs + # google-api-core + # google-api-python-client + # google-auth-httplib2 + # google-auth-oauthlib + # google-cloud-core + # google-cloud-storage +google-auth-httplib2==0.1.0 + # via google-api-python-client +google-auth-oauthlib==0.4.6 + # via gcsfs +google-cloud-bigquery==2.32.0 + # via feast (setup.py) +google-cloud-bigquery-storage==2.11.0 + # via feast (setup.py) +google-cloud-core==1.7.2 + # via + # feast (setup.py) + # google-cloud-bigquery + # 
google-cloud-datastore + # google-cloud-firestore + # google-cloud-storage +google-cloud-datastore==2.4.0 # via feast (setup.py) -google-auth==2.3.3 - # via google-api-core +google-cloud-firestore==2.3.4 + # via firebase-admin +google-cloud-storage==1.40.0 + # via + # feast (setup.py) + # firebase-admin + # gcsfs +google-crc32c==1.3.0 + # via google-resumable-media +google-resumable-media==1.3.3 + # via + # google-cloud-bigquery + # google-cloud-storage googleapis-common-protos==1.52.0 # via # feast (setup.py) # google-api-core # tensorflow-metadata +great-expectations==0.14.4 + # via feast (setup.py) grpcio==1.43.0 # via # feast (setup.py) + # google-api-core + # google-cloud-bigquery # grpcio-reflection + # grpcio-testing + # grpcio-tools grpcio-reflection==1.43.0 # via feast (setup.py) +grpcio-testing==1.34.0 + # via feast (setup.py) +grpcio-tools==1.34.0 + # via feast (setup.py) h11==0.13.0 # via uvicorn +hiredis==2.0.0 + # via feast (setup.py) +httplib2==0.20.2 + # via + # google-api-python-client + # google-auth-httplib2 httptools==0.3.0 # via uvicorn +identify==2.4.7 + # via pre-commit idna==3.3 # via # anyio # requests + # snowflake-connector-python + # yarl +imagesize==1.3.0 + # via sphinx +importlib-metadata==4.2.0 + # via great-expectations importlib-resources==5.4.0 # via jsonschema -jinja2==3.0.3 +iniconfig==1.1.1 + # via pytest +ipykernel==6.7.0 + # via + # ipywidgets + # notebook +ipython==7.31.1 + # via + # ipykernel + # ipywidgets +ipython-genutils==0.2.0 + # via + # ipywidgets + # nbformat + # notebook +ipywidgets==7.6.5 + # via great-expectations +isodate==0.6.1 + # via msrest +isort==5.10.1 # via feast (setup.py) +jedi==0.18.1 + # via ipython +jinja2==3.0.3 + # via + # altair + # feast (setup.py) + # great-expectations + # moto + # nbconvert + # notebook + # sphinx +jmespath==0.10.0 + # via + # boto3 + # botocore +jsonpatch==1.32 + # via great-expectations +jsonpointer==2.2 + # via jsonpatch jsonschema==4.4.0 - # via feast (setup.py) + # via + # altair + # feast (setup.py) + # great-expectations + # nbformat +jupyter-client==7.1.2 + # via + # ipykernel + # nbclient + # notebook +jupyter-core==4.9.1 + # via + # jupyter-client + # nbconvert + # nbformat + # notebook +jupyterlab-pygments==0.1.2 + # via nbconvert +jupyterlab-widgets==1.0.2 + # via ipywidgets +libcst==0.4.1 + # via + # google-cloud-bigquery-storage + # google-cloud-datastore +locket==0.2.1 + # via partd markupsafe==2.0.1 - # via jinja2 + # via + # jinja2 + # moto +matplotlib-inline==0.1.3 + # via + # ipykernel + # ipython +mccabe==0.6.1 + # via flake8 +minio==7.1.0 + # via feast (setup.py) +mistune==0.8.4 + # via + # great-expectations + # nbconvert mmh3==3.0.0 # via feast (setup.py) +mock==2.0.0 + # via feast (setup.py) +moto==3.0.2 + # via feast (setup.py) +msal==1.16.0 + # via + # azure-identity + # msal-extensions +msal-extensions==0.3.1 + # via azure-identity +msgpack==1.0.3 + # via cachecontrol +msrest==0.6.21 + # via + # azure-storage-blob + # msrestazure +msrestazure==0.6.4 + # via adlfs +multidict==6.0.2 + # via + # aiohttp + # yarl +mypy==0.931 + # via feast (setup.py) +mypy-extensions==0.4.3 + # via + # mypy + # typing-inspect +mypy-protobuf==3.1.0 + # via feast (setup.py) +nbclient==0.5.10 + # via nbconvert +nbconvert==6.4.1 + # via notebook +nbformat==5.1.3 + # via + # ipywidgets + # nbclient + # nbconvert + # notebook +nest-asyncio==1.5.4 + # via + # ipykernel + # jupyter-client + # nbclient + # notebook +nodeenv==1.6.0 + # via pre-commit +notebook==6.4.8 + # via widgetsnbextension 
numpy==1.21.5 # via + # altair + # great-expectations # pandas # pandavro # pyarrow + # scipy +oauthlib==3.2.0 + # via requests-oauthlib +oscrypto==1.2.1 + # via snowflake-connector-python +packaging==21.3 + # via + # bleach + # dask + # deprecation + # google-api-core + # google-cloud-bigquery + # google-cloud-firestore + # pytest + # sphinx pandas==1.3.5 # via + # altair # feast (setup.py) + # great-expectations # pandavro + # snowflake-connector-python pandavro==1.5.2 # via feast (setup.py) -proto-plus==1.19.6 +pandocfilters==1.5.0 + # via nbconvert +parso==0.8.3 + # via jedi +partd==1.2.0 + # via dask +pathspec==0.9.0 + # via black +pbr==5.8.0 + # via mock +pep517==0.12.0 + # via pip-tools +pexpect==4.8.0 + # via ipython +pickleshare==0.7.5 + # via ipython +pip-tools==6.4.0 # via feast (setup.py) -protobuf==3.19.3 +platformdirs==2.4.1 + # via virtualenv +pluggy==1.0.0 + # via pytest +portalocker==2.3.2 + # via msal-extensions +pre-commit==2.17.0 + # via feast (setup.py) +prometheus-client==0.13.1 + # via notebook +prompt-toolkit==3.0.26 + # via ipython +proto-plus==1.19.6 + # via + # feast (setup.py) + # google-cloud-bigquery + # google-cloud-bigquery-storage + # google-cloud-datastore + # google-cloud-firestore +protobuf==3.19.4 # via # feast (setup.py) # google-api-core + # google-cloud-bigquery # googleapis-common-protos # grpcio-reflection + # grpcio-testing + # grpcio-tools + # mypy-protobuf # proto-plus # tensorflow-metadata +ptyprocess==0.7.0 + # via + # pexpect + # terminado +py==1.11.0 + # via + # pytest + # pytest-forked +py-cpuinfo==8.0.0 + # via pytest-benchmark +py4j==0.10.9.3 + # via pyspark pyarrow==6.0.1 - # via feast (setup.py) + # via + # feast (setup.py) + # snowflake-connector-python pyasn1==0.4.8 # via # pyasn1-modules # rsa pyasn1-modules==0.2.8 # via google-auth +pycodestyle==2.8.0 + # via flake8 +pycparser==2.21 + # via cffi +pycryptodomex==3.14.0 + # via snowflake-connector-python pydantic==1.9.0 # via # fastapi # feast (setup.py) +pyflakes==2.4.0 + # via flake8 +pygments==2.11.2 + # via + # ipython + # jupyterlab-pygments + # nbconvert + # sphinx +pyjwt[crypto]==2.3.0 + # via + # adal + # msal + # snowflake-connector-python +pyopenssl==21.0.0 + # via snowflake-connector-python +pyparsing==2.4.7 + # via + # great-expectations + # httplib2 + # packaging pyrsistent==0.18.1 # via jsonschema +pyspark==3.2.1 + # via feast (setup.py) +pytest==6.2.5 + # via + # feast (setup.py) + # pytest-benchmark + # pytest-cov + # pytest-forked + # pytest-lazy-fixture + # pytest-mock + # pytest-ordering + # pytest-timeout + # pytest-xdist +pytest-benchmark==3.4.1 + # via feast (setup.py) +pytest-cov==3.0.0 + # via feast (setup.py) +pytest-forked==1.4.0 + # via pytest-xdist +pytest-lazy-fixture==0.6.3 + # via feast (setup.py) +pytest-mock==1.10.4 + # via feast (setup.py) +pytest-ordering==0.6 + # via feast (setup.py) +pytest-timeout==1.4.2 + # via feast (setup.py) +pytest-xdist==2.5.0 + # via feast (setup.py) python-dateutil==2.8.2 - # via pandas + # via + # adal + # botocore + # google-cloud-bigquery + # great-expectations + # jupyter-client + # moto + # pandas python-dotenv==0.19.2 # via uvicorn pytz==2021.3 - # via pandas + # via + # babel + # google-api-core + # great-expectations + # moto + # pandas + # snowflake-connector-python +pytz-deprecation-shim==0.1.0.post0 + # via tzlocal pyyaml==6.0 # via + # dask # feast (setup.py) + # libcst + # pre-commit # uvicorn +pyzmq==22.3.0 + # via + # jupyter-client + # notebook +redis==3.5.3 + # via + # feast (setup.py) + # redis-py-cluster 
+redis-py-cluster==2.1.3 + # via feast (setup.py) +regex==2022.1.18 + # via black requests==2.27.1 - # via google-api-core + # via + # adal + # adlfs + # azure-core + # azure-datalake-store + # cachecontrol + # docker + # gcsfs + # google-api-core + # google-cloud-bigquery + # google-cloud-storage + # great-expectations + # moto + # msal + # msrest + # requests-oauthlib + # responses + # snowflake-connector-python + # sphinx +requests-oauthlib==1.3.1 + # via + # google-auth-oauthlib + # msrest +responses==0.17.0 + # via moto rsa==4.8 # via google-auth +ruamel.yaml==0.17.17 + # via great-expectations +ruamel.yaml.clib==0.2.6 + # via ruamel.yaml +s3transfer==0.5.0 + # via boto3 +scipy==1.7.3 + # via great-expectations +send2trash==1.8.0 + # via notebook six==1.16.0 # via # absl-py + # azure-core + # azure-identity + # bleach + # cryptography + # google-api-core # google-auth + # google-auth-httplib2 + # google-cloud-core + # google-resumable-media # grpcio + # isodate + # mock + # msrestazure # pandavro + # pyopenssl # python-dateutil + # responses + # virtualenv sniffio==1.2.0 # via anyio +snowballstemmer==2.2.0 + # via sphinx +snowflake-connector-python[pandas]==2.7.3 + # via feast (setup.py) +sphinx==4.3.2 + # via + # feast (setup.py) + # sphinx-rtd-theme +sphinx-rtd-theme==1.0.0 + # via feast (setup.py) +sphinxcontrib-applehelp==1.0.2 + # via sphinx +sphinxcontrib-devhelp==1.0.2 + # via sphinx +sphinxcontrib-htmlhelp==2.0.0 + # via sphinx +sphinxcontrib-jsmath==1.0.1 + # via sphinx +sphinxcontrib-qthelp==1.0.3 + # via sphinx +sphinxcontrib-serializinghtml==1.1.5 + # via sphinx starlette==0.17.1 # via fastapi tabulate==0.8.9 @@ -131,24 +710,131 @@ tenacity==8.0.1 # via feast (setup.py) tensorflow-metadata==1.6.0 # via feast (setup.py) -toml==0.10.2 +termcolor==1.1.0 + # via great-expectations +terminado==0.13.1 + # via notebook +testcontainers==3.4.2 # via feast (setup.py) +testpath==0.5.0 + # via nbconvert +toml==0.10.2 + # via + # black + # feast (setup.py) + # pre-commit + # pytest +tomli==2.0.0 + # via + # coverage + # mypy + # pep517 +toolz==0.11.2 + # via + # altair + # dask + # partd +tornado==6.1 + # via + # ipykernel + # jupyter-client + # notebook + # terminado tqdm==4.62.3 + # via + # feast (setup.py) + # great-expectations +traitlets==5.1.1 + # via + # ipykernel + # ipython + # ipywidgets + # jupyter-client + # jupyter-core + # matplotlib-inline + # nbclient + # nbconvert + # nbformat + # notebook +typed-ast==1.5.2 + # via black +types-futures==3.3.8 + # via types-protobuf +types-protobuf==3.19.7 + # via + # feast (setup.py) + # mypy-protobuf +types-python-dateutil==2.8.9 + # via feast (setup.py) +types-pytz==2021.3.4 + # via feast (setup.py) +types-pyyaml==6.0.4 + # via feast (setup.py) +types-redis==4.1.13 # via feast (setup.py) +types-requests==2.27.8 + # via feast (setup.py) +types-setuptools==57.4.8 + # via feast (setup.py) +types-tabulate==0.8.5 + # via feast (setup.py) +types-urllib3==1.26.8 + # via types-requests typing-extensions==4.0.1 - # via pydantic + # via + # great-expectations + # libcst + # mypy + # pydantic + # typing-inspect +typing-inspect==0.7.1 + # via libcst +tzdata==2021.5 + # via pytz-deprecation-shim +tzlocal==4.1 + # via great-expectations +uritemplate==4.1.1 + # via google-api-python-client urllib3==1.26.8 - # via requests -uvicorn[standard]==0.17.0 + # via + # botocore + # feast (setup.py) + # minio + # requests + # responses +uvicorn[standard]==0.17.1 # via feast (setup.py) uvloop==0.16.0 # via uvicorn +virtualenv==20.13.0 + # via pre-commit 
watchgod==0.7 # via uvicorn +wcwidth==0.2.5 + # via prompt-toolkit +webencodings==0.5.1 + # via bleach +websocket-client==1.2.3 + # via docker websockets==10.1 # via uvicorn +werkzeug==2.0.2 + # via moto +wheel==0.37.1 + # via pip-tools +widgetsnbextension==3.5.2 + # via ipywidgets +wrapt==1.13.3 + # via testcontainers +xmltodict==0.12.0 + # via moto +yarl==1.7.2 + # via aiohttp zipp==3.7.0 - # via importlib-resources + # via + # importlib-metadata + # importlib-resources # The following packages are considered to be unsafe in a requirements file: -# setuptools +# pip +# setuptools \ No newline at end of file From 6abea5f5c92c6828ee02e2116637343ee912c440 Mon Sep 17 00:00:00 2001 From: Kevin Zhang Date: Fri, 4 Mar 2022 10:53:31 -0800 Subject: [PATCH 54/58] Move third party to contrib Signed-off-by: Kevin Zhang --- .../{third_party => contrib}/spark_offline_store/__init__.py | 0 .../{third_party => contrib}/spark_offline_store/spark.py | 0 .../{third_party => contrib}/spark_offline_store/spark_source.py | 0 3 files changed, 0 insertions(+), 0 deletions(-) rename sdk/python/feast/infra/offline_stores/{third_party => contrib}/spark_offline_store/__init__.py (100%) rename sdk/python/feast/infra/offline_stores/{third_party => contrib}/spark_offline_store/spark.py (100%) rename sdk/python/feast/infra/offline_stores/{third_party => contrib}/spark_offline_store/spark_source.py (100%) diff --git a/sdk/python/feast/infra/offline_stores/third_party/spark_offline_store/__init__.py b/sdk/python/feast/infra/offline_stores/contrib/spark_offline_store/__init__.py similarity index 100% rename from sdk/python/feast/infra/offline_stores/third_party/spark_offline_store/__init__.py rename to sdk/python/feast/infra/offline_stores/contrib/spark_offline_store/__init__.py diff --git a/sdk/python/feast/infra/offline_stores/third_party/spark_offline_store/spark.py b/sdk/python/feast/infra/offline_stores/contrib/spark_offline_store/spark.py similarity index 100% rename from sdk/python/feast/infra/offline_stores/third_party/spark_offline_store/spark.py rename to sdk/python/feast/infra/offline_stores/contrib/spark_offline_store/spark.py diff --git a/sdk/python/feast/infra/offline_stores/third_party/spark_offline_store/spark_source.py b/sdk/python/feast/infra/offline_stores/contrib/spark_offline_store/spark_source.py similarity index 100% rename from sdk/python/feast/infra/offline_stores/third_party/spark_offline_store/spark_source.py rename to sdk/python/feast/infra/offline_stores/contrib/spark_offline_store/spark_source.py From 05aaeb87f4b8ed0bd79dd02679d2695c57a64387 Mon Sep 17 00:00:00 2001 From: Kevin Zhang Date: Fri, 4 Mar 2022 11:18:00 -0800 Subject: [PATCH 55/58] Fix imports Signed-off-by: Kevin Zhang --- sdk/python/feast/__init__.py | 2 +- .../contrib/spark_offline_store/spark.py | 2 +- .../spark_offline_store/spark_source.py | 2 +- .../third_party/spark_offline_store/spark.py | 522 ++++++++++++++++++ .../spark_offline_store/spark_source.py | 242 ++++++++ sdk/python/feast/repo_config.py | 2 +- sdk/python/feast/templates/spark/example.py | 2 +- .../data_sources/spark_data_source_creator.py | 4 +- 8 files changed, 771 insertions(+), 7 deletions(-) create mode 100644 sdk/python/feast/infra/offline_stores/third_party/spark_offline_store/spark.py create mode 100644 sdk/python/feast/infra/offline_stores/third_party/spark_offline_store/spark_source.py diff --git a/sdk/python/feast/__init__.py b/sdk/python/feast/__init__.py index 674cadc2a2..76167cf9f7 100644 --- a/sdk/python/feast/__init__.py +++ 
b/sdk/python/feast/__init__.py @@ -6,7 +6,7 @@ from feast.infra.offline_stores.file_source import FileSource from feast.infra.offline_stores.redshift_source import RedshiftSource from feast.infra.offline_stores.snowflake_source import SnowflakeSource -from feast.infra.offline_stores.third_party.spark_offline_store.spark_source import ( +from feast.infra.offline_stores.contrib.spark_offline_store.spark_source import ( SparkSource, ) diff --git a/sdk/python/feast/infra/offline_stores/contrib/spark_offline_store/spark.py b/sdk/python/feast/infra/offline_stores/contrib/spark_offline_store/spark.py index 097c24da0c..b54987dc0b 100644 --- a/sdk/python/feast/infra/offline_stores/contrib/spark_offline_store/spark.py +++ b/sdk/python/feast/infra/offline_stores/contrib/spark_offline_store/spark.py @@ -22,7 +22,7 @@ RetrievalJob, RetrievalMetadata, ) -from feast.infra.offline_stores.third_party.spark_offline_store.spark_source import ( +from feast.infra.offline_stores.contrib.spark_offline_store.spark_source import ( SparkSource, ) from feast.registry import Registry diff --git a/sdk/python/feast/infra/offline_stores/contrib/spark_offline_store/spark_source.py b/sdk/python/feast/infra/offline_stores/contrib/spark_offline_store/spark_source.py index b1a3484902..50e365a631 100644 --- a/sdk/python/feast/infra/offline_stores/contrib/spark_offline_store/spark_source.py +++ b/sdk/python/feast/infra/offline_stores/contrib/spark_offline_store/spark_source.py @@ -139,7 +139,7 @@ def source_datatype_to_feast_value_type() -> Callable[[str], ValueType]: def get_table_column_names_and_types( self, config: RepoConfig ) -> Iterable[Tuple[str, str]]: - from feast.infra.offline_stores.third_party.spark_offline_store.spark import ( + from feast.infra.offline_stores.contrib.spark_offline_store.spark import ( get_spark_session_or_start_new_with_repoconfig, ) diff --git a/sdk/python/feast/infra/offline_stores/third_party/spark_offline_store/spark.py b/sdk/python/feast/infra/offline_stores/third_party/spark_offline_store/spark.py new file mode 100644 index 0000000000..b54987dc0b --- /dev/null +++ b/sdk/python/feast/infra/offline_stores/third_party/spark_offline_store/spark.py @@ -0,0 +1,522 @@ +import warnings +from datetime import datetime +from typing import Dict, List, Optional, Tuple, Union + +import numpy as np +import pandas +import pandas as pd +import pyarrow +import pyspark +from pydantic import StrictStr +from pyspark import SparkConf +from pyspark.sql import SparkSession +from pytz import utc + +from feast import FeatureView, OnDemandFeatureView +from feast.data_source import DataSource +from feast.errors import InvalidEntityType +from feast.feature_view import DUMMY_ENTITY_ID, DUMMY_ENTITY_VAL +from feast.infra.offline_stores import offline_utils +from feast.infra.offline_stores.offline_store import ( + OfflineStore, + RetrievalJob, + RetrievalMetadata, +) +from feast.infra.offline_stores.contrib.spark_offline_store.spark_source import ( + SparkSource, +) +from feast.registry import Registry +from feast.repo_config import FeastConfigBaseModel, RepoConfig +from feast.saved_dataset import SavedDatasetStorage +from feast.type_map import spark_schema_to_np_dtypes + + +class SparkOfflineStoreConfig(FeastConfigBaseModel): + type: StrictStr = "spark" + """ Offline store type selector""" + + spark_conf: Optional[Dict[str, str]] = None + """ Configuration overlay for the spark session """ + # sparksession is not serializable and we dont want to pass it around as an argument + + +class SparkOfflineStore(OfflineStore): + 
@staticmethod + def pull_latest_from_table_or_query( + config: RepoConfig, + data_source: DataSource, + join_key_columns: List[str], + feature_name_columns: List[str], + event_timestamp_column: str, + created_timestamp_column: Optional[str], + start_date: datetime, + end_date: datetime, + ) -> RetrievalJob: + spark_session = get_spark_session_or_start_new_with_repoconfig( + config.offline_store + ) + assert isinstance(config.offline_store, SparkOfflineStoreConfig) + assert isinstance(data_source, SparkSource) + + warnings.warn( + "The spark offline store is an experimental feature in alpha development. " + "Some functionality may still be unstable so functionality can change in the future.", + RuntimeWarning, + ) + + print("Pulling latest features from spark offline store") + + from_expression = data_source.get_table_query_string() + + partition_by_join_key_string = ", ".join(join_key_columns) + if partition_by_join_key_string != "": + partition_by_join_key_string = ( + "PARTITION BY " + partition_by_join_key_string + ) + timestamps = [event_timestamp_column] + if created_timestamp_column: + timestamps.append(created_timestamp_column) + timestamp_desc_string = " DESC, ".join(timestamps) + " DESC" + field_string = ", ".join(join_key_columns + feature_name_columns + timestamps) + + start_date_str = _format_datetime(start_date) + end_date_str = _format_datetime(end_date) + query = f""" + SELECT + {field_string} + {f", {repr(DUMMY_ENTITY_VAL)} AS {DUMMY_ENTITY_ID}" if not join_key_columns else ""} + FROM ( + SELECT {field_string}, + ROW_NUMBER() OVER({partition_by_join_key_string} ORDER BY {timestamp_desc_string}) AS feast_row_ + FROM {from_expression} t1 + WHERE {event_timestamp_column} BETWEEN TIMESTAMP('{start_date_str}') AND TIMESTAMP('{end_date_str}') + ) t2 + WHERE feast_row_ = 1 + """ + + return SparkRetrievalJob( + spark_session=spark_session, + query=query, + full_feature_names=False, + on_demand_feature_views=None, + ) + + @staticmethod + def get_historical_features( + config: RepoConfig, + feature_views: List[FeatureView], + feature_refs: List[str], + entity_df: Union[pandas.DataFrame, str], + registry: Registry, + project: str, + full_feature_names: bool = False, + ) -> RetrievalJob: + assert isinstance(config.offline_store, SparkOfflineStoreConfig) + warnings.warn( + "The spark offline store is an experimental feature in alpha development. 
" + "Some functionality may still be unstable so functionality can change in the future.", + RuntimeWarning, + ) + spark_session = get_spark_session_or_start_new_with_repoconfig( + store_config=config.offline_store + ) + tmp_entity_df_table_name = offline_utils.get_temp_entity_table_name() + + entity_schema = _upload_entity_df_and_get_entity_schema( + spark_session=spark_session, + table_name=tmp_entity_df_table_name, + entity_df=entity_df, + ) + event_timestamp_col = offline_utils.infer_event_timestamp_from_entity_df( + entity_schema=entity_schema, + ) + entity_df_event_timestamp_range = _get_entity_df_event_timestamp_range( + entity_df, event_timestamp_col, spark_session, + ) + + expected_join_keys = offline_utils.get_expected_join_keys( + project=project, feature_views=feature_views, registry=registry + ) + offline_utils.assert_expected_columns_in_entity_df( + entity_schema=entity_schema, + join_keys=expected_join_keys, + entity_df_event_timestamp_col=event_timestamp_col, + ) + + query_context = offline_utils.get_feature_view_query_context( + feature_refs, + feature_views, + registry, + project, + entity_df_event_timestamp_range, + ) + + query = offline_utils.build_point_in_time_query( + feature_view_query_contexts=query_context, + left_table_query_string=tmp_entity_df_table_name, + entity_df_event_timestamp_col=event_timestamp_col, + entity_df_columns=entity_schema.keys(), + query_template=MULTIPLE_FEATURE_VIEW_POINT_IN_TIME_JOIN, + full_feature_names=full_feature_names, + ) + + return SparkRetrievalJob( + spark_session=spark_session, + query=query, + full_feature_names=full_feature_names, + on_demand_feature_views=OnDemandFeatureView.get_requested_odfvs( + feature_refs, project, registry + ), + metadata=RetrievalMetadata( + features=feature_refs, + keys=list(set(entity_schema.keys()) - {event_timestamp_col}), + min_event_timestamp=entity_df_event_timestamp_range[0], + max_event_timestamp=entity_df_event_timestamp_range[1], + ), + ) + + @staticmethod + def pull_all_from_table_or_query( + config: RepoConfig, + data_source: DataSource, + join_key_columns: List[str], + feature_name_columns: List[str], + event_timestamp_column: str, + start_date: datetime, + end_date: datetime, + ) -> RetrievalJob: + """ + Note that join_key_columns, feature_name_columns, event_timestamp_column, and created_timestamp_column + have all already been mapped to column names of the source table and those column names are the values passed + into this function. + """ + assert isinstance(data_source, SparkSource) + warnings.warn( + "The spark offline store is an experimental feature in alpha development. 
" + "This API is unstable and it could and most probably will be changed in the future.", + RuntimeWarning, + ) + from_expression = data_source.get_table_query_string() + + field_string = ( + '"' + + '", "'.join( + join_key_columns + feature_name_columns + [event_timestamp_column] + ) + + '"' + ) + start_date = start_date.astimezone(tz=utc) + end_date = end_date.astimezone(tz=utc) + + query = f""" + SELECT {field_string} + FROM {from_expression} + WHERE "{event_timestamp_column}" BETWEEN TIMESTAMP '{start_date}' AND TIMESTAMP '{end_date}' + """ + spark_session = get_spark_session_or_start_new_with_repoconfig( + store_config=config.offline_store + ) + return SparkRetrievalJob( + spark_session=spark_session, query=query, full_feature_names=False + ) + + +class SparkRetrievalJob(RetrievalJob): + def __init__( + self, + spark_session: SparkSession, + query: str, + full_feature_names: bool, + on_demand_feature_views: Optional[List[OnDemandFeatureView]] = None, + metadata: Optional[RetrievalMetadata] = None, + ): + super().__init__() + self.spark_session = spark_session + self.query = query + self._full_feature_names = full_feature_names + self._on_demand_feature_views = on_demand_feature_views + self._metadata = metadata + + @property + def full_feature_names(self) -> bool: + return self._full_feature_names + + @property + def on_demand_feature_views(self) -> Optional[List[OnDemandFeatureView]]: + return self._on_demand_feature_views + + def to_spark_df(self) -> pyspark.sql.DataFrame: + statements = self.query.split( + "---EOS---" + ) # TODO can do better than this dirty split + *_, last = map(self.spark_session.sql, statements) + return last + + def _to_df_internal(self) -> pd.DataFrame: + """Return dataset as Pandas DataFrame synchronously""" + return self.to_spark_df().toPandas() + + def _to_arrow_internal(self) -> pyarrow.Table: + """Return dataset as pyarrow Table synchronously""" + df = self.to_df() + return pyarrow.Table.from_pandas(df) # noqa + + def persist(self, storage: SavedDatasetStorage): + """ + Run the retrieval and persist the results in the same offline store used for read. + """ + pass + + @property + def metadata(self) -> Optional[RetrievalMetadata]: + """ + Return metadata information about retrieval. + Should be available even before materializing the dataset itself. 
+ """ + return self._metadata + + +def get_spark_session_or_start_new_with_repoconfig( + store_config: SparkOfflineStoreConfig, +) -> SparkSession: + spark_session = SparkSession.getActiveSession() + if not spark_session: + spark_builder = SparkSession.builder + spark_conf = store_config.spark_conf + if spark_conf: + spark_builder = spark_builder.config( + conf=SparkConf().setAll([(k, v) for k, v in spark_conf.items()]) + ) + + spark_session = spark_builder.getOrCreate() + spark_session.conf.set("spark.sql.parser.quotedRegexColumnNames", "true") + return spark_session + + +def _get_entity_df_event_timestamp_range( + entity_df: Union[pd.DataFrame, str], + entity_df_event_timestamp_col: str, + spark_session: SparkSession, +) -> Tuple[datetime, datetime]: + if isinstance(entity_df, pd.DataFrame): + entity_df_event_timestamp = entity_df.loc[ + :, entity_df_event_timestamp_col + ].infer_objects() + if pd.api.types.is_string_dtype(entity_df_event_timestamp): + entity_df_event_timestamp = pd.to_datetime( + entity_df_event_timestamp, utc=True + ) + entity_df_event_timestamp_range = ( + entity_df_event_timestamp.min().to_pydatetime(), + entity_df_event_timestamp.max().to_pydatetime(), + ) + elif isinstance(entity_df, str): + # If the entity_df is a string (SQL query), determine range + # from table + df = spark_session.sql(entity_df).select(entity_df_event_timestamp_col) + # TODO(kzhang132): need utc conversion here. + entity_df_event_timestamp_range = ( + df.agg({entity_df_event_timestamp_col: "max"}).collect()[0][0], + df.agg({entity_df_event_timestamp_col: "min"}).collect()[0][0], + ) + else: + raise InvalidEntityType(type(entity_df)) + + return entity_df_event_timestamp_range + + +def _upload_entity_df_and_get_entity_schema( + spark_session: SparkSession, + table_name: str, + entity_df: Union[pandas.DataFrame, str], +) -> Dict[str, np.dtype]: + if isinstance(entity_df, pd.DataFrame): + spark_session.createDataFrame(entity_df).createOrReplaceTempView(table_name) + return dict(zip(entity_df.columns, entity_df.dtypes)) + elif isinstance(entity_df, str): + spark_session.sql(entity_df).createOrReplaceTempView(table_name) + limited_entity_df = spark_session.table(table_name) + return dict( + zip( + limited_entity_df.columns, + spark_schema_to_np_dtypes(limited_entity_df.dtypes), + ) + ) + else: + raise InvalidEntityType(type(entity_df)) + + +def _format_datetime(t: datetime) -> str: + # Since Hive does not support timezone, need to transform to utc. 
+ if t.tzinfo: + t = t.astimezone(tz=utc) + dt = t.strftime("%Y-%m-%d %H:%M:%S.%f") + return dt + + +MULTIPLE_FEATURE_VIEW_POINT_IN_TIME_JOIN = """/* + Compute a deterministic hash for the `left_table_query_string` that will be used throughout + all the logic as the field to GROUP BY the data +*/ +CREATE OR REPLACE TEMPORARY VIEW entity_dataframe AS ( + SELECT *, + {{entity_df_event_timestamp_col}} AS entity_timestamp + {% for featureview in featureviews %} + ,CONCAT( + {% for entity in featureview.entities %} + CAST({{entity}} AS STRING), + {% endfor %} + CAST({{entity_df_event_timestamp_col}} AS STRING) + ) AS {{featureview.name}}__entity_row_unique_id + {% endfor %} + FROM {{ left_table_query_string }} +); +---EOS--- +-- Start create temporary table *__base +{% for featureview in featureviews %} +CREATE OR REPLACE TEMPORARY VIEW {{ featureview.name }}__base AS +WITH {{ featureview.name }}__entity_dataframe AS ( + SELECT + {{ featureview.entities | join(', ')}}, + entity_timestamp, + {{featureview.name}}__entity_row_unique_id + FROM entity_dataframe + GROUP BY {{ featureview.entities | join(', ')}}, entity_timestamp, {{featureview.name}}__entity_row_unique_id +), +/* + This query template performs the point-in-time correctness join for a single feature set table + to the provided entity table. + 1. We first join the current feature_view to the entity dataframe that has been passed. + This JOIN has the following logic: + - For each row of the entity dataframe, only keep the rows where the `event_timestamp_column` + is less than the one provided in the entity dataframe + - If there a TTL for the current feature_view, also keep the rows where the `event_timestamp_column` + is higher the the one provided minus the TTL + - For each row, Join on the entity key and retrieve the `entity_row_unique_id` that has been + computed previously + The output of this CTE will contain all the necessary information and already filtered out most + of the data that is not relevant. 
+*/ +{{ featureview.name }}__subquery AS ( + SELECT + {{ featureview.event_timestamp_column }} as event_timestamp, + {{ featureview.created_timestamp_column ~ ' as created_timestamp,' if featureview.created_timestamp_column else '' }} + {{ featureview.entity_selections | join(', ')}}, + {% for feature in featureview.features %} + {{ feature }} as {% if full_feature_names %}{{ featureview.name }}__{{feature}}{% else %}{{ feature }}{% endif %}{% if loop.last %}{% else %}, {% endif %} + {% endfor %} + FROM {{ featureview.table_subquery }} AS subquery + INNER JOIN ( + SELECT MAX(entity_timestamp) as max_entity_timestamp_ + {% if featureview.ttl == 0 %}{% else %} + ,(MIN(entity_timestamp) - interval '{{ featureview.ttl }}' second) as min_entity_timestamp_ + {% endif %} + FROM entity_dataframe + ) AS temp + ON ( + {{ featureview.event_timestamp_column }} <= max_entity_timestamp_ + {% if featureview.ttl == 0 %}{% else %} + AND {{ featureview.event_timestamp_column }} >= min_entity_timestamp_ + {% endif %} + ) +) +SELECT + subquery.*, + entity_dataframe.entity_timestamp, + entity_dataframe.{{featureview.name}}__entity_row_unique_id +FROM {{ featureview.name }}__subquery AS subquery +INNER JOIN ( + SELECT * + {% if featureview.ttl == 0 %}{% else %} + , (entity_timestamp - interval '{{ featureview.ttl }}' second) as ttl_entity_timestamp + {% endif %} + FROM {{ featureview.name }}__entity_dataframe +) AS entity_dataframe +ON ( + subquery.event_timestamp <= entity_dataframe.entity_timestamp + {% if featureview.ttl == 0 %}{% else %} + AND subquery.event_timestamp >= entity_dataframe.ttl_entity_timestamp + {% endif %} + {% for entity in featureview.entities %} + AND subquery.{{ entity }} = entity_dataframe.{{ entity }} + {% endfor %} +); +---EOS--- +{% endfor %} +-- End create temporary table *__base +{% for featureview in featureviews %} +{% if loop.first %}WITH{% endif %} +/* + 2. If the `created_timestamp_column` has been set, we need to + deduplicate the data first. This is done by calculating the + `MAX(created_at_timestamp)` for each event_timestamp. + We then join the data on the next CTE +*/ +{% if featureview.created_timestamp_column %} +{{ featureview.name }}__dedup AS ( + SELECT + {{featureview.name}}__entity_row_unique_id, + event_timestamp, + MAX(created_timestamp) as created_timestamp + FROM {{ featureview.name }}__base + GROUP BY {{featureview.name}}__entity_row_unique_id, event_timestamp +), +{% endif %} +/* + 3. The data has been filtered during the first CTE "*__base" + Thus we only need to compute the latest timestamp of each feature. +*/ +{{ featureview.name }}__latest AS ( + SELECT + base.{{featureview.name}}__entity_row_unique_id, + MAX(base.event_timestamp) AS event_timestamp + {% if featureview.created_timestamp_column %} + ,MAX(base.created_timestamp) AS created_timestamp + {% endif %} + FROM {{ featureview.name }}__base AS base + {% if featureview.created_timestamp_column %} + INNER JOIN {{ featureview.name }}__dedup AS dedup + ON ( + dedup.{{featureview.name}}__entity_row_unique_id=base.{{featureview.name}}__entity_row_unique_id + AND dedup.event_timestamp=base.event_timestamp + AND dedup.created_timestamp=base.created_timestamp + ) + {% endif %} + GROUP BY base.{{featureview.name}}__entity_row_unique_id +), +/* + 4. 
Once we know the latest value of each feature for a given timestamp, + we can join again the data back to the original "base" dataset +*/ +{{ featureview.name }}__cleaned AS ( + SELECT base.* + FROM {{ featureview.name }}__base AS base + INNER JOIN {{ featureview.name }}__latest AS latest + ON ( + base.{{featureview.name}}__entity_row_unique_id=latest.{{featureview.name}}__entity_row_unique_id + AND base.event_timestamp=latest.event_timestamp + {% if featureview.created_timestamp_column %} + AND base.created_timestamp=latest.created_timestamp + {% endif %} + ) +){% if loop.last %}{% else %}, {% endif %} +{% endfor %} +/* + Joins the outputs of multiple time travel joins to a single table. + The entity_dataframe dataset being our source of truth here. + */ +SELECT `(entity_timestamp|{% for featureview in featureviews %}{{featureview.name}}__entity_row_unique_id{% if loop.last %}{% else %}|{% endif %}{% endfor %})?+.+` +FROM entity_dataframe +{% for featureview in featureviews %} +LEFT JOIN ( + SELECT + {{featureview.name}}__entity_row_unique_id + {% for feature in featureview.features %} + ,{% if full_feature_names %}{{ featureview.name }}__{{feature}}{% else %}{{ feature }}{% endif %} + {% endfor %} + FROM {{ featureview.name }}__cleaned +) AS {{ featureview.name }}__joined +ON ( + {{ featureview.name }}__joined.{{featureview.name}}__entity_row_unique_id=entity_dataframe.{{featureview.name}}__entity_row_unique_id +) +{% endfor %}""" diff --git a/sdk/python/feast/infra/offline_stores/third_party/spark_offline_store/spark_source.py b/sdk/python/feast/infra/offline_stores/third_party/spark_offline_store/spark_source.py new file mode 100644 index 0000000000..50e365a631 --- /dev/null +++ b/sdk/python/feast/infra/offline_stores/third_party/spark_offline_store/spark_source.py @@ -0,0 +1,242 @@ +import logging +import pickle +import traceback +import warnings +from enum import Enum +from typing import Any, Callable, Dict, Iterable, Optional, Tuple + +from pyspark.sql import SparkSession + +from feast.data_source import DataSource +from feast.infra.offline_stores.offline_utils import get_temp_entity_table_name +from feast.protos.feast.core.DataSource_pb2 import DataSource as DataSourceProto +from feast.protos.feast.core.SavedDataset_pb2 import ( + SavedDatasetStorage as SavedDatasetStorageProto, +) +from feast.repo_config import RepoConfig +from feast.saved_dataset import SavedDatasetStorage +from feast.type_map import spark_to_feast_value_type +from feast.value_type import ValueType + +logger = logging.getLogger(__name__) + + +class SparkSourceFormat(Enum): + csv = "csv" + json = "json" + parquet = "parquet" + + +class SparkSource(DataSource): + def __init__( + self, + table: Optional[str] = None, + query: Optional[str] = None, + path: Optional[str] = None, + file_format: Optional[str] = None, + event_timestamp_column: Optional[str] = None, + created_timestamp_column: Optional[str] = None, + field_mapping: Optional[Dict[str, str]] = None, + date_partition_column: Optional[str] = None, + ): + super().__init__( + event_timestamp_column, + created_timestamp_column, + field_mapping, + date_partition_column, + ) + warnings.warn( + "The spark data source API is an experimental feature in alpha development. " + "This API is unstable and it could and most probably will be changed in the future.", + RuntimeWarning, + ) + self.allowed_formats = [format.value for format in SparkSourceFormat] + + # Check that only one of the ways to load a spark dataframe can be used. 
+ if sum([(arg is not None) for arg in [table, query, path]]) != 1: + raise ValueError( + "Exactly one of params(table, query, path) must be specified." + ) + + if path is not None: + if file_format is None: + raise ValueError( + "If 'path' is specified, then 'file_format' is required." + ) + if file_format not in self.allowed_formats: + raise ValueError( + f"'file_format' should be one of {self.allowed_formats}" + ) + + self.spark_options = SparkOptions( + table=table, query=query, path=path, file_format=file_format, + ) + + @property + def table(self): + """ + Returns the table of this feature data source + """ + return self.spark_options.table + + @property + def query(self): + """ + Returns the query of this feature data source + """ + return self.spark_options.query + + @property + def path(self): + """ + Returns the path of the spark data source file. + """ + return self.spark_options.path + + @property + def file_format(self): + """ + Returns the file format of this feature data source. + """ + return self.spark_options.file_format + + @staticmethod + def from_proto(data_source: DataSourceProto) -> Any: + assert data_source.HasField("custom_options") + + spark_options = SparkOptions.from_proto(data_source.custom_options) + return SparkSource( + field_mapping=dict(data_source.field_mapping), + table=spark_options.table, + query=spark_options.query, + path=spark_options.path, + file_format=spark_options.file_format, + event_timestamp_column=data_source.event_timestamp_column, + created_timestamp_column=data_source.created_timestamp_column, + date_partition_column=data_source.date_partition_column, + ) + + def to_proto(self) -> DataSourceProto: + data_source_proto = DataSourceProto( + type=DataSourceProto.CUSTOM_SOURCE, + field_mapping=self.field_mapping, + custom_options=self.spark_options.to_proto(), + ) + + data_source_proto.event_timestamp_column = self.event_timestamp_column + data_source_proto.created_timestamp_column = self.created_timestamp_column + data_source_proto.date_partition_column = self.date_partition_column + + return data_source_proto + + def validate(self, config: RepoConfig): + self.get_table_column_names_and_types(config) + + @staticmethod + def source_datatype_to_feast_value_type() -> Callable[[str], ValueType]: + return spark_to_feast_value_type + + def get_table_column_names_and_types( + self, config: RepoConfig + ) -> Iterable[Tuple[str, str]]: + from feast.infra.offline_stores.contrib.spark_offline_store.spark import ( + get_spark_session_or_start_new_with_repoconfig, + ) + + spark_session = get_spark_session_or_start_new_with_repoconfig( + store_config=config.offline_store + ) + df = spark_session.sql(f"SELECT * FROM {self.get_table_query_string()}") + return ( + (fields["name"], fields["type"]) + for fields in df.schema.jsonValue()["fields"] + ) + + def get_table_query_string(self) -> str: + """Returns a string that can directly be used to reference this table in SQL""" + if self.table: + # Backticks make sure that spark sql knows this a table reference. + return f"`{self.table}`" + if self.query: + return f"({self.query})" + + # If both the table query string and the actual query are null, we can load from file. 
+ spark_session = SparkSession.getActiveSession() + if spark_session is None: + raise AssertionError("Could not find an active spark session.") + try: + df = spark_session.read.format(self.file_format).load(self.path) + except Exception: + logger.exception( + "Spark read of file source failed.\n" + traceback.format_exc() + ) + tmp_table_name = get_temp_entity_table_name() + df.createOrReplaceTempView(tmp_table_name) + + return f"`{tmp_table_name}`" + + +class SparkOptions: + def __init__( + self, + table: Optional[str] = None, + query: Optional[str] = None, + path: Optional[str] = None, + file_format: Optional[str] = None, + ): + self.table = table + self.query = query + self.path = path + self.file_format = file_format + + @classmethod + def from_proto(cls, spark_options_proto: DataSourceProto.CustomSourceOptions): + """ + Creates a SparkOptions from a protobuf representation of a spark option + args: + spark_options_proto: a protobuf representation of a datasource + Returns: + Returns a SparkOptions object based on the spark_options protobuf + """ + spark_configuration = pickle.loads(spark_options_proto.configuration) + + spark_options = cls( + table=spark_configuration.table, + query=spark_configuration.query, + path=spark_configuration.path, + file_format=spark_configuration.file_format, + ) + return spark_options + + def to_proto(self) -> DataSourceProto.CustomSourceOptions: + """ + Converts an SparkOptionsProto object to its protobuf representation. + Returns: + SparkOptionsProto protobuf + """ + + spark_options_proto = DataSourceProto.CustomSourceOptions( + configuration=pickle.dumps(self), + ) + + return spark_options_proto + + +class SavedDatasetSparkStorage(SavedDatasetStorage): + _proto_attr_name = "spark_storage" + + spark_options: SparkOptions + + def __init__(self, table_ref: Optional[str] = None, query: Optional[str] = None): + self.spark_options = SparkOptions(table=table_ref, query=query) + + @staticmethod + def from_proto(storage_proto: SavedDatasetStorageProto) -> SavedDatasetStorage: + # TODO: implementation is not correct. Needs fix and update to protos. 
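The proto round trip above simply pickles the SparkOptions into CustomSourceOptions.configuration and unpickles it on the way back. A small sketch, assuming the contrib import path and a hypothetical table name:

from feast.infra.offline_stores.contrib.spark_offline_store.spark_source import SparkOptions

options = SparkOptions(table="driver_stats")
proto = options.to_proto()                 # pickle.dumps(options) under the hood
restored = SparkOptions.from_proto(proto)  # pickle.loads(proto.configuration)
assert restored.table == "driver_stats"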
+ return SavedDatasetSparkStorage(table_ref="", query=None) + + def to_proto(self) -> SavedDatasetStorageProto: + return SavedDatasetStorageProto() + + def to_data_source(self) -> DataSource: + return SparkSource(table=self.spark_options.table) diff --git a/sdk/python/feast/repo_config.py b/sdk/python/feast/repo_config.py index 9c67a6bedc..20edebf5c7 100644 --- a/sdk/python/feast/repo_config.py +++ b/sdk/python/feast/repo_config.py @@ -39,7 +39,7 @@ "bigquery": "feast.infra.offline_stores.bigquery.BigQueryOfflineStore", "redshift": "feast.infra.offline_stores.redshift.RedshiftOfflineStore", "snowflake.offline": "feast.infra.offline_stores.snowflake.SnowflakeOfflineStore", - "spark": "feast.infra.offline_stores.third_party.spark_offline_store.spark.SparkOfflineStore", + "spark": "feast.infra.offline_stores.contrib.spark_offline_store.spark.SparkOfflineStore", } FEATURE_SERVER_CONFIG_CLASS_FOR_TYPE = { diff --git a/sdk/python/feast/templates/spark/example.py b/sdk/python/feast/templates/spark/example.py index bbeff14c48..ddda73b787 100644 --- a/sdk/python/feast/templates/spark/example.py +++ b/sdk/python/feast/templates/spark/example.py @@ -7,7 +7,7 @@ from google.protobuf.duration_pb2 import Duration from feast import Entity, Feature, FeatureView, ValueType -from feast.infra.offline_stores.third_party.spark_offline_store.spark_source import ( +from feast.infra.offline_stores.contrib.spark_offline_store.spark_source import ( SparkSource, ) diff --git a/sdk/python/tests/integration/feature_repos/universal/data_sources/spark_data_source_creator.py b/sdk/python/tests/integration/feature_repos/universal/data_sources/spark_data_source_creator.py index 13eaa30116..4284c3cf4c 100644 --- a/sdk/python/tests/integration/feature_repos/universal/data_sources/spark_data_source_creator.py +++ b/sdk/python/tests/integration/feature_repos/universal/data_sources/spark_data_source_creator.py @@ -6,10 +6,10 @@ from pyspark.sql import SparkSession from feast.data_source import DataSource -from feast.infra.offline_stores.third_party.spark_offline_store.spark import ( +from feast.infra.offline_stores.contrib.spark_offline_store.spark import ( SparkOfflineStoreConfig, ) -from feast.infra.offline_stores.third_party.spark_offline_store.spark_source import ( +from feast.infra.offline_stores.contrib.spark_offline_store.spark_source import ( SavedDatasetSparkStorage, SparkSource, ) From 29a60d79e6cea11ec7f7344178ff05c2b57573b2 Mon Sep 17 00:00:00 2001 From: Kevin Zhang Date: Fri, 4 Mar 2022 11:36:16 -0800 Subject: [PATCH 56/58] Remove third_party refactor Signed-off-by: Kevin Zhang --- sdk/python/feast/__init__.py | 6 +- .../contrib/spark_offline_store/spark.py | 6 +- .../third_party/spark_offline_store/spark.py | 522 ------------------ .../spark_offline_store/spark_source.py | 242 -------- 4 files changed, 6 insertions(+), 770 deletions(-) delete mode 100644 sdk/python/feast/infra/offline_stores/third_party/spark_offline_store/spark.py delete mode 100644 sdk/python/feast/infra/offline_stores/third_party/spark_offline_store/spark_source.py diff --git a/sdk/python/feast/__init__.py b/sdk/python/feast/__init__.py index 76167cf9f7..0af226aa05 100644 --- a/sdk/python/feast/__init__.py +++ b/sdk/python/feast/__init__.py @@ -3,12 +3,12 @@ from pkg_resources import DistributionNotFound, get_distribution from feast.infra.offline_stores.bigquery_source import BigQuerySource -from feast.infra.offline_stores.file_source import FileSource -from feast.infra.offline_stores.redshift_source import RedshiftSource -from 
feast.infra.offline_stores.snowflake_source import SnowflakeSource from feast.infra.offline_stores.contrib.spark_offline_store.spark_source import ( SparkSource, ) +from feast.infra.offline_stores.file_source import FileSource +from feast.infra.offline_stores.redshift_source import RedshiftSource +from feast.infra.offline_stores.snowflake_source import SnowflakeSource from .data_source import KafkaSource, KinesisSource, SourceType from .entity import Entity diff --git a/sdk/python/feast/infra/offline_stores/contrib/spark_offline_store/spark.py b/sdk/python/feast/infra/offline_stores/contrib/spark_offline_store/spark.py index b54987dc0b..95e306aa60 100644 --- a/sdk/python/feast/infra/offline_stores/contrib/spark_offline_store/spark.py +++ b/sdk/python/feast/infra/offline_stores/contrib/spark_offline_store/spark.py @@ -17,14 +17,14 @@ from feast.errors import InvalidEntityType from feast.feature_view import DUMMY_ENTITY_ID, DUMMY_ENTITY_VAL from feast.infra.offline_stores import offline_utils +from feast.infra.offline_stores.contrib.spark_offline_store.spark_source import ( + SparkSource, +) from feast.infra.offline_stores.offline_store import ( OfflineStore, RetrievalJob, RetrievalMetadata, ) -from feast.infra.offline_stores.contrib.spark_offline_store.spark_source import ( - SparkSource, -) from feast.registry import Registry from feast.repo_config import FeastConfigBaseModel, RepoConfig from feast.saved_dataset import SavedDatasetStorage diff --git a/sdk/python/feast/infra/offline_stores/third_party/spark_offline_store/spark.py b/sdk/python/feast/infra/offline_stores/third_party/spark_offline_store/spark.py deleted file mode 100644 index b54987dc0b..0000000000 --- a/sdk/python/feast/infra/offline_stores/third_party/spark_offline_store/spark.py +++ /dev/null @@ -1,522 +0,0 @@ -import warnings -from datetime import datetime -from typing import Dict, List, Optional, Tuple, Union - -import numpy as np -import pandas -import pandas as pd -import pyarrow -import pyspark -from pydantic import StrictStr -from pyspark import SparkConf -from pyspark.sql import SparkSession -from pytz import utc - -from feast import FeatureView, OnDemandFeatureView -from feast.data_source import DataSource -from feast.errors import InvalidEntityType -from feast.feature_view import DUMMY_ENTITY_ID, DUMMY_ENTITY_VAL -from feast.infra.offline_stores import offline_utils -from feast.infra.offline_stores.offline_store import ( - OfflineStore, - RetrievalJob, - RetrievalMetadata, -) -from feast.infra.offline_stores.contrib.spark_offline_store.spark_source import ( - SparkSource, -) -from feast.registry import Registry -from feast.repo_config import FeastConfigBaseModel, RepoConfig -from feast.saved_dataset import SavedDatasetStorage -from feast.type_map import spark_schema_to_np_dtypes - - -class SparkOfflineStoreConfig(FeastConfigBaseModel): - type: StrictStr = "spark" - """ Offline store type selector""" - - spark_conf: Optional[Dict[str, str]] = None - """ Configuration overlay for the spark session """ - # sparksession is not serializable and we dont want to pass it around as an argument - - -class SparkOfflineStore(OfflineStore): - @staticmethod - def pull_latest_from_table_or_query( - config: RepoConfig, - data_source: DataSource, - join_key_columns: List[str], - feature_name_columns: List[str], - event_timestamp_column: str, - created_timestamp_column: Optional[str], - start_date: datetime, - end_date: datetime, - ) -> RetrievalJob: - spark_session = get_spark_session_or_start_new_with_repoconfig( - 
config.offline_store - ) - assert isinstance(config.offline_store, SparkOfflineStoreConfig) - assert isinstance(data_source, SparkSource) - - warnings.warn( - "The spark offline store is an experimental feature in alpha development. " - "Some functionality may still be unstable so functionality can change in the future.", - RuntimeWarning, - ) - - print("Pulling latest features from spark offline store") - - from_expression = data_source.get_table_query_string() - - partition_by_join_key_string = ", ".join(join_key_columns) - if partition_by_join_key_string != "": - partition_by_join_key_string = ( - "PARTITION BY " + partition_by_join_key_string - ) - timestamps = [event_timestamp_column] - if created_timestamp_column: - timestamps.append(created_timestamp_column) - timestamp_desc_string = " DESC, ".join(timestamps) + " DESC" - field_string = ", ".join(join_key_columns + feature_name_columns + timestamps) - - start_date_str = _format_datetime(start_date) - end_date_str = _format_datetime(end_date) - query = f""" - SELECT - {field_string} - {f", {repr(DUMMY_ENTITY_VAL)} AS {DUMMY_ENTITY_ID}" if not join_key_columns else ""} - FROM ( - SELECT {field_string}, - ROW_NUMBER() OVER({partition_by_join_key_string} ORDER BY {timestamp_desc_string}) AS feast_row_ - FROM {from_expression} t1 - WHERE {event_timestamp_column} BETWEEN TIMESTAMP('{start_date_str}') AND TIMESTAMP('{end_date_str}') - ) t2 - WHERE feast_row_ = 1 - """ - - return SparkRetrievalJob( - spark_session=spark_session, - query=query, - full_feature_names=False, - on_demand_feature_views=None, - ) - - @staticmethod - def get_historical_features( - config: RepoConfig, - feature_views: List[FeatureView], - feature_refs: List[str], - entity_df: Union[pandas.DataFrame, str], - registry: Registry, - project: str, - full_feature_names: bool = False, - ) -> RetrievalJob: - assert isinstance(config.offline_store, SparkOfflineStoreConfig) - warnings.warn( - "The spark offline store is an experimental feature in alpha development. 
" - "Some functionality may still be unstable so functionality can change in the future.", - RuntimeWarning, - ) - spark_session = get_spark_session_or_start_new_with_repoconfig( - store_config=config.offline_store - ) - tmp_entity_df_table_name = offline_utils.get_temp_entity_table_name() - - entity_schema = _upload_entity_df_and_get_entity_schema( - spark_session=spark_session, - table_name=tmp_entity_df_table_name, - entity_df=entity_df, - ) - event_timestamp_col = offline_utils.infer_event_timestamp_from_entity_df( - entity_schema=entity_schema, - ) - entity_df_event_timestamp_range = _get_entity_df_event_timestamp_range( - entity_df, event_timestamp_col, spark_session, - ) - - expected_join_keys = offline_utils.get_expected_join_keys( - project=project, feature_views=feature_views, registry=registry - ) - offline_utils.assert_expected_columns_in_entity_df( - entity_schema=entity_schema, - join_keys=expected_join_keys, - entity_df_event_timestamp_col=event_timestamp_col, - ) - - query_context = offline_utils.get_feature_view_query_context( - feature_refs, - feature_views, - registry, - project, - entity_df_event_timestamp_range, - ) - - query = offline_utils.build_point_in_time_query( - feature_view_query_contexts=query_context, - left_table_query_string=tmp_entity_df_table_name, - entity_df_event_timestamp_col=event_timestamp_col, - entity_df_columns=entity_schema.keys(), - query_template=MULTIPLE_FEATURE_VIEW_POINT_IN_TIME_JOIN, - full_feature_names=full_feature_names, - ) - - return SparkRetrievalJob( - spark_session=spark_session, - query=query, - full_feature_names=full_feature_names, - on_demand_feature_views=OnDemandFeatureView.get_requested_odfvs( - feature_refs, project, registry - ), - metadata=RetrievalMetadata( - features=feature_refs, - keys=list(set(entity_schema.keys()) - {event_timestamp_col}), - min_event_timestamp=entity_df_event_timestamp_range[0], - max_event_timestamp=entity_df_event_timestamp_range[1], - ), - ) - - @staticmethod - def pull_all_from_table_or_query( - config: RepoConfig, - data_source: DataSource, - join_key_columns: List[str], - feature_name_columns: List[str], - event_timestamp_column: str, - start_date: datetime, - end_date: datetime, - ) -> RetrievalJob: - """ - Note that join_key_columns, feature_name_columns, event_timestamp_column, and created_timestamp_column - have all already been mapped to column names of the source table and those column names are the values passed - into this function. - """ - assert isinstance(data_source, SparkSource) - warnings.warn( - "The spark offline store is an experimental feature in alpha development. 
" - "This API is unstable and it could and most probably will be changed in the future.", - RuntimeWarning, - ) - from_expression = data_source.get_table_query_string() - - field_string = ( - '"' - + '", "'.join( - join_key_columns + feature_name_columns + [event_timestamp_column] - ) - + '"' - ) - start_date = start_date.astimezone(tz=utc) - end_date = end_date.astimezone(tz=utc) - - query = f""" - SELECT {field_string} - FROM {from_expression} - WHERE "{event_timestamp_column}" BETWEEN TIMESTAMP '{start_date}' AND TIMESTAMP '{end_date}' - """ - spark_session = get_spark_session_or_start_new_with_repoconfig( - store_config=config.offline_store - ) - return SparkRetrievalJob( - spark_session=spark_session, query=query, full_feature_names=False - ) - - -class SparkRetrievalJob(RetrievalJob): - def __init__( - self, - spark_session: SparkSession, - query: str, - full_feature_names: bool, - on_demand_feature_views: Optional[List[OnDemandFeatureView]] = None, - metadata: Optional[RetrievalMetadata] = None, - ): - super().__init__() - self.spark_session = spark_session - self.query = query - self._full_feature_names = full_feature_names - self._on_demand_feature_views = on_demand_feature_views - self._metadata = metadata - - @property - def full_feature_names(self) -> bool: - return self._full_feature_names - - @property - def on_demand_feature_views(self) -> Optional[List[OnDemandFeatureView]]: - return self._on_demand_feature_views - - def to_spark_df(self) -> pyspark.sql.DataFrame: - statements = self.query.split( - "---EOS---" - ) # TODO can do better than this dirty split - *_, last = map(self.spark_session.sql, statements) - return last - - def _to_df_internal(self) -> pd.DataFrame: - """Return dataset as Pandas DataFrame synchronously""" - return self.to_spark_df().toPandas() - - def _to_arrow_internal(self) -> pyarrow.Table: - """Return dataset as pyarrow Table synchronously""" - df = self.to_df() - return pyarrow.Table.from_pandas(df) # noqa - - def persist(self, storage: SavedDatasetStorage): - """ - Run the retrieval and persist the results in the same offline store used for read. - """ - pass - - @property - def metadata(self) -> Optional[RetrievalMetadata]: - """ - Return metadata information about retrieval. - Should be available even before materializing the dataset itself. 
- """ - return self._metadata - - -def get_spark_session_or_start_new_with_repoconfig( - store_config: SparkOfflineStoreConfig, -) -> SparkSession: - spark_session = SparkSession.getActiveSession() - if not spark_session: - spark_builder = SparkSession.builder - spark_conf = store_config.spark_conf - if spark_conf: - spark_builder = spark_builder.config( - conf=SparkConf().setAll([(k, v) for k, v in spark_conf.items()]) - ) - - spark_session = spark_builder.getOrCreate() - spark_session.conf.set("spark.sql.parser.quotedRegexColumnNames", "true") - return spark_session - - -def _get_entity_df_event_timestamp_range( - entity_df: Union[pd.DataFrame, str], - entity_df_event_timestamp_col: str, - spark_session: SparkSession, -) -> Tuple[datetime, datetime]: - if isinstance(entity_df, pd.DataFrame): - entity_df_event_timestamp = entity_df.loc[ - :, entity_df_event_timestamp_col - ].infer_objects() - if pd.api.types.is_string_dtype(entity_df_event_timestamp): - entity_df_event_timestamp = pd.to_datetime( - entity_df_event_timestamp, utc=True - ) - entity_df_event_timestamp_range = ( - entity_df_event_timestamp.min().to_pydatetime(), - entity_df_event_timestamp.max().to_pydatetime(), - ) - elif isinstance(entity_df, str): - # If the entity_df is a string (SQL query), determine range - # from table - df = spark_session.sql(entity_df).select(entity_df_event_timestamp_col) - # TODO(kzhang132): need utc conversion here. - entity_df_event_timestamp_range = ( - df.agg({entity_df_event_timestamp_col: "max"}).collect()[0][0], - df.agg({entity_df_event_timestamp_col: "min"}).collect()[0][0], - ) - else: - raise InvalidEntityType(type(entity_df)) - - return entity_df_event_timestamp_range - - -def _upload_entity_df_and_get_entity_schema( - spark_session: SparkSession, - table_name: str, - entity_df: Union[pandas.DataFrame, str], -) -> Dict[str, np.dtype]: - if isinstance(entity_df, pd.DataFrame): - spark_session.createDataFrame(entity_df).createOrReplaceTempView(table_name) - return dict(zip(entity_df.columns, entity_df.dtypes)) - elif isinstance(entity_df, str): - spark_session.sql(entity_df).createOrReplaceTempView(table_name) - limited_entity_df = spark_session.table(table_name) - return dict( - zip( - limited_entity_df.columns, - spark_schema_to_np_dtypes(limited_entity_df.dtypes), - ) - ) - else: - raise InvalidEntityType(type(entity_df)) - - -def _format_datetime(t: datetime) -> str: - # Since Hive does not support timezone, need to transform to utc. 
- if t.tzinfo: - t = t.astimezone(tz=utc) - dt = t.strftime("%Y-%m-%d %H:%M:%S.%f") - return dt - - -MULTIPLE_FEATURE_VIEW_POINT_IN_TIME_JOIN = """/* - Compute a deterministic hash for the `left_table_query_string` that will be used throughout - all the logic as the field to GROUP BY the data -*/ -CREATE OR REPLACE TEMPORARY VIEW entity_dataframe AS ( - SELECT *, - {{entity_df_event_timestamp_col}} AS entity_timestamp - {% for featureview in featureviews %} - ,CONCAT( - {% for entity in featureview.entities %} - CAST({{entity}} AS STRING), - {% endfor %} - CAST({{entity_df_event_timestamp_col}} AS STRING) - ) AS {{featureview.name}}__entity_row_unique_id - {% endfor %} - FROM {{ left_table_query_string }} -); ----EOS--- --- Start create temporary table *__base -{% for featureview in featureviews %} -CREATE OR REPLACE TEMPORARY VIEW {{ featureview.name }}__base AS -WITH {{ featureview.name }}__entity_dataframe AS ( - SELECT - {{ featureview.entities | join(', ')}}, - entity_timestamp, - {{featureview.name}}__entity_row_unique_id - FROM entity_dataframe - GROUP BY {{ featureview.entities | join(', ')}}, entity_timestamp, {{featureview.name}}__entity_row_unique_id -), -/* - This query template performs the point-in-time correctness join for a single feature set table - to the provided entity table. - 1. We first join the current feature_view to the entity dataframe that has been passed. - This JOIN has the following logic: - - For each row of the entity dataframe, only keep the rows where the `event_timestamp_column` - is less than the one provided in the entity dataframe - - If there a TTL for the current feature_view, also keep the rows where the `event_timestamp_column` - is higher the the one provided minus the TTL - - For each row, Join on the entity key and retrieve the `entity_row_unique_id` that has been - computed previously - The output of this CTE will contain all the necessary information and already filtered out most - of the data that is not relevant. 
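Step 1 described above boils down to a per-entity-row time window: a feature row must not be newer than the entity timestamp and, when a TTL is set, not older than the entity timestamp minus the TTL. A tiny pandas sketch of that filter, on made-up driver data:

from datetime import timedelta

import pandas as pd

entity_ts = pd.Timestamp("2022-03-04 12:00:00")
ttl = timedelta(hours=2)
feature_rows = pd.DataFrame({
    "driver_id": [1001, 1001, 1001],
    "event_timestamp": pd.to_datetime(
        ["2022-03-04 09:00", "2022-03-04 11:00", "2022-03-04 13:00"]
    ),
    "conv_rate": [0.1, 0.2, 0.3],
})
eligible = feature_rows[
    (feature_rows.event_timestamp <= entity_ts)
    & (feature_rows.event_timestamp >= entity_ts - ttl)
]
# Only the 11:00 row survives: 09:00 falls outside the TTL window, 13:00 is in the future.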
-*/ -{{ featureview.name }}__subquery AS ( - SELECT - {{ featureview.event_timestamp_column }} as event_timestamp, - {{ featureview.created_timestamp_column ~ ' as created_timestamp,' if featureview.created_timestamp_column else '' }} - {{ featureview.entity_selections | join(', ')}}, - {% for feature in featureview.features %} - {{ feature }} as {% if full_feature_names %}{{ featureview.name }}__{{feature}}{% else %}{{ feature }}{% endif %}{% if loop.last %}{% else %}, {% endif %} - {% endfor %} - FROM {{ featureview.table_subquery }} AS subquery - INNER JOIN ( - SELECT MAX(entity_timestamp) as max_entity_timestamp_ - {% if featureview.ttl == 0 %}{% else %} - ,(MIN(entity_timestamp) - interval '{{ featureview.ttl }}' second) as min_entity_timestamp_ - {% endif %} - FROM entity_dataframe - ) AS temp - ON ( - {{ featureview.event_timestamp_column }} <= max_entity_timestamp_ - {% if featureview.ttl == 0 %}{% else %} - AND {{ featureview.event_timestamp_column }} >= min_entity_timestamp_ - {% endif %} - ) -) -SELECT - subquery.*, - entity_dataframe.entity_timestamp, - entity_dataframe.{{featureview.name}}__entity_row_unique_id -FROM {{ featureview.name }}__subquery AS subquery -INNER JOIN ( - SELECT * - {% if featureview.ttl == 0 %}{% else %} - , (entity_timestamp - interval '{{ featureview.ttl }}' second) as ttl_entity_timestamp - {% endif %} - FROM {{ featureview.name }}__entity_dataframe -) AS entity_dataframe -ON ( - subquery.event_timestamp <= entity_dataframe.entity_timestamp - {% if featureview.ttl == 0 %}{% else %} - AND subquery.event_timestamp >= entity_dataframe.ttl_entity_timestamp - {% endif %} - {% for entity in featureview.entities %} - AND subquery.{{ entity }} = entity_dataframe.{{ entity }} - {% endfor %} -); ----EOS--- -{% endfor %} --- End create temporary table *__base -{% for featureview in featureviews %} -{% if loop.first %}WITH{% endif %} -/* - 2. If the `created_timestamp_column` has been set, we need to - deduplicate the data first. This is done by calculating the - `MAX(created_at_timestamp)` for each event_timestamp. - We then join the data on the next CTE -*/ -{% if featureview.created_timestamp_column %} -{{ featureview.name }}__dedup AS ( - SELECT - {{featureview.name}}__entity_row_unique_id, - event_timestamp, - MAX(created_timestamp) as created_timestamp - FROM {{ featureview.name }}__base - GROUP BY {{featureview.name}}__entity_row_unique_id, event_timestamp -), -{% endif %} -/* - 3. The data has been filtered during the first CTE "*__base" - Thus we only need to compute the latest timestamp of each feature. -*/ -{{ featureview.name }}__latest AS ( - SELECT - base.{{featureview.name}}__entity_row_unique_id, - MAX(base.event_timestamp) AS event_timestamp - {% if featureview.created_timestamp_column %} - ,MAX(base.created_timestamp) AS created_timestamp - {% endif %} - FROM {{ featureview.name }}__base AS base - {% if featureview.created_timestamp_column %} - INNER JOIN {{ featureview.name }}__dedup AS dedup - ON ( - dedup.{{featureview.name}}__entity_row_unique_id=base.{{featureview.name}}__entity_row_unique_id - AND dedup.event_timestamp=base.event_timestamp - AND dedup.created_timestamp=base.created_timestamp - ) - {% endif %} - GROUP BY base.{{featureview.name}}__entity_row_unique_id -), -/* - 4. 
Once we know the latest value of each feature for a given timestamp, - we can join again the data back to the original "base" dataset -*/ -{{ featureview.name }}__cleaned AS ( - SELECT base.* - FROM {{ featureview.name }}__base AS base - INNER JOIN {{ featureview.name }}__latest AS latest - ON ( - base.{{featureview.name}}__entity_row_unique_id=latest.{{featureview.name}}__entity_row_unique_id - AND base.event_timestamp=latest.event_timestamp - {% if featureview.created_timestamp_column %} - AND base.created_timestamp=latest.created_timestamp - {% endif %} - ) -){% if loop.last %}{% else %}, {% endif %} -{% endfor %} -/* - Joins the outputs of multiple time travel joins to a single table. - The entity_dataframe dataset being our source of truth here. - */ -SELECT `(entity_timestamp|{% for featureview in featureviews %}{{featureview.name}}__entity_row_unique_id{% if loop.last %}{% else %}|{% endif %}{% endfor %})?+.+` -FROM entity_dataframe -{% for featureview in featureviews %} -LEFT JOIN ( - SELECT - {{featureview.name}}__entity_row_unique_id - {% for feature in featureview.features %} - ,{% if full_feature_names %}{{ featureview.name }}__{{feature}}{% else %}{{ feature }}{% endif %} - {% endfor %} - FROM {{ featureview.name }}__cleaned -) AS {{ featureview.name }}__joined -ON ( - {{ featureview.name }}__joined.{{featureview.name}}__entity_row_unique_id=entity_dataframe.{{featureview.name}}__entity_row_unique_id -) -{% endfor %}""" diff --git a/sdk/python/feast/infra/offline_stores/third_party/spark_offline_store/spark_source.py b/sdk/python/feast/infra/offline_stores/third_party/spark_offline_store/spark_source.py deleted file mode 100644 index 50e365a631..0000000000 --- a/sdk/python/feast/infra/offline_stores/third_party/spark_offline_store/spark_source.py +++ /dev/null @@ -1,242 +0,0 @@ -import logging -import pickle -import traceback -import warnings -from enum import Enum -from typing import Any, Callable, Dict, Iterable, Optional, Tuple - -from pyspark.sql import SparkSession - -from feast.data_source import DataSource -from feast.infra.offline_stores.offline_utils import get_temp_entity_table_name -from feast.protos.feast.core.DataSource_pb2 import DataSource as DataSourceProto -from feast.protos.feast.core.SavedDataset_pb2 import ( - SavedDatasetStorage as SavedDatasetStorageProto, -) -from feast.repo_config import RepoConfig -from feast.saved_dataset import SavedDatasetStorage -from feast.type_map import spark_to_feast_value_type -from feast.value_type import ValueType - -logger = logging.getLogger(__name__) - - -class SparkSourceFormat(Enum): - csv = "csv" - json = "json" - parquet = "parquet" - - -class SparkSource(DataSource): - def __init__( - self, - table: Optional[str] = None, - query: Optional[str] = None, - path: Optional[str] = None, - file_format: Optional[str] = None, - event_timestamp_column: Optional[str] = None, - created_timestamp_column: Optional[str] = None, - field_mapping: Optional[Dict[str, str]] = None, - date_partition_column: Optional[str] = None, - ): - super().__init__( - event_timestamp_column, - created_timestamp_column, - field_mapping, - date_partition_column, - ) - warnings.warn( - "The spark data source API is an experimental feature in alpha development. " - "This API is unstable and it could and most probably will be changed in the future.", - RuntimeWarning, - ) - self.allowed_formats = [format.value for format in SparkSourceFormat] - - # Check that only one of the ways to load a spark dataframe can be used. 
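Steps 2 to 4 of the point-in-time template above reduce to: for each entity row, keep the feature row with the greatest event_timestamp, breaking ties on created_timestamp. The same selection in pandas, on made-up data:

import pandas as pd

eligible = pd.DataFrame({
    "driver_id": [1001, 1001],
    "event_timestamp": pd.to_datetime(["2022-03-04 10:30", "2022-03-04 11:00"]),
    "created_timestamp": pd.to_datetime(["2022-03-04 10:31", "2022-03-04 11:01"]),
    "conv_rate": [0.15, 0.20],
})
latest = (
    eligible.sort_values(["event_timestamp", "created_timestamp"])
    .groupby("driver_id", as_index=False)
    .last()
)
print(latest)  # the 11:00 row wins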
- if sum([(arg is not None) for arg in [table, query, path]]) != 1: - raise ValueError( - "Exactly one of params(table, query, path) must be specified." - ) - - if path is not None: - if file_format is None: - raise ValueError( - "If 'path' is specified, then 'file_format' is required." - ) - if file_format not in self.allowed_formats: - raise ValueError( - f"'file_format' should be one of {self.allowed_formats}" - ) - - self.spark_options = SparkOptions( - table=table, query=query, path=path, file_format=file_format, - ) - - @property - def table(self): - """ - Returns the table of this feature data source - """ - return self.spark_options.table - - @property - def query(self): - """ - Returns the query of this feature data source - """ - return self.spark_options.query - - @property - def path(self): - """ - Returns the path of the spark data source file. - """ - return self.spark_options.path - - @property - def file_format(self): - """ - Returns the file format of this feature data source. - """ - return self.spark_options.file_format - - @staticmethod - def from_proto(data_source: DataSourceProto) -> Any: - assert data_source.HasField("custom_options") - - spark_options = SparkOptions.from_proto(data_source.custom_options) - return SparkSource( - field_mapping=dict(data_source.field_mapping), - table=spark_options.table, - query=spark_options.query, - path=spark_options.path, - file_format=spark_options.file_format, - event_timestamp_column=data_source.event_timestamp_column, - created_timestamp_column=data_source.created_timestamp_column, - date_partition_column=data_source.date_partition_column, - ) - - def to_proto(self) -> DataSourceProto: - data_source_proto = DataSourceProto( - type=DataSourceProto.CUSTOM_SOURCE, - field_mapping=self.field_mapping, - custom_options=self.spark_options.to_proto(), - ) - - data_source_proto.event_timestamp_column = self.event_timestamp_column - data_source_proto.created_timestamp_column = self.created_timestamp_column - data_source_proto.date_partition_column = self.date_partition_column - - return data_source_proto - - def validate(self, config: RepoConfig): - self.get_table_column_names_and_types(config) - - @staticmethod - def source_datatype_to_feast_value_type() -> Callable[[str], ValueType]: - return spark_to_feast_value_type - - def get_table_column_names_and_types( - self, config: RepoConfig - ) -> Iterable[Tuple[str, str]]: - from feast.infra.offline_stores.contrib.spark_offline_store.spark import ( - get_spark_session_or_start_new_with_repoconfig, - ) - - spark_session = get_spark_session_or_start_new_with_repoconfig( - store_config=config.offline_store - ) - df = spark_session.sql(f"SELECT * FROM {self.get_table_query_string()}") - return ( - (fields["name"], fields["type"]) - for fields in df.schema.jsonValue()["fields"] - ) - - def get_table_query_string(self) -> str: - """Returns a string that can directly be used to reference this table in SQL""" - if self.table: - # Backticks make sure that spark sql knows this a table reference. - return f"`{self.table}`" - if self.query: - return f"({self.query})" - - # If both the table query string and the actual query are null, we can load from file. 
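get_table_query_string above produces a fragment that can be dropped straight into a FROM clause: table names are backtick-quoted, queries are parenthesised into a subquery. A sketch, assuming the contrib import path and hypothetical names:

from feast.infra.offline_stores.contrib.spark_offline_store.spark_source import SparkSource

table_source = SparkSource(table="driver_stats", event_timestamp_column="event_timestamp")
assert table_source.get_table_query_string() == "`driver_stats`"

query_source = SparkSource(
    query="SELECT * FROM driver_stats", event_timestamp_column="event_timestamp"
)
assert query_source.get_table_query_string() == "(SELECT * FROM driver_stats)"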
- spark_session = SparkSession.getActiveSession() - if spark_session is None: - raise AssertionError("Could not find an active spark session.") - try: - df = spark_session.read.format(self.file_format).load(self.path) - except Exception: - logger.exception( - "Spark read of file source failed.\n" + traceback.format_exc() - ) - tmp_table_name = get_temp_entity_table_name() - df.createOrReplaceTempView(tmp_table_name) - - return f"`{tmp_table_name}`" - - -class SparkOptions: - def __init__( - self, - table: Optional[str] = None, - query: Optional[str] = None, - path: Optional[str] = None, - file_format: Optional[str] = None, - ): - self.table = table - self.query = query - self.path = path - self.file_format = file_format - - @classmethod - def from_proto(cls, spark_options_proto: DataSourceProto.CustomSourceOptions): - """ - Creates a SparkOptions from a protobuf representation of a spark option - args: - spark_options_proto: a protobuf representation of a datasource - Returns: - Returns a SparkOptions object based on the spark_options protobuf - """ - spark_configuration = pickle.loads(spark_options_proto.configuration) - - spark_options = cls( - table=spark_configuration.table, - query=spark_configuration.query, - path=spark_configuration.path, - file_format=spark_configuration.file_format, - ) - return spark_options - - def to_proto(self) -> DataSourceProto.CustomSourceOptions: - """ - Converts an SparkOptionsProto object to its protobuf representation. - Returns: - SparkOptionsProto protobuf - """ - - spark_options_proto = DataSourceProto.CustomSourceOptions( - configuration=pickle.dumps(self), - ) - - return spark_options_proto - - -class SavedDatasetSparkStorage(SavedDatasetStorage): - _proto_attr_name = "spark_storage" - - spark_options: SparkOptions - - def __init__(self, table_ref: Optional[str] = None, query: Optional[str] = None): - self.spark_options = SparkOptions(table=table_ref, query=query) - - @staticmethod - def from_proto(storage_proto: SavedDatasetStorageProto) -> SavedDatasetStorage: - # TODO: implementation is not correct. Needs fix and update to protos. - return SavedDatasetSparkStorage(table_ref="", query=None) - - def to_proto(self) -> SavedDatasetStorageProto: - return SavedDatasetStorageProto() - - def to_data_source(self) -> DataSource: - return SparkSource(table=self.spark_options.table) From b43417eaefbc38ac7ffb081c6f8d2771a4ef07ad Mon Sep 17 00:00:00 2001 From: Kevin Zhang Date: Fri, 4 Mar 2022 12:02:22 -0800 Subject: [PATCH 57/58] Revert ci requirements and update comment in type map Signed-off-by: Kevin Zhang --- sdk/python/feast/type_map.py | 2 +- .../requirements/py3.7-ci-requirements.txt | 84 +++++++++++-------- .../requirements/py3.8-ci-requirements.txt | 14 ++-- 3 files changed, 57 insertions(+), 43 deletions(-) diff --git a/sdk/python/feast/type_map.py b/sdk/python/feast/type_map.py index b1d3dfc706..713b952d09 100644 --- a/sdk/python/feast/type_map.py +++ b/sdk/python/feast/type_map.py @@ -362,7 +362,7 @@ def _python_value_to_proto_value( if valid_scalar_types: assert type(sample) in valid_scalar_types if feast_value_type == ValueType.BOOL: - # ProtoValue no longer supports conversion of np.bool_ so we need it convert it. + # ProtoValue does not support conversion of np.bool_ so we need to convert it to support np.bool_. 
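The reworded comment above refers to casting numpy booleans before building the ProtoValue: np.bool_ is not a subclass of Python's bool, so a plain bool() call is needed for a scalar. A minimal illustration (the surrounding type_map logic is not reproduced here):

import numpy as np

sample = np.bool_(True)
assert not isinstance(sample, bool)    # numpy bools are their own scalar type
assert isinstance(bool(sample), bool)  # cast to a native bool before the proto assignment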
return [ ProtoValue( **{ diff --git a/sdk/python/requirements/py3.7-ci-requirements.txt b/sdk/python/requirements/py3.7-ci-requirements.txt index 93ab57002b..6cb8c2931b 100644 --- a/sdk/python/requirements/py3.7-ci-requirements.txt +++ b/sdk/python/requirements/py3.7-ci-requirements.txt @@ -2,7 +2,7 @@ # This file is autogenerated by pip-compile with python 3.7 # To update, run: # -# pip-compile --extra=ci --output-file=requirements/py3.7-ci-requirements.txt setup.py +# pip-compile --extra=ci --output-file=requirements/py3.7-ci-requirements.txt # absl-py==1.0.0 # via tensorflow-metadata @@ -44,6 +44,8 @@ assertpy==1.1 # via feast (setup.py) async-timeout==4.0.2 # via aiohttp +asynctest==0.13.0 + # via aiohttp attrs==21.4.0 # via # aiohttp @@ -52,14 +54,14 @@ attrs==21.4.0 # pytest avro==1.10.0 # via feast (setup.py) -azure-core==1.22.1 +azure-core==1.23.0 # via # adlfs # azure-identity # azure-storage-blob azure-datalake-store==0.0.52 # via adlfs -azure-identity==1.7.1 +azure-identity==1.8.0 # via adlfs azure-storage-blob==12.9.0 # via adlfs @@ -67,7 +69,7 @@ babel==2.9.1 # via sphinx backcall==0.2.0 # via ipython -backports.zoneinfo==0.2.1 +backports-zoneinfo==0.2.1 # via # pytz-deprecation-shim # tzlocal @@ -75,11 +77,11 @@ black==19.10b0 # via feast (setup.py) bleach==4.1.0 # via nbconvert -boto3==1.21.8 +boto3==1.21.11 # via # feast (setup.py) # moto -botocore==1.24.8 +botocore==1.24.11 # via # boto3 # moto @@ -195,7 +197,7 @@ google-api-core[grpc]==1.31.5 # google-cloud-core # google-cloud-datastore # google-cloud-firestore -google-api-python-client==2.38.0 +google-api-python-client==2.39.0 # via firebase-admin google-auth==1.35.0 # via @@ -210,7 +212,7 @@ google-auth-httplib2==0.1.0 # via google-api-python-client google-auth-oauthlib==0.5.0 # via gcsfs -google-cloud-bigquery==2.34.0 +google-cloud-bigquery==2.34.1 # via feast (setup.py) google-cloud-bigquery-storage==2.12.0 # via feast (setup.py) @@ -221,7 +223,7 @@ google-cloud-core==1.7.2 # google-cloud-datastore # google-cloud-firestore # google-cloud-storage -google-cloud-datastore==2.4.0 +google-cloud-datastore==2.5.0 # via feast (setup.py) google-cloud-firestore==2.3.4 # via firebase-admin @@ -278,7 +280,17 @@ idna==3.3 imagesize==1.3.0 # via sphinx importlib-metadata==4.2.0 - # via great-expectations + # via + # click + # flake8 + # great-expectations + # jsonschema + # moto + # pep517 + # pluggy + # pre-commit + # pytest + # virtualenv importlib-resources==5.4.0 # via jsonschema iniconfig==1.1.1 @@ -342,13 +354,9 @@ jupyterlab-pygments==0.1.2 # via nbconvert jupyterlab-widgets==1.0.2 # via ipywidgets -libcst==0.4.1 - # via - # google-cloud-bigquery-storage - # google-cloud-datastore locket==0.2.1 # via partd -markupsafe==2.0.1 +markupsafe==2.1.0 # via # jinja2 # moto @@ -391,9 +399,7 @@ multidict==6.0.2 mypy==0.931 # via feast (setup.py) mypy-extensions==0.4.3 - # via - # mypy - # typing-inspect + # via mypy mypy-protobuf==3.1.0 # via feast (setup.py) nbclient==0.5.11 @@ -604,7 +610,6 @@ pyyaml==6.0 # via # dask # feast (setup.py) - # libcst # pre-commit # uvicorn pyzmq==22.3.0 @@ -617,7 +622,7 @@ redis==3.5.3 # redis-py-cluster redis-py-cluster==2.1.3 # via feast (setup.py) -regex==2022.1.18 +regex==2022.3.2 # via black requests==2.27.1 # via @@ -647,10 +652,10 @@ responses==0.18.0 # via moto rsa==4.8 # via google-auth -ruamel.yaml==0.17.17 +ruamel-yaml==0.17.17 # via great-expectations -ruamel.yaml.clib==0.2.6 - # via ruamel.yaml +ruamel-yaml-clib==0.2.6 + # via ruamel-yaml s3transfer==0.5.2 # via boto3 scipy==1.7.3 @@ 
-707,11 +712,11 @@ tabulate==0.8.9 # via feast (setup.py) tenacity==8.0.1 # via feast (setup.py) -tensorflow-metadata==1.6.0 +tensorflow-metadata==1.7.0 # via feast (setup.py) termcolor==1.1.0 # via great-expectations -terminado==0.13.1 +terminado==0.13.2 # via notebook testcontainers==3.4.2 # via feast (setup.py) @@ -739,7 +744,7 @@ tornado==6.1 # jupyter-client # notebook # terminado -tqdm==4.62.3 +tqdm==4.63.0 # via # feast (setup.py) # great-expectations @@ -756,10 +761,10 @@ traitlets==5.1.1 # nbformat # notebook typed-ast==1.5.2 - # via black -types-futures==3.3.8 - # via types-protobuf -types-protobuf==3.19.2 + # via + # black + # mypy +types-protobuf==3.19.12 # via # feast (setup.py) # mypy-protobuf @@ -777,17 +782,25 @@ types-setuptools==57.4.9 # via feast (setup.py) types-tabulate==0.8.5 # via feast (setup.py) -types-urllib3==1.26.9 +types-urllib3==1.26.10 # via types-requests typing-extensions==4.1.1 # via + # aiohttp + # anyio + # argon2-cffi + # asgiref + # async-timeout + # azure-core # great-expectations - # libcst + # h11 + # importlib-metadata + # jsonschema # mypy # pydantic - # typing-inspect -typing-inspect==0.7.1 - # via libcst + # starlette + # uvicorn + # yarl tzdata==2021.5 # via pytz-deprecation-shim tzlocal==4.1 @@ -833,6 +846,7 @@ zipp==3.7.0 # via # importlib-metadata # importlib-resources + # pep517 # The following packages are considered to be unsafe in a requirements file: # pip diff --git a/sdk/python/requirements/py3.8-ci-requirements.txt b/sdk/python/requirements/py3.8-ci-requirements.txt index a146335e1a..f5da5525ba 100644 --- a/sdk/python/requirements/py3.8-ci-requirements.txt +++ b/sdk/python/requirements/py3.8-ci-requirements.txt @@ -69,7 +69,7 @@ babel==2.9.1 # via sphinx backcall==0.2.0 # via ipython -backports-zoneinfo==0.2.1 +backports.zoneinfo==0.2.1 # via # pytz-deprecation-shim # tzlocal @@ -77,11 +77,11 @@ black==19.10b0 # via feast (setup.py) bleach==4.1.0 # via nbconvert -boto3==1.21.12 +boto3==1.21.11 # via # feast (setup.py) # moto -botocore==1.24.12 +botocore==1.24.11 # via # boto3 # moto @@ -646,10 +646,10 @@ responses==0.18.0 # via moto rsa==4.8 # via google-auth -ruamel-yaml==0.17.17 +ruamel.yaml==0.17.17 # via great-expectations -ruamel-yaml-clib==0.2.6 - # via ruamel-yaml +ruamel.yaml.clib==0.2.6 + # via ruamel.yaml s3transfer==0.5.2 # via boto3 scipy==1.8.0 @@ -833,4 +833,4 @@ zipp==3.7.0 # The following packages are considered to be unsafe in a requirements file: # pip -# setuptools +# setuptools \ No newline at end of file From 1acc088b11050362db5574d607b4a1d85e4b0bfc Mon Sep 17 00:00:00 2001 From: Kevin Zhang Date: Fri, 4 Mar 2022 12:11:53 -0800 Subject: [PATCH 58/58] Revert 3.8-requirements Signed-off-by: Kevin Zhang --- .../requirements/py3.8-requirements.txt | 736 +----------------- 1 file changed, 25 insertions(+), 711 deletions(-) diff --git a/sdk/python/requirements/py3.8-requirements.txt b/sdk/python/requirements/py3.8-requirements.txt index 6b2a9661dd..90b4276013 100644 --- a/sdk/python/requirements/py3.8-requirements.txt +++ b/sdk/python/requirements/py3.8-requirements.txt @@ -2,706 +2,127 @@ # This file is autogenerated by pip-compile with python 3.8 # To update, run: # -# pip-compile --extra=ci --output-file=requirements/py3.8-ci-requirements.txt setup.py +# pip-compile --output-file=requirements/py3.8-requirements.txt # absl-py==1.0.0 # via tensorflow-metadata -adal==1.2.7 - # via - # azure-datalake-store - # msrestazure -adlfs==0.5.9 - # via feast (setup.py) -aiohttp==3.8.1 - # via - # adlfs - # gcsfs 
-aiosignal==1.2.0 - # via aiohttp -alabaster==0.7.12 - # via sphinx -altair==4.2.0 - # via great-expectations anyio==3.5.0 # via starlette -appdirs==1.4.4 - # via black -appnope==0.1.2 - # via - # ipykernel - # ipython -argon2-cffi==21.3.0 - # via notebook -argon2-cffi-bindings==21.2.0 - # via argon2-cffi -asgiref==3.5.0 +asgiref==3.4.1 # via uvicorn -asn1crypto==1.4.0 - # via - # oscrypto - # snowflake-connector-python -assertpy==1.1 - # via feast (setup.py) -async-timeout==4.0.2 - # via aiohttp attrs==21.4.0 - # via - # aiohttp - # black - # jsonschema - # pytest -avro==1.10.0 - # via feast (setup.py) -azure-core==1.21.1 - # via - # adlfs - # azure-identity - # azure-storage-blob -azure-datalake-store==0.0.52 - # via adlfs -azure-identity==1.7.1 - # via adlfs -azure-storage-blob==12.9.0 - # via adlfs -babel==2.9.1 - # via sphinx -backcall==0.2.0 - # via ipython -backports.zoneinfo==0.2.1 - # via - # pytz-deprecation-shim - # tzlocal -black==19.10b0 - # via feast (setup.py) -bleach==4.1.0 - # via nbconvert -boto3==1.20.46 - # via - # feast (setup.py) - # moto -botocore==1.23.46 - # via - # boto3 - # moto - # s3transfer -cachecontrol==0.12.10 - # via firebase-admin + # via jsonschema cachetools==4.2.4 # via google-auth certifi==2021.10.8 - # via - # minio - # msrest - # requests - # snowflake-connector-python -cffi==1.15.0 - # via - # argon2-cffi-bindings - # azure-datalake-store - # cryptography - # snowflake-connector-python -cfgv==3.3.1 - # via pre-commit -charset-normalizer==2.0.11 - # via - # aiohttp - # requests - # snowflake-connector-python + # via requests +charset-normalizer==2.0.10 + # via requests click==8.0.3 # via - # black # feast (setup.py) - # great-expectations - # pip-tools # uvicorn -cloudpickle==2.0.0 - # via dask colorama==0.4.4 # via feast (setup.py) -coverage[toml]==6.3 - # via pytest-cov -cryptography==3.3.2 - # via - # adal - # azure-identity - # azure-storage-blob - # feast (setup.py) - # moto - # msal - # pyjwt - # pyopenssl - # snowflake-connector-python -dask==2022.1.1 - # via feast (setup.py) -debugpy==1.5.1 - # via ipykernel -decorator==5.1.1 - # via - # gcsfs - # ipython -defusedxml==0.7.1 - # via nbconvert -deprecation==2.1.0 - # via testcontainers dill==0.3.4 # via feast (setup.py) -distlib==0.3.4 - # via virtualenv -docker==5.0.3 - # via - # feast (setup.py) - # testcontainers -docutils==0.17.1 - # via - # sphinx - # sphinx-rtd-theme -entrypoints==0.3 - # via - # altair - # jupyter-client - # nbconvert -execnet==1.9.0 - # via pytest-xdist -fastapi==0.73.0 +fastapi==0.72.0 # via feast (setup.py) fastavro==1.4.9 # via # feast (setup.py) # pandavro -filelock==3.4.2 - # via virtualenv -firebase-admin==4.5.2 - # via feast (setup.py) -flake8==4.0.1 - # via feast (setup.py) -frozenlist==1.3.0 - # via - # aiohttp - # aiosignal -fsspec==2022.1.0 - # via - # adlfs - # dask - # gcsfs -gcsfs==2022.1.0 - # via feast (setup.py) -google-api-core[grpc]==1.31.5 - # via - # feast (setup.py) - # firebase-admin - # google-api-python-client - # google-cloud-bigquery - # google-cloud-bigquery-storage - # google-cloud-core - # google-cloud-datastore - # google-cloud-firestore -google-api-python-client==2.36.0 - # via firebase-admin -google-auth==1.35.0 - # via - # gcsfs - # google-api-core - # google-api-python-client - # google-auth-httplib2 - # google-auth-oauthlib - # google-cloud-core - # google-cloud-storage -google-auth-httplib2==0.1.0 - # via google-api-python-client -google-auth-oauthlib==0.4.6 - # via gcsfs -google-cloud-bigquery==2.32.0 - # via feast (setup.py) 
-google-cloud-bigquery-storage==2.11.0 - # via feast (setup.py) -google-cloud-core==1.7.2 - # via - # feast (setup.py) - # google-cloud-bigquery - # google-cloud-datastore - # google-cloud-firestore - # google-cloud-storage -google-cloud-datastore==2.4.0 +google-api-core==2.4.0 # via feast (setup.py) -google-cloud-firestore==2.3.4 - # via firebase-admin -google-cloud-storage==1.40.0 - # via - # feast (setup.py) - # firebase-admin - # gcsfs -google-crc32c==1.3.0 - # via google-resumable-media -google-resumable-media==1.3.3 - # via - # google-cloud-bigquery - # google-cloud-storage +google-auth==2.3.3 + # via google-api-core googleapis-common-protos==1.52.0 # via # feast (setup.py) # google-api-core # tensorflow-metadata -great-expectations==0.14.4 - # via feast (setup.py) grpcio==1.43.0 # via # feast (setup.py) - # google-api-core - # google-cloud-bigquery # grpcio-reflection - # grpcio-testing - # grpcio-tools grpcio-reflection==1.43.0 # via feast (setup.py) -grpcio-testing==1.34.0 - # via feast (setup.py) -grpcio-tools==1.34.0 - # via feast (setup.py) h11==0.13.0 # via uvicorn -hiredis==2.0.0 - # via feast (setup.py) -httplib2==0.20.2 - # via - # google-api-python-client - # google-auth-httplib2 httptools==0.3.0 # via uvicorn -identify==2.4.7 - # via pre-commit idna==3.3 # via # anyio # requests - # snowflake-connector-python - # yarl -imagesize==1.3.0 - # via sphinx -importlib-metadata==4.2.0 - # via great-expectations importlib-resources==5.4.0 # via jsonschema -iniconfig==1.1.1 - # via pytest -ipykernel==6.7.0 - # via - # ipywidgets - # notebook -ipython==7.31.1 - # via - # ipykernel - # ipywidgets -ipython-genutils==0.2.0 - # via - # ipywidgets - # nbformat - # notebook -ipywidgets==7.6.5 - # via great-expectations -isodate==0.6.1 - # via msrest -isort==5.10.1 - # via feast (setup.py) -jedi==0.18.1 - # via ipython jinja2==3.0.3 - # via - # altair - # feast (setup.py) - # great-expectations - # moto - # nbconvert - # notebook - # sphinx -jmespath==0.10.0 - # via - # boto3 - # botocore -jsonpatch==1.32 - # via great-expectations -jsonpointer==2.2 - # via jsonpatch + # via feast (setup.py) jsonschema==4.4.0 - # via - # altair - # feast (setup.py) - # great-expectations - # nbformat -jupyter-client==7.1.2 - # via - # ipykernel - # nbclient - # notebook -jupyter-core==4.9.1 - # via - # jupyter-client - # nbconvert - # nbformat - # notebook -jupyterlab-pygments==0.1.2 - # via nbconvert -jupyterlab-widgets==1.0.2 - # via ipywidgets -libcst==0.4.1 - # via - # google-cloud-bigquery-storage - # google-cloud-datastore -locket==0.2.1 - # via partd -markupsafe==2.0.1 - # via - # jinja2 - # moto -matplotlib-inline==0.1.3 - # via - # ipykernel - # ipython -mccabe==0.6.1 - # via flake8 -minio==7.1.0 # via feast (setup.py) -mistune==0.8.4 - # via - # great-expectations - # nbconvert +markupsafe==2.0.1 + # via jinja2 mmh3==3.0.0 # via feast (setup.py) -mock==2.0.0 - # via feast (setup.py) -moto==3.0.2 - # via feast (setup.py) -msal==1.16.0 - # via - # azure-identity - # msal-extensions -msal-extensions==0.3.1 - # via azure-identity -msgpack==1.0.3 - # via cachecontrol -msrest==0.6.21 - # via - # azure-storage-blob - # msrestazure -msrestazure==0.6.4 - # via adlfs -multidict==6.0.2 - # via - # aiohttp - # yarl -mypy==0.931 - # via feast (setup.py) -mypy-extensions==0.4.3 - # via - # mypy - # typing-inspect -mypy-protobuf==3.1.0 - # via feast (setup.py) -nbclient==0.5.10 - # via nbconvert -nbconvert==6.4.1 - # via notebook -nbformat==5.1.3 - # via - # ipywidgets - # nbclient - # nbconvert - # notebook 
-nest-asyncio==1.5.4 - # via - # ipykernel - # jupyter-client - # nbclient - # notebook -nodeenv==1.6.0 - # via pre-commit -notebook==6.4.8 - # via widgetsnbextension numpy==1.21.5 # via - # altair - # great-expectations # pandas # pandavro # pyarrow - # scipy -oauthlib==3.2.0 - # via requests-oauthlib -oscrypto==1.2.1 - # via snowflake-connector-python -packaging==21.3 - # via - # bleach - # dask - # deprecation - # google-api-core - # google-cloud-bigquery - # google-cloud-firestore - # pytest - # sphinx pandas==1.3.5 # via - # altair # feast (setup.py) - # great-expectations # pandavro - # snowflake-connector-python pandavro==1.5.2 # via feast (setup.py) -pandocfilters==1.5.0 - # via nbconvert -parso==0.8.3 - # via jedi -partd==1.2.0 - # via dask -pathspec==0.9.0 - # via black -pbr==5.8.0 - # via mock -pep517==0.12.0 - # via pip-tools -pexpect==4.8.0 - # via ipython -pickleshare==0.7.5 - # via ipython -pip-tools==6.4.0 - # via feast (setup.py) -platformdirs==2.4.1 - # via virtualenv -pluggy==1.0.0 - # via pytest -portalocker==2.3.2 - # via msal-extensions -pre-commit==2.17.0 - # via feast (setup.py) -prometheus-client==0.13.1 - # via notebook -prompt-toolkit==3.0.26 - # via ipython proto-plus==1.19.6 - # via - # feast (setup.py) - # google-cloud-bigquery - # google-cloud-bigquery-storage - # google-cloud-datastore - # google-cloud-firestore -protobuf==3.19.4 + # via feast (setup.py) +protobuf==3.19.3 # via # feast (setup.py) # google-api-core - # google-cloud-bigquery # googleapis-common-protos # grpcio-reflection - # grpcio-testing - # grpcio-tools - # mypy-protobuf # proto-plus # tensorflow-metadata -ptyprocess==0.7.0 - # via - # pexpect - # terminado -py==1.11.0 - # via - # pytest - # pytest-forked -py-cpuinfo==8.0.0 - # via pytest-benchmark -py4j==0.10.9.3 - # via pyspark pyarrow==6.0.1 - # via - # feast (setup.py) - # snowflake-connector-python + # via feast (setup.py) pyasn1==0.4.8 # via # pyasn1-modules # rsa pyasn1-modules==0.2.8 # via google-auth -pycodestyle==2.8.0 - # via flake8 -pycparser==2.21 - # via cffi -pycryptodomex==3.14.0 - # via snowflake-connector-python pydantic==1.9.0 # via # fastapi # feast (setup.py) -pyflakes==2.4.0 - # via flake8 -pygments==2.11.2 - # via - # ipython - # jupyterlab-pygments - # nbconvert - # sphinx -pyjwt[crypto]==2.3.0 - # via - # adal - # msal - # snowflake-connector-python -pyopenssl==21.0.0 - # via snowflake-connector-python -pyparsing==2.4.7 - # via - # great-expectations - # httplib2 - # packaging pyrsistent==0.18.1 # via jsonschema -pyspark==3.2.1 - # via feast (setup.py) -pytest==6.2.5 - # via - # feast (setup.py) - # pytest-benchmark - # pytest-cov - # pytest-forked - # pytest-lazy-fixture - # pytest-mock - # pytest-ordering - # pytest-timeout - # pytest-xdist -pytest-benchmark==3.4.1 - # via feast (setup.py) -pytest-cov==3.0.0 - # via feast (setup.py) -pytest-forked==1.4.0 - # via pytest-xdist -pytest-lazy-fixture==0.6.3 - # via feast (setup.py) -pytest-mock==1.10.4 - # via feast (setup.py) -pytest-ordering==0.6 - # via feast (setup.py) -pytest-timeout==1.4.2 - # via feast (setup.py) -pytest-xdist==2.5.0 - # via feast (setup.py) python-dateutil==2.8.2 - # via - # adal - # botocore - # google-cloud-bigquery - # great-expectations - # jupyter-client - # moto - # pandas + # via pandas python-dotenv==0.19.2 # via uvicorn pytz==2021.3 - # via - # babel - # google-api-core - # great-expectations - # moto - # pandas - # snowflake-connector-python -pytz-deprecation-shim==0.1.0.post0 - # via tzlocal + # via pandas pyyaml==6.0 # via - # dask 
# feast (setup.py) - # libcst - # pre-commit # uvicorn -pyzmq==22.3.0 - # via - # jupyter-client - # notebook -redis==3.5.3 - # via - # feast (setup.py) - # redis-py-cluster -redis-py-cluster==2.1.3 - # via feast (setup.py) -regex==2022.1.18 - # via black requests==2.27.1 - # via - # adal - # adlfs - # azure-core - # azure-datalake-store - # cachecontrol - # docker - # gcsfs - # google-api-core - # google-cloud-bigquery - # google-cloud-storage - # great-expectations - # moto - # msal - # msrest - # requests-oauthlib - # responses - # snowflake-connector-python - # sphinx -requests-oauthlib==1.3.1 - # via - # google-auth-oauthlib - # msrest -responses==0.17.0 - # via moto + # via google-api-core rsa==4.8 # via google-auth -ruamel.yaml==0.17.17 - # via great-expectations -ruamel.yaml.clib==0.2.6 - # via ruamel.yaml -s3transfer==0.5.0 - # via boto3 -scipy==1.7.3 - # via great-expectations -send2trash==1.8.0 - # via notebook six==1.16.0 # via # absl-py - # azure-core - # azure-identity - # bleach - # cryptography - # google-api-core # google-auth - # google-auth-httplib2 - # google-cloud-core - # google-resumable-media # grpcio - # isodate - # mock - # msrestazure # pandavro - # pyopenssl # python-dateutil - # responses - # virtualenv sniffio==1.2.0 # via anyio -snowballstemmer==2.2.0 - # via sphinx -snowflake-connector-python[pandas]==2.7.3 - # via feast (setup.py) -sphinx==4.3.2 - # via - # feast (setup.py) - # sphinx-rtd-theme -sphinx-rtd-theme==1.0.0 - # via feast (setup.py) -sphinxcontrib-applehelp==1.0.2 - # via sphinx -sphinxcontrib-devhelp==1.0.2 - # via sphinx -sphinxcontrib-htmlhelp==2.0.0 - # via sphinx -sphinxcontrib-jsmath==1.0.1 - # via sphinx -sphinxcontrib-qthelp==1.0.3 - # via sphinx -sphinxcontrib-serializinghtml==1.1.5 - # via sphinx starlette==0.17.1 # via fastapi tabulate==0.8.9 @@ -710,131 +131,24 @@ tenacity==8.0.1 # via feast (setup.py) tensorflow-metadata==1.6.0 # via feast (setup.py) -termcolor==1.1.0 - # via great-expectations -terminado==0.13.1 - # via notebook -testcontainers==3.4.2 - # via feast (setup.py) -testpath==0.5.0 - # via nbconvert toml==0.10.2 - # via - # black - # feast (setup.py) - # pre-commit - # pytest -tomli==2.0.0 - # via - # coverage - # mypy - # pep517 -toolz==0.11.2 - # via - # altair - # dask - # partd -tornado==6.1 - # via - # ipykernel - # jupyter-client - # notebook - # terminado -tqdm==4.62.3 - # via - # feast (setup.py) - # great-expectations -traitlets==5.1.1 - # via - # ipykernel - # ipython - # ipywidgets - # jupyter-client - # jupyter-core - # matplotlib-inline - # nbclient - # nbconvert - # nbformat - # notebook -typed-ast==1.5.2 - # via black -types-futures==3.3.8 - # via types-protobuf -types-protobuf==3.19.7 - # via - # feast (setup.py) - # mypy-protobuf -types-python-dateutil==2.8.9 - # via feast (setup.py) -types-pytz==2021.3.4 - # via feast (setup.py) -types-pyyaml==6.0.4 - # via feast (setup.py) -types-redis==4.1.13 # via feast (setup.py) -types-requests==2.27.8 - # via feast (setup.py) -types-setuptools==57.4.8 - # via feast (setup.py) -types-tabulate==0.8.5 +tqdm==4.62.3 # via feast (setup.py) -types-urllib3==1.26.8 - # via types-requests typing-extensions==4.0.1 - # via - # great-expectations - # libcst - # mypy - # pydantic - # typing-inspect -typing-inspect==0.7.1 - # via libcst -tzdata==2021.5 - # via pytz-deprecation-shim -tzlocal==4.1 - # via great-expectations -uritemplate==4.1.1 - # via google-api-python-client + # via pydantic urllib3==1.26.8 - # via - # botocore - # feast (setup.py) - # minio - # requests - # 
responses -uvicorn[standard]==0.17.1 + # via requests +uvicorn[standard]==0.17.0 # via feast (setup.py) uvloop==0.16.0 # via uvicorn -virtualenv==20.13.0 - # via pre-commit watchgod==0.7 # via uvicorn -wcwidth==0.2.5 - # via prompt-toolkit -webencodings==0.5.1 - # via bleach -websocket-client==1.2.3 - # via docker websockets==10.1 # via uvicorn -werkzeug==2.0.2 - # via moto -wheel==0.37.1 - # via pip-tools -widgetsnbextension==3.5.2 - # via ipywidgets -wrapt==1.13.3 - # via testcontainers -xmltodict==0.12.0 - # via moto -yarl==1.7.2 - # via aiohttp zipp==3.7.0 - # via - # importlib-metadata - # importlib-resources + # via importlib-resources # The following packages are considered to be unsafe in a requirements file: -# pip -# setuptools \ No newline at end of file +# setuptools