From 789d7a1dac0bf22f421de1ffbef2d351a09598e8 Mon Sep 17 00:00:00 2001
From: Achal Shah
Date: Wed, 18 Aug 2021 14:52:10 -0700
Subject: [PATCH 01/51] Fix API cruft from DataSourceCreator

Signed-off-by: Achal Shah
---
 .../universal/data_source_creator.py          |  2 +-
 .../universal/data_sources/bigquery.py        |  4 +--
 .../universal/data_sources/file.py            | 25 +++++++++++--------
 .../universal/data_sources/redshift.py        |  4 +--
 4 files changed, 20 insertions(+), 15 deletions(-)

diff --git a/sdk/python/tests/integration/feature_repos/universal/data_source_creator.py b/sdk/python/tests/integration/feature_repos/universal/data_source_creator.py
index fa5293c06d..6efadaa77f 100644
--- a/sdk/python/tests/integration/feature_repos/universal/data_source_creator.py
+++ b/sdk/python/tests/integration/feature_repos/universal/data_source_creator.py
@@ -28,5 +28,5 @@ def teardown(self):
         ...
 
     @abstractmethod
-    def get_prefixed_table_name(self, name: str, suffix: str) -> str:
+    def get_prefixed_table_name(self, table_name: str) -> str:
         ...
diff --git a/sdk/python/tests/integration/feature_repos/universal/data_sources/bigquery.py b/sdk/python/tests/integration/feature_repos/universal/data_sources/bigquery.py
index 74804e7512..070b87690f 100644
--- a/sdk/python/tests/integration/feature_repos/universal/data_sources/bigquery.py
+++ b/sdk/python/tests/integration/feature_repos/universal/data_sources/bigquery.py
@@ -69,5 +69,5 @@ def create_data_sources(
             field_mapping=field_mapping or {"ts_1": "ts"},
         )
 
-    def get_prefixed_table_name(self, name: str, suffix: str) -> str:
-        return f"{self.client.project}.{name}.{suffix}"
+    def get_prefixed_table_name(self, suffix: str) -> str:
+        return f"{self.client.project}.{self.project_name}.{suffix}"
diff --git a/sdk/python/tests/integration/feature_repos/universal/data_sources/file.py b/sdk/python/tests/integration/feature_repos/universal/data_sources/file.py
index b386c399dc..0c570e8a5f 100644
--- a/sdk/python/tests/integration/feature_repos/universal/data_sources/file.py
+++ b/sdk/python/tests/integration/feature_repos/universal/data_sources/file.py
@@ -1,5 +1,5 @@
 import tempfile
-from typing import Any, Dict
+from typing import Any, Dict, List
 
 import pandas as pd
 
@@ -14,10 +14,11 @@
 
 
 class FileDataSourceCreator(DataSourceCreator):
-    f: Any
+    files: List[Any]
 
-    def __init__(self, _: str):
-        pass
+    def __init__(self, project_name: str):
+        self.project_name = project_name
+        self.files = []
 
     def create_data_sources(
         self,
@@ -27,22 +28,26 @@ def create_data_sources(
         created_timestamp_column="created_ts",
         field_mapping: Dict[str, str] = None,
     ) -> DataSource:
-        self.f = tempfile.NamedTemporaryFile(suffix=".parquet", delete=False)
-        df.to_parquet(self.f.name)
+        f = tempfile.NamedTemporaryFile(
+            prefix=self.project_name, suffix=".parquet", delete=False
+        )
+        df.to_parquet(f.name)
+        self.files.append(f)
         return FileSource(
             file_format=ParquetFormat(),
-            path=f"file://{self.f.name}",
+            path=f"file://{f.name}",
             event_timestamp_column=event_timestamp_column,
             created_timestamp_column=created_timestamp_column,
             date_partition_column="",
             field_mapping=field_mapping or {"ts_1": "ts"},
         )
 
-    def get_prefixed_table_name(self, name: str, suffix: str) -> str:
-        return f"{name}.{suffix}"
+    def get_prefixed_table_name(self, suffix: str) -> str:
+        return f"{self.project_name}.{suffix}"
 
     def create_offline_store_config(self) -> FeastConfigBaseModel:
         return FileOfflineStoreConfig()
 
     def teardown(self):
-        self.f.close()
+        for f in self.files:
+            f.close()
diff --git a/sdk/python/tests/integration/feature_repos/universal/data_sources/redshift.py b/sdk/python/tests/integration/feature_repos/universal/data_sources/redshift.py
index f731b60bb3..3802a49510 100644
--- a/sdk/python/tests/integration/feature_repos/universal/data_sources/redshift.py
+++ b/sdk/python/tests/integration/feature_repos/universal/data_sources/redshift.py
@@ -65,8 +65,8 @@ def create_data_sources(
     def create_offline_store_config(self) -> FeastConfigBaseModel:
         return self.offline_store_config
 
-    def get_prefixed_table_name(self, name: str, suffix: str) -> str:
-        return f"{name}_{suffix}"
+    def get_prefixed_table_name(self, suffix: str) -> str:
+        return f"{self.project_name}_{suffix}"
 
     def teardown(self):
         for table in self.tables:
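The patch above narrows get_prefixed_table_name from a (name, suffix) pair to a single suffix argument, with the prefix coming from state the creator already holds. A minimal self-contained sketch of the resulting call shape — MiniCreator and the project name below are illustrative stand-ins, not part of the patch:

class MiniCreator:
    def __init__(self, project_name: str):
        # The prefix is captured once at construction instead of being
        # threaded through every call site.
        self.project_name = project_name

    def get_prefixed_table_name(self, suffix: str) -> str:
        # Mirrors the Redshift implementation's underscore-joined form.
        return f"{self.project_name}_{suffix}"

creator = MiniCreator("test_correctness_ab12cd34")
assert creator.get_prefixed_table_name("driver_hourly") == "test_correctness_ab12cd34_driver_hourly"

Dropping the redundant parameter also removes the possibility of callers passing a prefix that disagrees with the creator's own project.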
From 8b7eab57e994390949820b575efaa28807d03c5b Mon Sep 17 00:00:00 2001
From: Achal Shah
Date: Wed, 18 Aug 2021 15:16:13 -0700
Subject: [PATCH 02/51] Remove the need for get_prefixed_table_name

Signed-off-by: Achal Shah
---
 .../feature_repos/test_repo_configuration.py  | 22 +++++++------------
 .../universal/data_source_creator.py          |  5 +++--
 .../universal/data_sources/bigquery.py        |  9 ++++++--
 .../universal/data_sources/file.py            | 10 +++++++--
 .../universal/data_sources/redshift.py        |  9 ++++++--
 5 files changed, 33 insertions(+), 22 deletions(-)

diff --git a/sdk/python/tests/integration/feature_repos/test_repo_configuration.py b/sdk/python/tests/integration/feature_repos/test_repo_configuration.py
index f912f4dbdb..e18135520c 100644
--- a/sdk/python/tests/integration/feature_repos/test_repo_configuration.py
+++ b/sdk/python/tests/integration/feature_repos/test_repo_configuration.py
@@ -112,12 +112,9 @@ class Environment:
 
     def customer_feature_view(self) -> FeatureView:
         if self._customer_feature_view is None:
-            customer_table_id = self.data_source_creator.get_prefixed_table_name(
-                self.name, "customer_profile"
-            )
             ds = self.data_source_creator.create_data_sources(
-                customer_table_id,
                 self.customer_df,
+                suffix="customer_profile",
                 event_timestamp_column="event_timestamp",
                 created_timestamp_column="created",
             )
@@ -126,12 +123,9 @@ def customer_feature_view(self) -> FeatureView:
 
     def driver_stats_feature_view(self) -> FeatureView:
         if self._driver_stats_feature_view is None:
-            driver_table_id = self.data_source_creator.get_prefixed_table_name(
-                self.name, "driver_hourly"
-            )
             ds = self.data_source_creator.create_data_sources(
-                driver_table_id,
                 self.driver_df,
+                suffix="driver_hourly",
                 event_timestamp_column="event_timestamp",
                 created_timestamp_column="created",
             )
@@ -142,12 +136,9 @@ def driver_stats_feature_view(self) -> FeatureView:
 
     def orders_table(self) -> Optional[str]:
         if self._orders_table is None:
-            orders_table_id = self.data_source_creator.get_prefixed_table_name(
-                self.name, "orders"
-            )
             ds = self.data_source_creator.create_data_sources(
-                orders_table_id,
                 self.orders_df,
+                suffix="orders",
                 event_timestamp_column="event_timestamp",
                 created_timestamp_column="created",
             )
@@ -210,7 +201,6 @@ def construct_test_environment(
     :param test_repo_config: configuration
     :return: A feature store built using the supplied configuration.
     """
-    df = create_dataset()
 
     project = f"test_correctness_{str(uuid.uuid4()).replace('-', '')[:8]}"
 
@@ -221,9 +211,13 @@ def construct_test_environment(
     offline_creator: DataSourceCreator = importer.get_class_from_type(
         module_name, config_class_name, "DataSourceCreator"
    )(project)
+
+    # This needs to be abstracted away for test_e2e_universal which uses a different dataset.
+    df = create_dataset()
     ds = offline_creator.create_data_sources(
-        project, df, field_mapping={"ts_1": "ts", "id": "driver_id"}
+        df, destination=project, field_mapping={"ts_1": "ts", "id": "driver_id"}
     )
+
     offline_store = offline_creator.create_offline_store_config()
     online_store = test_repo_config.online_store
diff --git a/sdk/python/tests/integration/feature_repos/universal/data_source_creator.py b/sdk/python/tests/integration/feature_repos/universal/data_source_creator.py
index 6efadaa77f..6c038f1046 100644
--- a/sdk/python/tests/integration/feature_repos/universal/data_source_creator.py
+++ b/sdk/python/tests/integration/feature_repos/universal/data_source_creator.py
@@ -1,5 +1,5 @@
 from abc import ABC, abstractmethod
-from typing import Dict
+from typing import Dict, Optional
 
 import pandas as pd
 
@@ -11,8 +11,9 @@ class DataSourceCreator(ABC):
     @abstractmethod
     def create_data_sources(
         self,
-        destination: str,
         df: pd.DataFrame,
+        destination: Optional[str] = None,
+        suffix: Optional[str] = None,
         event_timestamp_column="ts",
         created_timestamp_column="created_ts",
         field_mapping: Dict[str, str] = None,
diff --git a/sdk/python/tests/integration/feature_repos/universal/data_sources/bigquery.py b/sdk/python/tests/integration/feature_repos/universal/data_sources/bigquery.py
index 070b87690f..288951c5e4 100644
--- a/sdk/python/tests/integration/feature_repos/universal/data_sources/bigquery.py
+++ b/sdk/python/tests/integration/feature_repos/universal/data_sources/bigquery.py
@@ -1,4 +1,4 @@
-from typing import Dict
+from typing import Dict, Optional
 
 import pandas as pd
 from google.cloud import bigquery
@@ -42,14 +42,19 @@ def create_offline_store_config(self):
 
     def create_data_sources(
         self,
-        destination: str,
         df: pd.DataFrame,
+        destination: Optional[str] = None,
+        suffix: Optional[str] = None,
         event_timestamp_column="ts",
         created_timestamp_column="created_ts",
         field_mapping: Dict[str, str] = None,
         **kwargs,
     ) -> DataSource:
 
+        assert destination or suffix
+        if not destination:
+            destination = self.get_prefixed_table_name(suffix)
+
         job_config = bigquery.LoadJobConfig()
         if self.gcp_project not in destination:
             destination = f"{self.gcp_project}.{self.project_name}.{destination}"
diff --git a/sdk/python/tests/integration/feature_repos/universal/data_sources/file.py b/sdk/python/tests/integration/feature_repos/universal/data_sources/file.py
index 0c570e8a5f..f212a5cfcd 100644
--- a/sdk/python/tests/integration/feature_repos/universal/data_sources/file.py
+++ b/sdk/python/tests/integration/feature_repos/universal/data_sources/file.py
@@ -1,5 +1,5 @@
 import tempfile
-from typing import Any, Dict, List
+from typing import Any, Dict, List, Optional
 
 import pandas as pd
 
@@ -22,12 +22,18 @@ def __init__(self, project_name: str):
 
     def create_data_sources(
         self,
-        destination: str,
         df: pd.DataFrame,
+        destination: Optional[str] = None,
+        suffix: Optional[str] = None,
        event_timestamp_column="ts",
         created_timestamp_column="created_ts",
         field_mapping: Dict[str, str] = None,
     ) -> DataSource:
+
+        assert destination or suffix
+        if not destination:
+            destination = self.get_prefixed_table_name(suffix)
+
         f = tempfile.NamedTemporaryFile(
             prefix=self.project_name, suffix=".parquet", delete=False
         )
diff --git a/sdk/python/tests/integration/feature_repos/universal/data_sources/redshift.py b/sdk/python/tests/integration/feature_repos/universal/data_sources/redshift.py
index 3802a49510..291ef89b27 100644
--- a/sdk/python/tests/integration/feature_repos/universal/data_sources/redshift.py
+++ b/sdk/python/tests/integration/feature_repos/universal/data_sources/redshift.py
@@ -1,4 +1,4 @@
-from typing import Dict
+from typing import Dict, Optional
 
 import pandas as pd
 
@@ -33,13 +33,18 @@ def __init__(self, project_name: str):
 
     def create_data_sources(
         self,
-        destination: str,
         df: pd.DataFrame,
+        destination: Optional[str] = None,
+        suffix: Optional[str] = None,
         event_timestamp_column="ts",
         created_timestamp_column="created_ts",
         field_mapping: Dict[str, str] = None,
     ) -> DataSource:
+
+        assert destination or suffix
+        if not destination:
+            destination = self.get_prefixed_table_name(suffix)
+
         aws_utils.upload_df_to_redshift(
             self.client,
             self.offline_store_config.cluster_id,

From 0cc21ab1dfefccc834d0b0733370b9d3944c3053 Mon Sep 17 00:00:00 2001
From: Achal Shah
Date: Wed, 18 Aug 2021 17:09:13 -0700
Subject: [PATCH 03/51] major refactor

Signed-off-by: Achal Shah
---
 .../feature_repos/test_repo_configuration.py  | 167 ++++++++++--------
 .../test_universal_historical_retrieval.py    |  20 ++-
 .../online_store/test_universal_online.py     |  10 +-
 3 files changed, 112 insertions(+), 85 deletions(-)

diff --git a/sdk/python/tests/integration/feature_repos/test_repo_configuration.py b/sdk/python/tests/integration/feature_repos/test_repo_configuration.py
index e18135520c..ac07215653 100644
--- a/sdk/python/tests/integration/feature_repos/test_repo_configuration.py
+++ b/sdk/python/tests/integration/feature_repos/test_repo_configuration.py
@@ -1,11 +1,12 @@
 import tempfile
 import uuid
 from contextlib import contextmanager
-from dataclasses import dataclass, replace
+from dataclasses import dataclass, field, replace
 from datetime import datetime, timedelta
 from pathlib import Path
-from typing import Dict, List, Optional, Union
+from typing import Any, Callable, Dict, List, Optional, Union
 
+import pandas as pd
 import pytest
 
 from feast import FeatureStore, FeatureView, RepoConfig, driver_test_data, importer
@@ -71,9 +72,63 @@ def ds_creator_path(cls: str):
 ]
 
 
-OFFLINE_STORES: List[str] = []
-ONLINE_STORES: List[str] = []
-PROVIDERS: List[str] = []
+def construct_entities() -> Dict[str, List[Any]]:
+    return {"customer": list(range(1001, 1110)), "driver": list(range(5001, 5110))}
+
+
+def construct_datasets(
+    entities: Dict[str, List[Any]], start_time: datetime, end_time: datetime
+) -> Dict[str, pd.DataFrame]:
+    customer_df = driver_test_data.create_customer_daily_profile_df(
+        entities["customer"], start_time, end_time
+    )
+    driver_df = driver_test_data.create_driver_hourly_stats_df(
+        entities["driver"], start_time, end_time
+    )
+    orders_df = driver_test_data.create_orders_df(
+        customers=entities["customer"],
+        drivers=entities["driver"],
+        start_date=end_time - timedelta(days=365),
+        end_date=end_time + timedelta(days=365),
+        order_count=1000,
+    )
+
+    return {"customer": customer_df, "driver": driver_df, "orders": orders_df}
+
+
+def construct_data_sources(
+    datasets: Dict[str, pd.DataFrame], data_source_creator: DataSourceCreator
+) -> Dict[str, DataSource]:
+    customer_ds = data_source_creator.create_data_sources(
+        datasets["customer"],
+        suffix="customer_profile",
+        event_timestamp_column="event_timestamp",
+        created_timestamp_column="created",
+    )
+    driver_ds = data_source_creator.create_data_sources(
+        datasets["driver"],
+        suffix="driver_hourly",
+        event_timestamp_column="event_timestamp",
+        created_timestamp_column="created",
+    )
+    orders_ds = data_source_creator.create_data_sources(
+        datasets["orders"],
+        suffix="orders",
+        event_timestamp_column="event_timestamp",
+        created_timestamp_column="created",
+    )
+    return {"customer": customer_ds, "driver": driver_ds, "orders": orders_ds}
+
+
+def construct_feature_views(
+    data_sources: Dict[str, DataSource]
+) -> Dict[str, FeatureView]:
+    return {
+        "customer": create_customer_daily_profile_feature_view(
+            data_sources["customer"]
+        ),
+        "driver": create_driver_hourly_stats_feature_view(data_sources["driver"]),
+    }
 
 
 @dataclass
@@ -84,69 +139,44 @@ class Environment:
     data_source: DataSource
     data_source_creator: DataSourceCreator
 
-    end_date = datetime.now().replace(microsecond=0, second=0, minute=0)
-    start_date = end_date - timedelta(days=7)
-    before_start_date = end_date - timedelta(days=365)
-    after_end_date = end_date + timedelta(days=365)
-
-    customer_entities = list(range(1001, 1110))
-    customer_df = driver_test_data.create_customer_daily_profile_df(
-        customer_entities, start_date, end_date
+    entites_creator: Callable[[], Dict[str, List[Any]]] = field(
+        default_factory=construct_entities
     )
-    _customer_feature_view: Optional[FeatureView] = None
-
-    driver_entities = list(range(5001, 5110))
-    driver_df = driver_test_data.create_driver_hourly_stats_df(
-        driver_entities, start_date, end_date
+    datasets_creator: Callable[
+        [Dict[str, List[Any]], datetime, datetime], Dict[str, pd.DataFrame]
+    ] = field(default=construct_datasets)
+    datasources_creator: Callable[
+        [Dict[str, pd.DataFrame]], Dict[str, DataSource]
+    ] = field(default=construct_data_sources)
+    feature_views_creator: Callable[
+        [Dict[str, DataSource]], Dict[str, FeatureView]
+    ] = field(default=construct_feature_views)
+
+    entites: Dict[str, List[Any]] = field(default_factory=dict)
+    datasets: Dict[str, pd.DataFrame] = field(default_factory=dict)
+    datasources: Dict[str, DataSource] = field(default_factory=dict)
+    feature_views: List[FeatureView] = field(default_factory=list)
+
+    end_date: datetime = field(
+        default=datetime.now().replace(microsecond=0, second=0, minute=0)
     )
-    _driver_stats_feature_view: Optional[FeatureView] = None
+    start_date: datetime = end_date - timedelta(days=7)
 
-    orders_df = driver_test_data.create_orders_df(
-        customers=customer_entities,
-        drivers=driver_entities,
-        start_date=before_start_date,
-        end_date=after_end_date,
-        order_count=1000,
-    )
-    _orders_table: Optional[str] = None
-
-    def customer_feature_view(self) -> FeatureView:
-        if self._customer_feature_view is None:
-            ds = self.data_source_creator.create_data_sources(
-                self.customer_df,
-                suffix="customer_profile",
-                event_timestamp_column="event_timestamp",
-                created_timestamp_column="created",
-            )
-            self._customer_feature_view = create_customer_daily_profile_feature_view(ds)
-        return self._customer_feature_view
-
-    def driver_stats_feature_view(self) -> FeatureView:
-        if self._driver_stats_feature_view is None:
-            ds = self.data_source_creator.create_data_sources(
-                self.driver_df,
-                suffix="driver_hourly",
-                event_timestamp_column="event_timestamp",
-                created_timestamp_column="created",
-            )
-            self._driver_stats_feature_view = create_driver_hourly_stats_feature_view(
-                ds
-            )
-        return self._driver_stats_feature_view
-
-    def orders_table(self) -> Optional[str]:
-        if self._orders_table is None:
-            ds = self.data_source_creator.create_data_sources(
-                self.orders_df,
-                suffix="orders",
-                event_timestamp_column="event_timestamp",
-                created_timestamp_column="created",
-            )
-            if hasattr(ds, "table_ref"):
-                self._orders_table = ds.table_ref
-            elif hasattr(ds, "table"):
-                self._orders_table = ds.table
-        return self._orders_table
+    def __post_init__(self):
+        self.entites = self.entites_creator()
+        self.datasets = self.datasets_creator(
+            self.entites, self.start_date, self.end_date
+        )
+        self.datasources = self.datasources_creator(self.datasets)
+        self.feature_views = self.feature_views_creator(self.datasources)
+
+
+def table_name_from_data_source(ds: DataSource) -> Optional[str]:
+    if hasattr(ds, "table_ref"):
+        return ds.table_ref
+    elif hasattr(ds, "table"):
+        return ds.table
+    return None
 
 
 def vary_full_feature_names(configs: List[TestRepoConfig]) -> List[TestRepoConfig]:
@@ -244,12 +274,7 @@ def construct_test_environment(
     try:
         if create_and_apply:
             entities.extend([driver(), customer()])
-            fvs.extend(
-                [
-                    environment.driver_stats_feature_view(),
-                    environment.customer_feature_view(),
-                ]
-            )
+            fvs.extend(environment.feature_views)
             fs.apply(fvs + entities)
 
         if materialize:
diff --git a/sdk/python/tests/integration/offline_store/test_universal_historical_retrieval.py b/sdk/python/tests/integration/offline_store/test_universal_historical_retrieval.py
index 7379c27a62..6199ebacf8 100644
--- a/sdk/python/tests/integration/offline_store/test_universal_historical_retrieval.py
+++ b/sdk/python/tests/integration/offline_store/test_universal_historical_retrieval.py
@@ -14,6 +14,7 @@
 from tests.integration.feature_repos.test_repo_configuration import (
     Environment,
     parametrize_offline_retrieval_test,
+    table_name_from_data_source,
 )
 
 np.random.seed(0)
@@ -139,20 +140,21 @@ def get_expected_training_df(
 def test_historical_features(environment: Environment):
     store = environment.feature_store
 
-    customer_df, customer_fv = (
-        environment.customer_df,
-        environment.customer_feature_view(),
+    customer_df, driver_df, orders_df = (
+        environment.datasets["customer"],
+        environment.datasets["driver"],
+        environment.datasets["orders"],
     )
-    driver_df, driver_fv = (
-        environment.driver_df,
-        environment.driver_stats_feature_view(),
+    customer_fv, driver_fv = (
+        environment.feature_views["customer"],
+        environment.feature_views["driver"],
     )
-    orders_df = environment.orders_df
     full_feature_names = environment.test_repo_config.full_feature_names
 
     entity_df_query = None
-    if environment.orders_table():
-        entity_df_query = f"SELECT * FROM {environment.orders_table()}"
+    if "orders" in environment.datasources:
+        orders_table = table_name_from_data_source(environment.datasources["orders"])
+        entity_df_query = f"SELECT * FROM {orders_table}"
 
     event_timestamp = (
         DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL
diff --git a/sdk/python/tests/integration/online_store/test_universal_online.py b/sdk/python/tests/integration/online_store/test_universal_online.py
index a4337a305c..1e3483d38e 100644
--- a/sdk/python/tests/integration/online_store/test_universal_online.py
+++ b/sdk/python/tests/integration/online_store/test_universal_online.py
@@ -14,14 +14,14 @@ def test_online_retrieval(environment: Environment):
     fs = environment.feature_store
     full_feature_names = environment.test_repo_config.full_feature_names
 
-    sample_drivers = random.sample(environment.driver_entities, 10)
-    drivers_df = environment.driver_df[
-        environment.driver_df["driver_id"].isin(sample_drivers)
+    sample_drivers = random.sample(environment.entites["driver"], 10)
+    drivers_df = environment.datasets["driver"][
+        environment.datasets["driver"]["driver_id"].isin(sample_drivers)
     ]
 
     sample_customers = random.sample(environment.customer_entities, 10)
-    customers_df = environment.customer_df[
-        environment.customer_df["customer_id"].isin(sample_customers)
+    customers_df = environment.datasets["customer"][
+        environment.datasets["customer"]["customer_id"].isin(sample_customers)
     ]
 
     entity_rows = [
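The refactor above replaces lazily built per-view attributes with plain dictionaries populated in __post_init__. A runnable toy version of that dataclass shape — MiniEnvironment is a hypothetical stand-in, not Feast code:

from dataclasses import dataclass, field
from datetime import datetime, timedelta
from typing import Dict, List

@dataclass
class MiniEnvironment:
    # Mutable containers need default_factory so each instance gets its own dict.
    entities: Dict[str, List[int]] = field(default_factory=dict)
    # default_factory here is evaluated per instance, not once at class definition.
    end_date: datetime = field(
        default_factory=lambda: datetime.now().replace(microsecond=0, second=0, minute=0)
    )

    def __post_init__(self):
        # Derived attributes are computed after the generated __init__ runs.
        self.start_date = self.end_date - timedelta(days=7)

env = MiniEnvironment(entities={"driver": [5001, 5002]})
assert env.start_date == env.end_date - timedelta(days=7)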
From 6ddd5b0482e44d4e79a7c5c9bc59c6fae8ba761b Mon Sep 17 00:00:00 2001
From: Achal Shah
Date: Wed, 18 Aug 2021 17:13:51 -0700
Subject: [PATCH 04/51] move start time

Signed-off-by: Achal Shah
---
 .../integration/feature_repos/test_repo_configuration.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/sdk/python/tests/integration/feature_repos/test_repo_configuration.py b/sdk/python/tests/integration/feature_repos/test_repo_configuration.py
index ac07215653..addd8c981e 100644
--- a/sdk/python/tests/integration/feature_repos/test_repo_configuration.py
+++ b/sdk/python/tests/integration/feature_repos/test_repo_configuration.py
@@ -155,14 +155,15 @@ class Environment:
     entites: Dict[str, List[Any]] = field(default_factory=dict)
     datasets: Dict[str, pd.DataFrame] = field(default_factory=dict)
     datasources: Dict[str, DataSource] = field(default_factory=dict)
-    feature_views: List[FeatureView] = field(default_factory=list)
+    feature_views: Dict[str, FeatureView] = field(default_factory=list)
 
     end_date: datetime = field(
         default=datetime.now().replace(microsecond=0, second=0, minute=0)
     )
-    start_date: datetime = end_date - timedelta(days=7)
 
     def __post_init__(self):
+        self.start_date: datetime = self.end_date - timedelta(days=7)
+
         self.entites = self.entites_creator()
         self.datasets = self.datasets_creator(
             self.entites, self.start_date, self.end_date

From a70074552377c237b840d228ff1c20dc48f9814a Mon Sep 17 00:00:00 2001
From: Achal Shah
Date: Wed, 18 Aug 2021 17:21:34 -0700
Subject: [PATCH 05/51] Remove one dimension of variation to be added in later

Signed-off-by: Achal Shah
---
 .../tests/integration/feature_repos/test_repo_configuration.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/sdk/python/tests/integration/feature_repos/test_repo_configuration.py b/sdk/python/tests/integration/feature_repos/test_repo_configuration.py
index addd8c981e..15e253f6eb 100644
--- a/sdk/python/tests/integration/feature_repos/test_repo_configuration.py
+++ b/sdk/python/tests/integration/feature_repos/test_repo_configuration.py
@@ -325,7 +325,6 @@ def parametrize_offline_retrieval_test(offline_retrieval_test):
 
     configs = vary_providers_for_offline_stores(FULL_REPO_CONFIGS)
     configs = vary_full_feature_names(configs)
-    configs = vary_infer_event_timestamp_col(configs)
 
     @pytest.mark.integration
     @pytest.mark.parametrize("config", configs, ids=lambda v: str(v))
@@ -350,7 +349,6 @@ def parametrize_online_test(online_test):
 
     configs = vary_providers_for_offline_stores(FULL_REPO_CONFIGS)
     configs = vary_full_feature_names(configs)
-    configs = vary_infer_event_timestamp_col(configs)
 
     @pytest.mark.integration
     @pytest.mark.parametrize("config", configs, ids=lambda v: str(v))

From c25b6cbaf2cf6de6c32b7fcd01d4b8b70ed1770e Mon Sep 17 00:00:00 2001
From: Achal Shah
Date: Wed, 18 Aug 2021 17:32:28 -0700
Subject: [PATCH 06/51] Fix default

Signed-off-by: Achal Shah
---
 .../tests/integration/feature_repos/test_repo_configuration.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sdk/python/tests/integration/feature_repos/test_repo_configuration.py b/sdk/python/tests/integration/feature_repos/test_repo_configuration.py
index 15e253f6eb..5f63ccd94b 100644
--- a/sdk/python/tests/integration/feature_repos/test_repo_configuration.py
+++ b/sdk/python/tests/integration/feature_repos/test_repo_configuration.py
@@ -140,7 +140,7 @@ class Environment:
     data_source_creator: DataSourceCreator
 
     entites_creator: Callable[[], Dict[str, List[Any]]] = field(
-        default_factory=construct_entities
+        default=construct_entities
     )
     datasets_creator: Callable[
         [Dict[str, List[Any]], datetime, datetime], Dict[str, pd.DataFrame]
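The "Fix default" patch hinges on a dataclasses detail: field(default_factory=f) calls f() during __init__ and stores the result, while field(default=f) stores the callable itself — which is what a *_creator field needs. A self-contained demonstration (make_entities and Holder are illustrative names, not Feast code):

from dataclasses import dataclass, field
from typing import Callable, Dict, List

def make_entities() -> Dict[str, List[int]]:
    return {"driver": [5001, 5002]}

@dataclass
class Holder:
    # default_factory is CALLED at __init__ time, so this field holds the dict:
    built: Dict[str, List[int]] = field(default_factory=make_entities)
    # default stores the object as-is, so this field holds the function itself:
    builder: Callable[[], Dict[str, List[int]]] = field(default=make_entities)

h = Holder()
assert isinstance(h.built, dict)
assert callable(h.builder) and h.builder() == h.built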
From a3e44739c29b4f69dd6080c672295bfb39ca1d79 Mon Sep 17 00:00:00 2001
From: Achal Shah
Date: Wed, 18 Aug 2021 21:31:50 -0700
Subject: [PATCH 07/51] Fixups

Signed-off-by: Achal Shah
---
 .../feature_repos/test_repo_configuration.py  | 14 ++++++++------
 .../online_store/test_universal_online.py     |  2 +-
 2 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/sdk/python/tests/integration/feature_repos/test_repo_configuration.py b/sdk/python/tests/integration/feature_repos/test_repo_configuration.py
index 5f63ccd94b..caa180e39f 100644
--- a/sdk/python/tests/integration/feature_repos/test_repo_configuration.py
+++ b/sdk/python/tests/integration/feature_repos/test_repo_configuration.py
@@ -139,20 +139,20 @@ class Environment:
     data_source: DataSource
     data_source_creator: DataSourceCreator
 
-    entites_creator: Callable[[], Dict[str, List[Any]]] = field(
+    entities_creator: Callable[[], Dict[str, List[Any]]] = field(
         default=construct_entities
     )
     datasets_creator: Callable[
         [Dict[str, List[Any]], datetime, datetime], Dict[str, pd.DataFrame]
     ] = field(default=construct_datasets)
     datasources_creator: Callable[
-        [Dict[str, pd.DataFrame]], Dict[str, DataSource]
+        [Dict[str, pd.DataFrame], DataSourceCreator], Dict[str, DataSource]
     ] = field(default=construct_data_sources)
     feature_views_creator: Callable[
         [Dict[str, DataSource]], Dict[str, FeatureView]
     ] = field(default=construct_feature_views)
 
-    entites: Dict[str, List[Any]] = field(default_factory=dict)
+    entities: Dict[str, List[Any]] = field(default_factory=dict)
     datasets: Dict[str, pd.DataFrame] = field(default_factory=dict)
     datasources: Dict[str, DataSource] = field(default_factory=dict)
     feature_views: Dict[str, FeatureView] = field(default_factory=list)
@@ -164,11 +164,13 @@ class Environment:
     def __post_init__(self):
         self.start_date: datetime = self.end_date - timedelta(days=7)
 
-        self.entites = self.entites_creator()
+        self.entities = self.entities_creator()
         self.datasets = self.datasets_creator(
-            self.entites, self.start_date, self.end_date
+            self.entities, self.start_date, self.end_date
+        )
+        self.datasources = self.datasources_creator(
+            self.datasets, self.data_source_creator
         )
-        self.datasources = self.datasources_creator(self.datasets)
         self.feature_views = self.feature_views_creator(self.datasources)
 
diff --git a/sdk/python/tests/integration/online_store/test_universal_online.py b/sdk/python/tests/integration/online_store/test_universal_online.py
index 1e3483d38e..52973207f6 100644
--- a/sdk/python/tests/integration/online_store/test_universal_online.py
+++ b/sdk/python/tests/integration/online_store/test_universal_online.py
@@ -14,7 +14,7 @@ def test_online_retrieval(environment: Environment):
     fs = environment.feature_store
     full_feature_names = environment.test_repo_config.full_feature_names
 
-    sample_drivers = random.sample(environment.entites["driver"], 10)
+    sample_drivers = random.sample(environment.entities["driver"], 10)
     drivers_df = environment.datasets["driver"][
         environment.datasets["driver"]["driver_id"].isin(sample_drivers)
     ]
From 3f632fd670379bae3fb382c74b1af5d8135c4d10 Mon Sep 17 00:00:00 2001
From: Achal Shah
Date: Wed, 18 Aug 2021 22:42:29 -0700
Subject: [PATCH 08/51] Fixups

Signed-off-by: Achal Shah
---
 .../tests/integration/feature_repos/test_repo_configuration.py | 2 +-
 .../tests/integration/online_store/test_universal_online.py    | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/sdk/python/tests/integration/feature_repos/test_repo_configuration.py b/sdk/python/tests/integration/feature_repos/test_repo_configuration.py
index caa180e39f..6215986178 100644
--- a/sdk/python/tests/integration/feature_repos/test_repo_configuration.py
+++ b/sdk/python/tests/integration/feature_repos/test_repo_configuration.py
@@ -277,7 +277,7 @@ def construct_test_environment(
     try:
         if create_and_apply:
             entities.extend([driver(), customer()])
-            fvs.extend(environment.feature_views)
+            fvs.extend(environment.feature_views.values())
             fs.apply(fvs + entities)
 
         if materialize:
diff --git a/sdk/python/tests/integration/online_store/test_universal_online.py b/sdk/python/tests/integration/online_store/test_universal_online.py
index 52973207f6..46b84130b8 100644
--- a/sdk/python/tests/integration/online_store/test_universal_online.py
+++ b/sdk/python/tests/integration/online_store/test_universal_online.py
@@ -19,7 +19,7 @@ def test_online_retrieval(environment: Environment):
         environment.datasets["driver"]["driver_id"].isin(sample_drivers)
     ]
 
-    sample_customers = random.sample(environment.customer_entities, 10)
+    sample_customers = random.sample(environment.entities["customer"], 10)
     customers_df = environment.datasets["customer"][
         environment.datasets["customer"]["customer_id"].isin(sample_customers)
     ]

From 9aeea86a07a81ee7e426e76b3b4e9d98dbfe2ea1 Mon Sep 17 00:00:00 2001
From: Achal Shah
Date: Thu, 19 Aug 2021 07:31:26 -0700
Subject: [PATCH 09/51] Fix up tests

Signed-off-by: Achal Shah
---
 .../offline_store/test_universal_historical_retrieval.py     | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sdk/python/tests/integration/offline_store/test_universal_historical_retrieval.py b/sdk/python/tests/integration/offline_store/test_universal_historical_retrieval.py
index 6199ebacf8..b00818a078 100644
--- a/sdk/python/tests/integration/offline_store/test_universal_historical_retrieval.py
+++ b/sdk/python/tests/integration/offline_store/test_universal_historical_retrieval.py
@@ -152,8 +152,8 @@ def test_historical_features(environment: Environment):
     full_feature_names = environment.test_repo_config.full_feature_names
 
     entity_df_query = None
-    if "orders" in environment.datasources:
-        orders_table = table_name_from_data_source(environment.datasources["orders"])
+    orders_table = table_name_from_data_source(environment.datasources["orders"])
+    if orders_table:
         entity_df_query = f"SELECT * FROM {orders_table}"
 
     event_timestamp = (
From 84cbefde065922b1d8fbb858bf187bfc9f370015 Mon Sep 17 00:00:00 2001
From: Achal Shah
Date: Thu, 19 Aug 2021 10:02:26 -0700
Subject: [PATCH 10/51] Add retries to execute_redshift_statement_async

Signed-off-by: Achal Shah
---
 sdk/python/feast/infra/utils/aws_utils.py | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/sdk/python/feast/infra/utils/aws_utils.py b/sdk/python/feast/infra/utils/aws_utils.py
index aea460cfb8..81d42dbf38 100644
--- a/sdk/python/feast/infra/utils/aws_utils.py
+++ b/sdk/python/feast/infra/utils/aws_utils.py
@@ -7,7 +7,12 @@
 import pandas as pd
 import pyarrow as pa
 import pyarrow.parquet as pq
-from tenacity import retry, retry_if_exception_type, wait_exponential
+from tenacity import (
+    retry,
+    retry_if_exception_type,
+    stop_after_attempt,
+    wait_exponential,
+)
 
 from feast.errors import RedshiftCredentialsError, RedshiftQueryError
 from feast.type_map import pa_to_redshift_value_type
@@ -15,7 +20,7 @@
 try:
     import boto3
     from botocore.config import Config
-    from botocore.exceptions import ClientError
+    from botocore.exceptions import ClientError, ConnectionClosedError
 except ImportError as e:
     from feast.errors import FeastExtrasDependencyImportError
 
@@ -50,6 +55,11 @@ def get_bucket_and_key(s3_path: str) -> Tuple[str, str]:
     return bucket, key
 
 
+@retry(
+    wait=wait_exponential(multiplier=0.1, max=30),
+    retry=retry_if_exception_type(ConnectionClosedError),
+    stop=stop_after_attempt(3),
+)
 def execute_redshift_statement_async(
     redshift_data_client, cluster_id: str, database: str, user: str, query: str
 ) -> dict:
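The decorator added above is tenacity's standard bounded-retry shape: exponential backoff, a predicate on the exception type, and a hard attempt cap. A self-contained demo of the same policy with a stand-in exception — TransientError and flaky_call are hypothetical, not Feast or botocore code:

from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_exponential

class TransientError(Exception):
    pass

attempts = {"n": 0}

@retry(
    wait=wait_exponential(multiplier=0.1, max=30),   # backoff roughly doubles, capped at 30s
    retry=retry_if_exception_type(TransientError),   # only this exception type is retried
    stop=stop_after_attempt(3),                      # give up after three attempts total
)
def flaky_call() -> str:
    attempts["n"] += 1
    if attempts["n"] < 3:
        raise TransientError("connection closed")
    return "ok"

assert flaky_call() == "ok"
assert attempts["n"] == 3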
From fe180eb6dbf7f19c88efc0c4db877939f420c362 Mon Sep 17 00:00:00 2001
From: Achal Shah
Date: Thu, 19 Aug 2021 10:26:23 -0700
Subject: [PATCH 11/51] Add retries to execute_redshift_statement_async

Signed-off-by: Achal Shah
---
 sdk/python/feast/infra/utils/aws_utils.py     |  4 +--
 .../feature_repos/test_repo_configuration.py  | 26 ++++++++++---------
 2 files changed, 16 insertions(+), 14 deletions(-)

diff --git a/sdk/python/feast/infra/utils/aws_utils.py b/sdk/python/feast/infra/utils/aws_utils.py
index 81d42dbf38..2567074525 100644
--- a/sdk/python/feast/infra/utils/aws_utils.py
+++ b/sdk/python/feast/infra/utils/aws_utils.py
@@ -56,7 +56,7 @@ def get_bucket_and_key(s3_path: str) -> Tuple[str, str]:
 
 
 @retry(
-    wait=wait_exponential(multiplier=0.1, max=30),
+    wait=wait_exponential(multiplier=1, max=30),
     retry=retry_if_exception_type(ConnectionClosedError),
     stop=stop_after_attempt(3),
 )
@@ -92,7 +92,7 @@ class RedshiftStatementNotFinishedError(Exception):
 
 
 @retry(
-    wait=wait_exponential(multiplier=0.1, max=30),
+    wait=wait_exponential(multiplier=1, max=30),
     retry=retry_if_exception_type(RedshiftStatementNotFinishedError),
 )
 def wait_for_redshift_statement(redshift_data_client, statement: dict) -> None:
diff --git a/sdk/python/tests/integration/feature_repos/test_repo_configuration.py b/sdk/python/tests/integration/feature_repos/test_repo_configuration.py
index 6215986178..840d212cee 100644
--- a/sdk/python/tests/integration/feature_repos/test_repo_configuration.py
+++ b/sdk/python/tests/integration/feature_repos/test_repo_configuration.py
@@ -72,11 +72,11 @@ def ds_creator_path(cls: str):
 ]
 
 
-def construct_entities() -> Dict[str, List[Any]]:
+def construct_universal_entities() -> Dict[str, List[Any]]:
     return {"customer": list(range(1001, 1110)), "driver": list(range(5001, 5110))}
 
 
-def construct_datasets(
+def construct_universal_datasets(
     entities: Dict[str, List[Any]], start_time: datetime, end_time: datetime
 ) -> Dict[str, pd.DataFrame]:
     customer_df = driver_test_data.create_customer_daily_profile_df(
@@ -96,7 +96,7 @@ def construct_datasets(
     return {"customer": customer_df, "driver": driver_df, "orders": orders_df}
 
 
-def construct_data_sources(
+def construct_universal_data_sources(
     datasets: Dict[str, pd.DataFrame], data_source_creator: DataSourceCreator
 ) -> Dict[str, DataSource]:
     customer_ds = data_source_creator.create_data_sources(
@@ -120,7 +120,7 @@ def construct_data_sources(
     return {"customer": customer_ds, "driver": driver_ds, "orders": orders_ds}
 
 
-def construct_feature_views(
+def construct_universal_feature_views(
     data_sources: Dict[str, DataSource]
 ) -> Dict[str, FeatureView]:
     return {
@@ -140,17 +140,17 @@ class Environment:
     data_source_creator: DataSourceCreator
 
     entities_creator: Callable[[], Dict[str, List[Any]]] = field(
-        default=construct_entities
+        default=construct_universal_entities
     )
     datasets_creator: Callable[
         [Dict[str, List[Any]], datetime, datetime], Dict[str, pd.DataFrame]
-    ] = field(default=construct_datasets)
+    ] = field(default=construct_universal_datasets)
     datasources_creator: Callable[
         [Dict[str, pd.DataFrame], DataSourceCreator], Dict[str, DataSource]
-    ] = field(default=construct_data_sources)
+    ] = field(default=construct_universal_data_sources)
     feature_views_creator: Callable[
         [Dict[str, DataSource]], Dict[str, FeatureView]
-    ] = field(default=construct_feature_views)
+    ] = field(default=construct_universal_feature_views)
 
     entities: Dict[str, List[Any]] = field(default_factory=dict)
     datasets: Dict[str, pd.DataFrame] = field(default_factory=dict)
@@ -219,7 +219,7 @@ def vary_providers_for_offline_stores(
 
 
 @contextmanager
-def construct_test_environment(
+def construct_universal_test_environment(
     test_repo_config: TestRepoConfig,
     create_and_apply: bool = False,
     materialize: bool = False,
@@ -305,7 +305,7 @@ def parametrize_e2e_test(e2e_test):
     @pytest.mark.integration
     @pytest.mark.parametrize("config", FULL_REPO_CONFIGS, ids=lambda v: str(v))
     def inner_test(config):
-        with construct_test_environment(config) as environment:
+        with construct_universal_test_environment(config) as environment:
             e2e_test(environment)
 
     return inner_test
@@ -331,7 +331,9 @@ def parametrize_offline_retrieval_test(offline_retrieval_test):
     @pytest.mark.integration
     @pytest.mark.parametrize("config", configs, ids=lambda v: str(v))
     def inner_test(config):
-        with construct_test_environment(config, create_and_apply=True) as environment:
+        with construct_universal_test_environment(
+            config, create_and_apply=True
+        ) as environment:
             offline_retrieval_test(environment)
 
     return inner_test
@@ -355,7 +357,7 @@ def parametrize_online_test(online_test):
     @pytest.mark.integration
     @pytest.mark.parametrize("config", configs, ids=lambda v: str(v))
     def inner_test(config):
-        with construct_test_environment(
+        with construct_universal_test_environment(
             config, create_and_apply=True, materialize=True
         ) as environment:
             online_test(environment)
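The second decorator touched above has a retry= predicate but no stop= clause, so tenacity keeps waiting until the wrapped function stops raising — effectively a polling loop with backoff. A runnable stand-in for that wait_for_redshift_statement pattern (NotFinishedError and wait_until_done are illustrative names):

from tenacity import retry, retry_if_exception_type, wait_exponential

class NotFinishedError(Exception):
    pass

state = {"polls": 0}

@retry(
    wait=wait_exponential(multiplier=0.01, max=0.05),  # tiny waits so the demo runs fast
    retry=retry_if_exception_type(NotFinishedError),   # no stop=: poll until it succeeds
)
def wait_until_done() -> str:
    state["polls"] += 1
    if state["polls"] < 4:
        raise NotFinishedError()   # statement still running: signal "poll again"
    return "FINISHED"

assert wait_until_done() == "FINISHED"
assert state["polls"] == 4

Leaving out stop= is deliberate for a poller: the statement is expected to finish eventually, and an attempt cap would turn slow queries into spurious failures.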
From 0cd08d2d8a6ddb7707a91fa57613b72efc78b116 Mon Sep 17 00:00:00 2001
From: Achal Shah
Date: Thu, 19 Aug 2021 10:57:29 -0700
Subject: [PATCH 12/51] refactoooor

Signed-off-by: Achal Shah
---
 .../feature_repos/test_repo_configuration.py  | 71 ++++++++++++++-----
 1 file changed, 53 insertions(+), 18 deletions(-)

diff --git a/sdk/python/tests/integration/feature_repos/test_repo_configuration.py b/sdk/python/tests/integration/feature_repos/test_repo_configuration.py
index 840d212cee..a345d2708a 100644
--- a/sdk/python/tests/integration/feature_repos/test_repo_configuration.py
+++ b/sdk/python/tests/integration/feature_repos/test_repo_configuration.py
@@ -131,6 +131,55 @@ def construct_universal_feature_views(
     }
 
 
+def setup_entities(
+    environment: "Environment", entities_override: Optional[Dict[str, List[Any]]] = None
+) -> "Environment":
+    environment.entities = (
+        entities_override if entities_override else construct_universal_entities()
+    )
+    return environment
+
+
+def setup_datasets(
+    environment: "Environment",
+    datasets_override: Optional[Dict[str, pd.DataFrame]] = None,
+) -> "Environment":
+    environment.datasets = (
+        datasets_override
+        if datasets_override
+        else construct_universal_datasets(
+            environment.entities, environment.start_date, environment.end_date
+        )
+    )
+    return environment
+
+
+def setup_data_sources(
+    environment: "Environment",
+    data_sources_override: Optional[Dict[str, DataSource]] = None,
+) -> "Environment":
+    environment.datasources = (
+        data_sources_override
+        if data_sources_override
+        else construct_universal_data_sources(
+            environment.datasets, environment.data_source_creator
+        )
+    )
+    return environment
+
+
+def setup_feature_views(
+    environment: "Environment",
+    feature_views_override: Optional[Dict[str, FeatureView]] = None,
+) -> "Environment":
+    environment.feature_views = (
+        feature_views_override
+        if feature_views_override
+        else construct_universal_feature_views(environment.datasources)
+    )
+    return environment
+
+
 @dataclass
 class Environment:
     name: str
@@ -142,15 +191,6 @@ class Environment:
     entities_creator: Callable[[], Dict[str, List[Any]]] = field(
         default=construct_universal_entities
     )
-    datasets_creator: Callable[
-        [Dict[str, List[Any]], datetime, datetime], Dict[str, pd.DataFrame]
-    ] = field(default=construct_universal_datasets)
-    datasources_creator: Callable[
-        [Dict[str, pd.DataFrame], DataSourceCreator], Dict[str, DataSource]
-    ] = field(default=construct_universal_data_sources)
-    feature_views_creator: Callable[
-        [Dict[str, DataSource]], Dict[str, FeatureView]
-    ] = field(default=construct_universal_feature_views)
 
     entities: Dict[str, List[Any]] = field(default_factory=dict)
     datasets: Dict[str, pd.DataFrame] = field(default_factory=dict)
@@ -164,15 +204,6 @@ class Environment:
     def __post_init__(self):
         self.start_date: datetime = self.end_date - timedelta(days=7)
 
-        self.entities = self.entities_creator()
-        self.datasets = self.datasets_creator(
-            self.entities, self.start_date, self.end_date
-        )
-        self.datasources = self.datasources_creator(
-            self.datasets, self.data_source_creator
-        )
-        self.feature_views = self.feature_views_creator(self.datasources)
-
 
 def table_name_from_data_source(ds: DataSource) -> Optional[str]:
     if hasattr(ds, "table_ref"):
@@ -271,6 +302,10 @@ def construct_universal_test_environment(
         data_source=ds,
         data_source_creator=offline_creator,
     )
+    environment = setup_entities(environment)
+    environment = setup_datasets(environment)
+    environment = setup_data_sources(environment)
+    environment = setup_feature_views(environment)
 
     fvs = []
     entities = []
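Each setup_* helper above follows the same "use the override if supplied, otherwise build the default" shape. A generic, self-contained version of that pattern — resolve is a hypothetical name, and note the real helpers test truthiness rather than comparing against None:

from typing import Callable, Optional, TypeVar

T = TypeVar("T")

def resolve(override: Optional[T], build_default: Callable[[], T]) -> T:
    # Use the caller-supplied value when given, otherwise construct the default.
    return override if override is not None else build_default()

assert resolve(None, lambda: {"driver": [5001]}) == {"driver": [5001]}
assert resolve({"driver": [1]}, lambda: {"driver": [5001]}) == {"driver": [1]}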
From 4701183121e148c7f3a8dffbaa8babf58c094f28 Mon Sep 17 00:00:00 2001
From: Achal Shah
Date: Thu, 19 Aug 2021 10:59:34 -0700
Subject: [PATCH 13/51] remove retries

Signed-off-by: Achal Shah
---
 sdk/python/feast/infra/utils/aws_utils.py | 14 ++------------
 1 file changed, 2 insertions(+), 12 deletions(-)

diff --git a/sdk/python/feast/infra/utils/aws_utils.py b/sdk/python/feast/infra/utils/aws_utils.py
index 2567074525..84fa611c15 100644
--- a/sdk/python/feast/infra/utils/aws_utils.py
+++ b/sdk/python/feast/infra/utils/aws_utils.py
@@ -7,12 +7,7 @@
 import pandas as pd
 import pyarrow as pa
 import pyarrow.parquet as pq
-from tenacity import (
-    retry,
-    retry_if_exception_type,
-    stop_after_attempt,
-    wait_exponential,
-)
+from tenacity import retry, retry_if_exception_type, wait_exponential
 
 from feast.errors import RedshiftCredentialsError, RedshiftQueryError
 from feast.type_map import pa_to_redshift_value_type
@@ -20,7 +15,7 @@
 try:
     import boto3
     from botocore.config import Config
-    from botocore.exceptions import ClientError, ConnectionClosedError
+    from botocore.exceptions import ClientError
 except ImportError as e:
     from feast.errors import FeastExtrasDependencyImportError
 
@@ -55,11 +50,6 @@ def get_bucket_and_key(s3_path: str) -> Tuple[str, str]:
     return bucket, key
 
 
-@retry(
-    wait=wait_exponential(multiplier=1, max=30),
-    retry=retry_if_exception_type(ConnectionClosedError),
-    stop=stop_after_attempt(3),
-)
 def execute_redshift_statement_async(
     redshift_data_client, cluster_id: str, database: str, user: str, query: str
 ) -> dict:

From 4cde2847549b618bb961d2cc1845313e1a3706e9 Mon Sep 17 00:00:00 2001
From: Achal Shah
Date: Fri, 20 Aug 2021 12:51:42 -0700
Subject: [PATCH 14/51] Remove provider variation since they don't really play
 a big role

Signed-off-by: Achal Shah
---
 .../integration/feature_repos/test_repo_configuration.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/sdk/python/tests/integration/feature_repos/test_repo_configuration.py b/sdk/python/tests/integration/feature_repos/test_repo_configuration.py
index a345d2708a..cda1f62250 100644
--- a/sdk/python/tests/integration/feature_repos/test_repo_configuration.py
+++ b/sdk/python/tests/integration/feature_repos/test_repo_configuration.py
@@ -360,8 +360,7 @@ def parametrize_offline_retrieval_test(offline_retrieval_test):
     The decorator takes care of tearing down the feature store, as well as the sample data.
     """
 
-    configs = vary_providers_for_offline_stores(FULL_REPO_CONFIGS)
-    configs = vary_full_feature_names(configs)
+    configs = vary_full_feature_names(FULL_REPO_CONFIGS)
 
     @pytest.mark.integration
     @pytest.mark.parametrize("config", configs, ids=lambda v: str(v))
@@ -386,8 +385,7 @@ def parametrize_online_test(online_test):
     The decorator takes care of tearing down the feature store, as well as the sample data.
     """
 
-    configs = vary_providers_for_offline_stores(FULL_REPO_CONFIGS)
-    configs = vary_full_feature_names(configs)
+    configs = vary_full_feature_names(FULL_REPO_CONFIGS)
 
     @pytest.mark.integration
     @pytest.mark.parametrize("config", configs, ids=lambda v: str(v))
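vary_full_feature_names, the one variation kept above, expands a config list by toggling a single flag via dataclasses.replace. A minimal standalone version of that expansion idea — Config here is a stand-in for TestRepoConfig, not the real class:

from dataclasses import dataclass, replace
from typing import List

@dataclass(frozen=True)
class Config:
    provider: str
    full_feature_names: bool = True

def vary_flag(configs: List[Config]) -> List[Config]:
    # Each input config becomes two: one per value of the toggled flag.
    return [replace(c, full_feature_names=flag) for c in configs for flag in (True, False)]

matrix = vary_flag([Config("local"), Config("gcp")])
assert len(matrix) == 4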
From 974bb0bae8a77137b1742dff0dddd56f46856251 Mon Sep 17 00:00:00 2001
From: Achal Shah
Date: Mon, 23 Aug 2021 09:50:59 -0700
Subject: [PATCH 15/51] Session scoped cache for test datasets and skipping
 older tests whose functionality is present in other universal tests

Signed-off-by: Achal Shah
---
 sdk/python/tests/conftest.py                  | 22 +++++++++
 .../feature_repos/test_repo_configuration.py  | 45 ++++++++++++++-----
 .../test_offline_online_store_consistency.py  |  3 ++
 .../test_historical_retrieval.py              |  6 +++
 4 files changed, 64 insertions(+), 12 deletions(-)

diff --git a/sdk/python/tests/conftest.py b/sdk/python/tests/conftest.py
index 0c94f4d57a..ac5598006b 100644
--- a/sdk/python/tests/conftest.py
+++ b/sdk/python/tests/conftest.py
@@ -87,3 +87,25 @@ def simple_dataset_2() -> pd.DataFrame:
         ],
     }
     return pd.DataFrame.from_dict(data)
+
+
+class DataSourceCache:
+    cache = {}
+
+    def get(self, test_repo_config):
+        return self.cache.get(test_repo_config.offline_store_creator, None)
+
+    def put(self, test_repo_config,
+            entites,
+            datasets,
+            data_sources,
+            data_source_creator):
+        self.cache[test_repo_config.offline_store_creator] = (entites, datasets, data_sources, data_source_creator)
+
+
+@pytest.fixture(scope="session", autouse=True)
+def data_source_cache():
+    dsc = DataSourceCache()
+    yield dsc
+    for _, v in dsc.cache.items():
+        v[3].teardown()
diff --git a/sdk/python/tests/integration/feature_repos/test_repo_configuration.py b/sdk/python/tests/integration/feature_repos/test_repo_configuration.py
index cda1f62250..6cc4a88815 100644
--- a/sdk/python/tests/integration/feature_repos/test_repo_configuration.py
+++ b/sdk/python/tests/integration/feature_repos/test_repo_configuration.py
@@ -4,7 +4,7 @@
 from dataclasses import dataclass, field, replace
 from datetime import datetime, timedelta
 from pathlib import Path
-from typing import Any, Callable, Dict, List, Optional, Union
+from typing import Any, Callable, Dict, List, Optional, Union, Tuple
 
 import pandas as pd
 import pytest
@@ -254,6 +254,7 @@ def construct_universal_test_environment(
     test_repo_config: TestRepoConfig,
     create_and_apply: bool = False,
     materialize: bool = False,
+    data_source_cache = None
 ) -> Environment:
     """
     This method should take in the parameters from the test repo config and created a feature repo, apply it,
@@ -302,9 +303,27 @@ def construct_universal_test_environment(
         data_source=ds,
         data_source_creator=offline_creator,
     )
-    environment = setup_entities(environment)
-    environment = setup_datasets(environment)
-    environment = setup_data_sources(environment)
+    print(f"Data Source Cache: {data_source_cache}")
+    if data_source_cache:
+        fixtures = data_source_cache.get(test_repo_config)
+        print(f"Fixtures: {fixtures}")
+        if fixtures:
+            environment = setup_entities(environment, entities_override=fixtures[0])
+            environment = setup_datasets(environment, datasets_override=fixtures[1])
+            environment = setup_data_sources(environment, data_sources_override=fixtures[2])
+        else:
+            environment = setup_entities(environment)
+            environment = setup_datasets(environment)
+            environment = setup_data_sources(environment)
+            data_source_cache.put(test_repo_config, environment.entities,
+                                  environment.datasets,
+                                  environment.datasources,
+                                  offline_creator)
+    else:
+        environment = setup_entities(environment)
+        environment = setup_datasets(environment)
+        environment = setup_data_sources(environment)
+
     environment = setup_feature_views(environment)
 
     fvs = []
@@ -337,7 +356,8 @@ def construct_universal_test_environment(
         yield environment
     finally:
-        offline_creator.teardown()
+        if not data_source_cache:
+            offline_creator.teardown()
         fs.teardown()
@@ -359,8 +379,8 @@ def parametrize_e2e_test(e2e_test):
 
     @pytest.mark.integration
     @pytest.mark.parametrize("config", FULL_REPO_CONFIGS, ids=lambda v: str(v))
-    def inner_test(config):
-        with construct_universal_test_environment(config) as environment:
+    def inner_test(config, data_source_cache):
+        with construct_universal_test_environment(config, data_source_cache=data_source_cache) as environment:
             e2e_test(environment)
 
     return inner_test
@@ -385,9 +405,10 @@ def parametrize_offline_retrieval_test(offline_retrieval_test):
 
     @pytest.mark.integration
     @pytest.mark.parametrize("config", configs, ids=lambda v: str(v))
-    def inner_test(config):
+    def inner_test(config, data_source_cache):
         with construct_universal_test_environment(
-            config, create_and_apply=True
+            config, create_and_apply=True,
+            data_source_cache=data_source_cache
         ) as environment:
             offline_retrieval_test(environment)
 
@@ -411,10 +432,10 @@ def parametrize_online_test(online_test):
 
     @pytest.mark.integration
     @pytest.mark.parametrize("config", configs, ids=lambda v: str(v))
-    def inner_test(config):
+    def inner_test(config, data_source_cache):
         with construct_universal_test_environment(
-            config, create_and_apply=True, materialize=True
+            config, create_and_apply=True, materialize=True,
+            data_source_cache=data_source_cache
         ) as environment:
             online_test(environment)
-    return inner_test
diff --git a/sdk/python/tests/integration/materialization/test_offline_online_store_consistency.py b/sdk/python/tests/integration/materialization/test_offline_online_store_consistency.py
index e97f7d0d84..ecb4c82bd3 100644
--- a/sdk/python/tests/integration/materialization/test_offline_online_store_consistency.py
+++ b/sdk/python/tests/integration/materialization/test_offline_online_store_consistency.py
@@ -382,6 +382,9 @@ def run_offline_online_store_consistency_test(
     )
 
 
+@pytest.mark.skip(
+    reason="This is the most expensive test and is subsumed by the universal tests."
+)
 @pytest.mark.integration
 @pytest.mark.parametrize(
     "bq_source_type", ["query", "table"],
diff --git a/sdk/python/tests/integration/offline_store/test_historical_retrieval.py b/sdk/python/tests/integration/offline_store/test_historical_retrieval.py
index 5d735fcd9d..26b6b335dc 100644
--- a/sdk/python/tests/integration/offline_store/test_historical_retrieval.py
+++ b/sdk/python/tests/integration/offline_store/test_historical_retrieval.py
@@ -394,6 +394,9 @@ def test_historical_features_from_parquet_sources(
     store.teardown()
 
 
+@pytest.mark.skip(
+    reason="This is the most expensive test and is subsumed by the universal tests."
+)
 @pytest.mark.integration
 @pytest.mark.parametrize(
     "provider_type", ["local", "gcp", "gcp_custom_offline_config"],
@@ -655,6 +658,9 @@ def test_historical_features_from_bigquery_sources(
     store.teardown()
 
 
+@pytest.mark.skip(
+    reason="This is the most expensive test and is subsumed by the universal tests."
+)
 @pytest.mark.integration
 @pytest.mark.parametrize(
     "provider_type", ["local", "aws"],
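The data_source_cache fixture introduced in the patch above is a session-scoped pytest fixture: one instance is created for the whole run, every test shares it, and the code after the yield executes once, after the last test. A minimal standalone sketch of the same shape — Cache and shared_cache are illustrative, and the real fixture also sets autouse=True so it runs without being requested explicitly:

import pytest

class Cache:
    def __init__(self):
        self.store = {}

@pytest.fixture(scope="session")
def shared_cache():
    cache = Cache()
    yield cache                         # every test in the session receives this object
    for key, resource in cache.store.items():
        print(f"tearing down {key}")    # teardown runs once, after the whole session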
From dbbd6fb708a4dd9a3cc8d6dccf4fd64631ca5f13 Mon Sep 17 00:00:00 2001
From: Achal Shah
Date: Mon, 23 Aug 2021 09:51:16 -0700
Subject: [PATCH 16/51] make format

Signed-off-by: Achal Shah
---
 sdk/python/tests/conftest.py                  |  9 +++--
 .../feature_repos/test_repo_configuration.py  | 33 ++++++++++++-------
 2 files changed, 27 insertions(+), 15 deletions(-)

diff --git a/sdk/python/tests/conftest.py b/sdk/python/tests/conftest.py
index ac5598006b..b8ec789019 100644
--- a/sdk/python/tests/conftest.py
+++ b/sdk/python/tests/conftest.py
@@ -95,12 +95,15 @@ class DataSourceCache:
     def get(self, test_repo_config):
         return self.cache.get(test_repo_config.offline_store_creator, None)
 
-    def put(self, test_repo_config,
+    def put(
+        self, test_repo_config, entites, datasets, data_sources, data_source_creator
+    ):
+        self.cache[test_repo_config.offline_store_creator] = (
             entites,
             datasets,
             data_sources,
-            data_source_creator):
-        self.cache[test_repo_config.offline_store_creator] = (entites, datasets, data_sources, data_source_creator)
+            data_source_creator,
+        )
diff --git a/sdk/python/tests/integration/feature_repos/test_repo_configuration.py b/sdk/python/tests/integration/feature_repos/test_repo_configuration.py
index 6cc4a88815..b91c0723a7 100644
--- a/sdk/python/tests/integration/feature_repos/test_repo_configuration.py
+++ b/sdk/python/tests/integration/feature_repos/test_repo_configuration.py
@@ -4,7 +4,7 @@
 from dataclasses import dataclass, field, replace
 from datetime import datetime, timedelta
 from pathlib import Path
-from typing import Any, Callable, Dict, List, Optional, Union, Tuple
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
 
 import pandas as pd
 import pytest
@@ -254,7 +254,7 @@ def construct_universal_test_environment(
     test_repo_config: TestRepoConfig,
     create_and_apply: bool = False,
     materialize: bool = False,
-    data_source_cache = None
+    data_source_cache=None,
 ) -> Environment:
@@ -310,15 +310,20 @@ def construct_universal_test_environment(
         if fixtures:
             environment = setup_entities(environment, entities_override=fixtures[0])
             environment = setup_datasets(environment, datasets_override=fixtures[1])
-            environment = setup_data_sources(environment, data_sources_override=fixtures[2])
+            environment = setup_data_sources(
+                environment, data_sources_override=fixtures[2]
+            )
         else:
             environment = setup_entities(environment)
             environment = setup_datasets(environment)
             environment = setup_data_sources(environment)
-            data_source_cache.put(test_repo_config, environment.entities,
-                                  environment.datasets,
-                                  environment.datasources,
-                                  offline_creator)
+            data_source_cache.put(
+                test_repo_config,
+                environment.entities,
+                environment.datasets,
+                environment.datasources,
+                offline_creator,
+            )
     else:
         environment = setup_entities(environment)
         environment = setup_datasets(environment)
@@ -360,7 +365,9 @@ def parametrize_e2e_test(e2e_test):
     @pytest.mark.integration
     @pytest.mark.parametrize("config", FULL_REPO_CONFIGS, ids=lambda v: str(v))
     def inner_test(config, data_source_cache):
-        with construct_universal_test_environment(config, data_source_cache=data_source_cache) as environment:
+        with construct_universal_test_environment(
+            config, data_source_cache=data_source_cache
+        ) as environment:
             e2e_test(environment)
 
     return inner_test
@@ -386,8 +393,7 @@ def parametrize_offline_retrieval_test(offline_retrieval_test):
     @pytest.mark.parametrize("config", configs, ids=lambda v: str(v))
     def inner_test(config, data_source_cache):
         with construct_universal_test_environment(
-            config, create_and_apply=True,
-            data_source_cache=data_source_cache
+            config, create_and_apply=True, data_source_cache=data_source_cache
         ) as environment:
             offline_retrieval_test(environment)
 
@@ -412,8 +418,11 @@ def parametrize_online_test(online_test):
     @pytest.mark.parametrize("config", configs, ids=lambda v: str(v))
     def inner_test(config, data_source_cache):
         with construct_universal_test_environment(
-            config, create_and_apply=True, materialize=True,
-            data_source_cache=data_source_cache
+            config,
+            create_and_apply=True,
+            materialize=True,
+            data_source_cache=data_source_cache,
         ) as environment:
             online_test(environment)
+
+    return inner_test

From 473b630297f9bc8715d306d0a19bef7cbd4b91a7 Mon Sep 17 00:00:00 2001
From: Achal Shah
Date: Mon, 23 Aug 2021 09:56:18 -0700
Subject: [PATCH 17/51] make format

Signed-off-by: Achal Shah
---
 sdk/python/tests/conftest.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/sdk/python/tests/conftest.py b/sdk/python/tests/conftest.py
index b8ec789019..7147e26dee 100644
--- a/sdk/python/tests/conftest.py
+++ b/sdk/python/tests/conftest.py
@@ -14,6 +14,7 @@
 import multiprocessing
 from datetime import datetime, timedelta
 from sys import platform
+from typing import Any, Dict
 
 import pandas as pd
 import pytest
@@ -90,7 +91,7 @@ def simple_dataset_2() -> pd.DataFrame:
 
 
 class DataSourceCache:
-    cache = {}
+    cache: Dict[str, Any] = {}
 
     def get(self, test_repo_config):
         return self.cache.get(test_repo_config.offline_store_creator, None)
From 95982e231cbf17b3a4475f8c4a7a16b15c48402d Mon Sep 17 00:00:00 2001
From: Achal Shah
Date: Mon, 23 Aug 2021 09:57:22 -0700
Subject: [PATCH 18/51] remove import

Signed-off-by: Achal Shah
---
 .../tests/integration/feature_repos/test_repo_configuration.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sdk/python/tests/integration/feature_repos/test_repo_configuration.py b/sdk/python/tests/integration/feature_repos/test_repo_configuration.py
index b91c0723a7..f59160a967 100644
--- a/sdk/python/tests/integration/feature_repos/test_repo_configuration.py
+++ b/sdk/python/tests/integration/feature_repos/test_repo_configuration.py
@@ -4,7 +4,7 @@
 from dataclasses import dataclass, field, replace
 from datetime import datetime, timedelta
 from pathlib import Path
-from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+from typing import Any, Callable, Dict, List, Optional, Union
 
 import pandas as pd
 import pytest

From df9596b51450d7eb38d0f6b5845664ed3aa03708 Mon Sep 17 00:00:00 2001
From: Achal Shah
Date: Wed, 25 Aug 2021 13:23:20 -0700
Subject: [PATCH 19/51] fix merge

Signed-off-by: Achal Shah
---
 .../integration/offline_store/test_s3_custom_endpoint.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sdk/python/tests/integration/offline_store/test_s3_custom_endpoint.py b/sdk/python/tests/integration/offline_store/test_s3_custom_endpoint.py
index 207ebd9732..e5bff828b3 100644
--- a/sdk/python/tests/integration/offline_store/test_s3_custom_endpoint.py
+++ b/sdk/python/tests/integration/offline_store/test_s3_custom_endpoint.py
@@ -2,7 +2,7 @@
 
 from tests.integration.feature_repos.test_repo_configuration import (
     TestRepoConfig,
-    construct_test_environment,
+    construct_universal_test_environment,
 )
 
 # TODO: Allow integration tests to run using different credentials.
@@ -27,7 +27,7 @@ def test_registration_and_retrieval_from_custom_s3_endpoint():
     os.environ["AWS_ACCESS_KEY_ID"] = "AKIAIOSFODNN7EXAMPLE"
     os.environ["AWS_SECRET_ACCESS_KEY"] = "wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY"
 
-    with construct_test_environment(
+    with construct_universal_test_environment(
         config, create_and_apply=True, materialize=True
     ) as environment:
         fs = environment.feature_store
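construct_universal_test_environment, which the test above enters via with, is a @contextmanager generator: everything before the yield is setup, and the finally block is teardown that runs even when the test body raises. A runnable skeleton of that shape — environment here is a toy stand-in, not the Feast helper:

from contextlib import contextmanager

@contextmanager
def environment(name: str):
    resources = [f"{name}-table"]    # setup: create whatever the test needs
    try:
        yield resources              # the test body runs while suspended here
    finally:
        resources.clear()            # teardown runs even if the test fails

with environment("test_correctness") as res:
    assert res == ["test_correctness-table"]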
data_source_creator=offline_creator, ) - print(f"Data Source Cache: {data_source_cache}") if data_source_cache: fixtures = data_source_cache.get(test_repo_config) - print(f"Fixtures: {fixtures}") if fixtures: environment = setup_entities(environment, entities_override=fixtures[0]) environment = setup_datasets(environment, datasets_override=fixtures[1]) @@ -334,12 +335,13 @@ def construct_universal_test_environment( fvs = [] entities = [] try: - if create_and_apply: + if stop_at_step <= EnvironmentSetupSteps.INIT: + pass + if stop_at_step >= EnvironmentSetupSteps.CREATE_AND_APPLY_OBJECTS: entities.extend([driver(), customer()]) fvs.extend(environment.feature_views.values()) fs.apply(fvs + entities) - - if materialize: + if stop_at_step >= EnvironmentSetupSteps.MATERIALIZE: fs.materialize(environment.start_date, environment.end_date) yield environment @@ -393,7 +395,9 @@ def parametrize_offline_retrieval_test(offline_retrieval_test): @pytest.mark.parametrize("config", configs, ids=lambda v: str(v)) def inner_test(config, data_source_cache): with construct_universal_test_environment( - config, create_and_apply=True, data_source_cache=data_source_cache + config, + stop_at_step=EnvironmentSetupSteps.CREATE_AND_APPLY_OBJECTS, + data_source_cache=data_source_cache, ) as environment: offline_retrieval_test(environment) @@ -419,8 +423,7 @@ def parametrize_online_test(online_test): def inner_test(config, data_source_cache): with construct_universal_test_environment( config, - create_and_apply=True, - materialize=True, + stop_at_step=EnvironmentSetupSteps.MATERIALIZE, data_source_cache=data_source_cache, ) as environment: online_test(environment) From 0daaff0ef03a3ceb09218352dc22c31aff75074b Mon Sep 17 00:00:00 2001 From: Achal Shah Date: Wed, 25 Aug 2021 21:56:47 -0700 Subject: [PATCH 21/51] Fix refs Signed-off-by: Achal Shah --- .../feature_repos/test_repo_configuration.py | 74 ++++++++++--------- 1 file changed, 40 insertions(+), 34 deletions(-) diff --git a/sdk/python/tests/integration/feature_repos/test_repo_configuration.py b/sdk/python/tests/integration/feature_repos/test_repo_configuration.py index 62b3e67750..bf9b409fe0 100644 --- a/sdk/python/tests/integration/feature_repos/test_repo_configuration.py +++ b/sdk/python/tests/integration/feature_repos/test_repo_configuration.py @@ -3,7 +3,7 @@ from contextlib import contextmanager from dataclasses import dataclass, field, replace from datetime import datetime, timedelta -from enum import Enum +from enum import IntEnum from pathlib import Path from typing import Any, Dict, List, Optional, Union @@ -100,19 +100,19 @@ def construct_universal_datasets( def construct_universal_data_sources( datasets: Dict[str, pd.DataFrame], data_source_creator: DataSourceCreator ) -> Dict[str, DataSource]: - customer_ds = data_source_creator.create_data_sources( + customer_ds = data_source_creator.create_data_source( datasets["customer"], suffix="customer_profile", event_timestamp_column="event_timestamp", created_timestamp_column="created", ) - driver_ds = data_source_creator.create_data_sources( + driver_ds = data_source_creator.create_data_source( datasets["driver"], suffix="driver_hourly", event_timestamp_column="event_timestamp", created_timestamp_column="created", ) - orders_ds = data_source_creator.create_data_sources( + orders_ds = data_source_creator.create_data_source( datasets["orders"], suffix="orders", event_timestamp_column="event_timestamp", @@ -246,10 +246,11 @@ def vary_providers_for_offline_stores( return new_configs -class 
EnvironmentSetupSteps(Enum): +class EnvironmentSetupSteps(IntEnum): INIT = 0 - CREATE_AND_APPLY_OBJECTS = 1 - MATERIALIZE = 2 + CREATE_OBJECTS = 1 + APPLY_OBJECTS = 2 + MATERIALIZE = 3 DEFAULT_STEP = EnvironmentSetupSteps.INIT @@ -306,38 +307,43 @@ def construct_universal_test_environment( data_source=ds, data_source_creator=offline_creator, ) - if data_source_cache: - fixtures = data_source_cache.get(test_repo_config) - if fixtures: - environment = setup_entities(environment, entities_override=fixtures[0]) - environment = setup_datasets(environment, datasets_override=fixtures[1]) - environment = setup_data_sources( - environment, data_sources_override=fixtures[2] - ) - else: - environment = setup_entities(environment) - environment = setup_datasets(environment) - environment = setup_data_sources(environment) - data_source_cache.put( - test_repo_config, - environment.entities, - environment.datasets, - environment.datasources, - offline_creator, - ) - else: - environment = setup_entities(environment) - environment = setup_datasets(environment) - environment = setup_data_sources(environment) - - environment = setup_feature_views(environment) fvs = [] entities = [] try: if stop_at_step <= EnvironmentSetupSteps.INIT: pass - if stop_at_step >= EnvironmentSetupSteps.CREATE_AND_APPLY_OBJECTS: + if stop_at_step >= EnvironmentSetupSteps.CREATE_OBJECTS: + if data_source_cache: + fixtures = data_source_cache.get(test_repo_config) + if fixtures: + environment = setup_entities( + environment, entities_override=fixtures[0] + ) + environment = setup_datasets( + environment, datasets_override=fixtures[1] + ) + environment = setup_data_sources( + environment, data_sources_override=fixtures[2] + ) + else: + environment = setup_entities(environment) + environment = setup_datasets(environment) + environment = setup_data_sources(environment) + data_source_cache.put( + test_repo_config, + environment.entities, + environment.datasets, + environment.datasources, + offline_creator, + ) + else: + environment = setup_entities(environment) + environment = setup_datasets(environment) + environment = setup_data_sources(environment) + + environment = setup_feature_views(environment) + if stop_at_step >= EnvironmentSetupSteps.APPLY_OBJECTS: entities.extend([driver(), customer()]) fvs.extend(environment.feature_views.values()) fs.apply(fvs + entities) @@ -396,7 +402,7 @@ def parametrize_offline_retrieval_test(offline_retrieval_test): def inner_test(config, data_source_cache): with construct_universal_test_environment( config, - stop_at_step=EnvironmentSetupSteps.CREATE_AND_APPLY_OBJECTS, + stop_at_step=EnvironmentSetupSteps.CREATE_OBJECTS, data_source_cache=data_source_cache, ) as environment: offline_retrieval_test(environment) From ef6a6b3bf2b69b759b365af4a9446c3ee20d2e74 Mon Sep 17 00:00:00 2001 From: Achal Shah Date: Wed, 25 Aug 2021 22:41:06 -0700 Subject: [PATCH 22/51] fix step Signed-off-by: Achal Shah --- .../integration/feature_repos/test_repo_configuration.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/sdk/python/tests/integration/feature_repos/test_repo_configuration.py b/sdk/python/tests/integration/feature_repos/test_repo_configuration.py index bf9b409fe0..85cdb57422 100644 --- a/sdk/python/tests/integration/feature_repos/test_repo_configuration.py +++ b/sdk/python/tests/integration/feature_repos/test_repo_configuration.py @@ -258,7 +258,9 @@ class EnvironmentSetupSteps(IntEnum): @contextmanager def construct_universal_test_environment( - test_repo_config: TestRepoConfig, 
stop_at_step=DEFAULT_STEP, data_source_cache=None, + test_repo_config: TestRepoConfig, + stop_at_step=DEFAULT_STEP, + data_source_cache=None, ) -> Environment: """ This method should take in the parameters from the test repo config and created a feature repo, apply it, @@ -402,7 +404,7 @@ def parametrize_offline_retrieval_test(offline_retrieval_test): def inner_test(config, data_source_cache): with construct_universal_test_environment( config, - stop_at_step=EnvironmentSetupSteps.CREATE_OBJECTS, + stop_at_step=EnvironmentSetupSteps.APPLY_OBJECTS, data_source_cache=data_source_cache, ) as environment: offline_retrieval_test(environment) From d63691d2b902851f0a4d7ff59800100e36502b8f Mon Sep 17 00:00:00 2001 From: Achal Shah Date: Thu, 26 Aug 2021 13:24:08 -0700 Subject: [PATCH 23/51] WIP fixes Signed-off-by: Achal Shah --- sdk/python/feast/feature_view.py | 10 +++++++- .../feast/infra/offline_stores/bigquery.py | 1 + sdk/python/feast/infra/provider.py | 5 +++- sdk/python/feast/registry.py | 1 + sdk/python/feast/type_map.py | 4 ++++ .../integration/e2e/test_universal_e2e.py | 5 +++- .../feature_repos/test_repo_configuration.py | 23 +++++++++++++------ .../feature_repos/universal/feature_views.py | 15 +++++++----- 8 files changed, 48 insertions(+), 16 deletions(-) diff --git a/sdk/python/feast/feature_view.py b/sdk/python/feast/feature_view.py index a833c7a5e5..cbb5a752d3 100644 --- a/sdk/python/feast/feature_view.py +++ b/sdk/python/feast/feature_view.py @@ -325,6 +325,14 @@ def infer_features_from_batch_source(self, config: RepoConfig): self.batch_source.created_timestamp_column, } | set(self.entities) + if self.batch_source.event_timestamp_column in self.batch_source.field_mapping: + columns_to_exclude.add(self.batch_source.field_mapping[self.batch_source.event_timestamp_column]) + if self.batch_source.created_timestamp_column in self.batch_source.field_mapping: + columns_to_exclude.add(self.batch_source.field_mapping[self.batch_source.created_timestamp_column]) + for e in self.entities: + if e in self.batch_source.field_mapping: + columns_to_exclude.add(self.batch_source.field_mapping[e]) + for ( col_name, col_datatype, @@ -335,7 +343,7 @@ def infer_features_from_batch_source(self, config: RepoConfig): ): feature_name = ( self.batch_source.field_mapping[col_name] - if col_name in self.batch_source.field_mapping.keys() + if col_name in self.batch_source.field_mapping else col_name ) self.features.append( diff --git a/sdk/python/feast/infra/offline_stores/bigquery.py b/sdk/python/feast/infra/offline_stores/bigquery.py index 3463e96898..2bfd863991 100644 --- a/sdk/python/feast/infra/offline_stores/bigquery.py +++ b/sdk/python/feast/infra/offline_stores/bigquery.py @@ -86,6 +86,7 @@ def pull_latest_from_table_or_query( ) WHERE _feast_row = 1 """ + return BigQueryRetrievalJob(query=query, client=client, config=config) @staticmethod diff --git a/sdk/python/feast/infra/provider.py b/sdk/python/feast/infra/provider.py index 5d8a3eeed8..8f450662c9 100644 --- a/sdk/python/feast/infra/provider.py +++ b/sdk/python/feast/infra/provider.py @@ -238,9 +238,12 @@ def _get_column_names( reverse_field_mapping[col] if col in reverse_field_mapping.keys() else col for col in feature_names ] + feature_names = set(feature_names) - set(join_keys) + feature_names = feature_names - {event_timestamp_column} + feature_names = feature_names - {created_timestamp_column} return ( join_keys, - feature_names, + list(feature_names), event_timestamp_column, created_timestamp_column, ) diff --git 
a/sdk/python/feast/registry.py b/sdk/python/feast/registry.py index af04de5f3f..0839453b75 100644 --- a/sdk/python/feast/registry.py +++ b/sdk/python/feast/registry.py @@ -593,6 +593,7 @@ def __init__(self, repo_path: Path, registry_path_string: str): def get_registry_proto(self): registry_proto = RegistryProto() + import pdb; pdb.set_trace() if self._filepath.exists(): registry_proto.ParseFromString(self._filepath.read_bytes()) return registry_proto diff --git a/sdk/python/feast/type_map.py b/sdk/python/feast/type_map.py index af09069407..7aee3e5403 100644 --- a/sdk/python/feast/type_map.py +++ b/sdk/python/feast/type_map.py @@ -13,6 +13,7 @@ # limitations under the License. import re +from datetime import datetime from typing import Any, Dict, Union import numpy as np @@ -104,6 +105,7 @@ def python_type_to_feast_value_type( "int8": ValueType.INT32, "bool": ValueType.BOOL, "timedelta": ValueType.UNIX_TIMESTAMP, + "datetime": ValueType.UNIX_TIMESTAMP, "datetime64[ns]": ValueType.UNIX_TIMESTAMP, "datetime64[ns, tz]": ValueType.UNIX_TIMESTAMP, "category": ValueType.STRING, @@ -281,6 +283,8 @@ def _python_value_to_proto_value(feast_value_type, value) -> ProtoValue: elif feast_value_type == ValueType.INT64: return ProtoValue(int64_val=int(value)) elif feast_value_type == ValueType.UNIX_TIMESTAMP: + if isinstance(value, datetime): + return ProtoValue(int64_val=int(value.timestamp())) return ProtoValue(int64_val=int(value)) elif feast_value_type == ValueType.FLOAT: return ProtoValue(float_val=float(value)) diff --git a/sdk/python/tests/integration/e2e/test_universal_e2e.py b/sdk/python/tests/integration/e2e/test_universal_e2e.py index 5c89d9b966..560972294a 100644 --- a/sdk/python/tests/integration/e2e/test_universal_e2e.py +++ b/sdk/python/tests/integration/e2e/test_universal_e2e.py @@ -16,10 +16,13 @@ @parametrize_e2e_test def test_e2e_consistency(test_environment: Environment): + infer_features = test_environment.test_repo_config.infer_features fs, fv = ( test_environment.feature_store, - driver_feature_view(test_environment.data_source), + driver_feature_view(data_source=test_environment.data_source, + infer_features=infer_features), ) + import pdb; pdb.set_trace() entity = driver() fs.apply([fv, entity]) diff --git a/sdk/python/tests/integration/feature_repos/test_repo_configuration.py b/sdk/python/tests/integration/feature_repos/test_repo_configuration.py index 85cdb57422..5a93daca9d 100644 --- a/sdk/python/tests/integration/feature_repos/test_repo_configuration.py +++ b/sdk/python/tests/integration/feature_repos/test_repo_configuration.py @@ -36,6 +36,7 @@ class TestRepoConfig: full_feature_names: bool = True infer_event_timestamp_col: bool = True + infer_features: bool = False def ds_creator_path(cls: str): @@ -230,6 +231,14 @@ def vary_infer_event_timestamp_col( return new_configs +def vary_infer_feature(configs: List[TestRepoConfig]) -> List[TestRepoConfig]: + new_configs = [] + for c in configs: + true_c = replace(c, infer_features=True) + false_c = replace(c, infer_features=False) + new_configs.extend([true_c, false_c]) + return new_configs + def vary_providers_for_offline_stores( configs: List[TestRepoConfig], ) -> List[TestRepoConfig]: @@ -258,9 +267,8 @@ class EnvironmentSetupSteps(IntEnum): @contextmanager def construct_universal_test_environment( - test_repo_config: TestRepoConfig, - stop_at_step=DEFAULT_STEP, - data_source_cache=None, + test_repo_config: TestRepoConfig, stop_at_step=DEFAULT_STEP, data_source_cache=None, + **kwargs ) -> Environment: """ This method should take 
in the parameters from the test repo config and created a feature repo, apply it, @@ -270,6 +278,8 @@ def construct_universal_test_environment( The user is *not* expected to perform any clean up actions. :param test_repo_config: configuration + :param stop_at_step: The step which should be the last one executed when setting up the test environment. + :param data_source_cache: :return: A feature store built using the supplied configuration. """ @@ -313,8 +323,6 @@ def construct_universal_test_environment( fvs = [] entities = [] try: - if stop_at_step <= EnvironmentSetupSteps.INIT: - pass if stop_at_step >= EnvironmentSetupSteps.CREATE_OBJECTS: if data_source_cache: fixtures = data_source_cache.get(test_repo_config) @@ -354,7 +362,8 @@ def construct_universal_test_environment( yield environment finally: - if not data_source_cache: + import pdb; pdb.set_trace() + if not data_source_cache and stop_at_step >= EnvironmentSetupSteps.CREATE_OBJECTS: offline_creator.teardown() fs.teardown() @@ -373,7 +382,7 @@ def parametrize_e2e_test(e2e_test): """ @pytest.mark.integration - @pytest.mark.parametrize("config", FULL_REPO_CONFIGS, ids=lambda v: str(v)) + @pytest.mark.parametrize("config", vary_infer_feature(FULL_REPO_CONFIGS), ids=lambda v: str(v)) def inner_test(config, data_source_cache): with construct_universal_test_environment( config, data_source_cache=data_source_cache diff --git a/sdk/python/tests/integration/feature_repos/universal/feature_views.py b/sdk/python/tests/integration/feature_repos/universal/feature_views.py index 0306044ecd..176a03f0dc 100644 --- a/sdk/python/tests/integration/feature_repos/universal/feature_views.py +++ b/sdk/python/tests/integration/feature_repos/universal/feature_views.py @@ -5,22 +5,24 @@ def driver_feature_view( - data_source: DataSource, name="test_correctness" + data_source: DataSource, name="test_correctness", + infer_features: bool = False, ) -> FeatureView: return FeatureView( name=name, entities=["driver"], - features=[Feature("value", ValueType.FLOAT)], + features=None if infer_features else [Feature("value", ValueType.FLOAT)], ttl=timedelta(days=5), input=data_source, ) -def create_driver_hourly_stats_feature_view(source): +def create_driver_hourly_stats_feature_view(source, + infer_features: bool = False): driver_stats_feature_view = FeatureView( name="driver_stats", entities=["driver"], - features=[ + features=None if infer_features else [ Feature(name="conv_rate", dtype=ValueType.FLOAT), Feature(name="acc_rate", dtype=ValueType.FLOAT), Feature(name="avg_daily_trips", dtype=ValueType.INT32), @@ -31,11 +33,12 @@ def create_driver_hourly_stats_feature_view(source): return driver_stats_feature_view -def create_customer_daily_profile_feature_view(source): +def create_customer_daily_profile_feature_view(source, + infer_features: bool = False): customer_profile_feature_view = FeatureView( name="customer_profile", entities=["customer_id"], - features=[ + features=None if infer_features else [ Feature(name="current_balance", dtype=ValueType.FLOAT), Feature(name="avg_passenger_count", dtype=ValueType.FLOAT), Feature(name="lifetime_trip_count", dtype=ValueType.INT32), From 3d7597752b1fd1b1280a2ab28f209ef99795e28c Mon Sep 17 00:00:00 2001 From: Achal Shah Date: Thu, 26 Aug 2021 14:06:09 -0700 Subject: [PATCH 24/51] Fix for feature inferencing Signed-off-by: Achal Shah --- sdk/python/feast/feature_view.py | 22 +++++++++++++++---- sdk/python/feast/infra/provider.py | 8 +++---- sdk/python/feast/registry.py | 1 - sdk/python/feast/type_map.py | 4 ++++ 
.../integration/e2e/test_universal_e2e.py | 6 ++--- .../feature_repos/test_repo_configuration.py | 17 +++++++++----- .../universal/data_sources/file.py | 2 +- .../feature_repos/universal/feature_views.py | 17 +++++++------- 8 files changed, 51 insertions(+), 26 deletions(-) diff --git a/sdk/python/feast/feature_view.py b/sdk/python/feast/feature_view.py index cbb5a752d3..801d578e7d 100644 --- a/sdk/python/feast/feature_view.py +++ b/sdk/python/feast/feature_view.py @@ -325,10 +325,24 @@ def infer_features_from_batch_source(self, config: RepoConfig): self.batch_source.created_timestamp_column, } | set(self.entities) - if self.batch_source.event_timestamp_column in self.batch_source.field_mapping: - columns_to_exclude.add(self.batch_source.field_mapping[self.batch_source.event_timestamp_column]) - if self.batch_source.created_timestamp_column in self.batch_source.field_mapping: - columns_to_exclude.add(self.batch_source.field_mapping[self.batch_source.created_timestamp_column]) + if ( + self.batch_source.event_timestamp_column + in self.batch_source.field_mapping + ): + columns_to_exclude.add( + self.batch_source.field_mapping[ + self.batch_source.event_timestamp_column + ] + ) + if ( + self.batch_source.created_timestamp_column + in self.batch_source.field_mapping + ): + columns_to_exclude.add( + self.batch_source.field_mapping[ + self.batch_source.created_timestamp_column + ] + ) for e in self.entities: if e in self.batch_source.field_mapping: columns_to_exclude.add(self.batch_source.field_mapping[e]) diff --git a/sdk/python/feast/infra/provider.py b/sdk/python/feast/infra/provider.py index 8f450662c9..e2c2bb2975 100644 --- a/sdk/python/feast/infra/provider.py +++ b/sdk/python/feast/infra/provider.py @@ -238,12 +238,12 @@ def _get_column_names( reverse_field_mapping[col] if col in reverse_field_mapping.keys() else col for col in feature_names ] - feature_names = set(feature_names) - set(join_keys) - feature_names = feature_names - {event_timestamp_column} - feature_names = feature_names - {created_timestamp_column} + _feature_names = set(feature_names) - set(join_keys) + _feature_names = _feature_names - {event_timestamp_column} + _feature_names = _feature_names - {created_timestamp_column} return ( join_keys, - list(feature_names), + list(_feature_names), event_timestamp_column, created_timestamp_column, ) diff --git a/sdk/python/feast/registry.py b/sdk/python/feast/registry.py index 0839453b75..af04de5f3f 100644 --- a/sdk/python/feast/registry.py +++ b/sdk/python/feast/registry.py @@ -593,7 +593,6 @@ def __init__(self, repo_path: Path, registry_path_string: str): def get_registry_proto(self): registry_proto = RegistryProto() - import pdb; pdb.set_trace() if self._filepath.exists(): registry_proto.ParseFromString(self._filepath.read_bytes()) return registry_proto diff --git a/sdk/python/feast/type_map.py b/sdk/python/feast/type_map.py index 7aee3e5403..4cc5345262 100644 --- a/sdk/python/feast/type_map.py +++ b/sdk/python/feast/type_map.py @@ -19,6 +19,7 @@ import numpy as np import pandas as pd from google.protobuf.json_format import MessageToDict +from google.protobuf.timestamp_pb2 import Timestamp from feast.protos.feast.types.Value_pb2 import ( BoolList, @@ -105,6 +106,7 @@ def python_type_to_feast_value_type( "int8": ValueType.INT32, "bool": ValueType.BOOL, "timedelta": ValueType.UNIX_TIMESTAMP, + "Timestamp": ValueType.UNIX_TIMESTAMP, "datetime": ValueType.UNIX_TIMESTAMP, "datetime64[ns]": ValueType.UNIX_TIMESTAMP, "datetime64[ns, tz]": ValueType.UNIX_TIMESTAMP, @@ -285,6 +287,8 @@ 
def _python_value_to_proto_value(feast_value_type, value) -> ProtoValue: elif feast_value_type == ValueType.UNIX_TIMESTAMP: if isinstance(value, datetime): return ProtoValue(int64_val=int(value.timestamp())) + elif isinstance(value, Timestamp): + return ProtoValue(int64_val=int(value.ToSeconds())) return ProtoValue(int64_val=int(value)) elif feast_value_type == ValueType.FLOAT: return ProtoValue(float_val=float(value)) diff --git a/sdk/python/tests/integration/e2e/test_universal_e2e.py b/sdk/python/tests/integration/e2e/test_universal_e2e.py index 560972294a..0d2a62322a 100644 --- a/sdk/python/tests/integration/e2e/test_universal_e2e.py +++ b/sdk/python/tests/integration/e2e/test_universal_e2e.py @@ -19,10 +19,10 @@ def test_e2e_consistency(test_environment: Environment): infer_features = test_environment.test_repo_config.infer_features fs, fv = ( test_environment.feature_store, - driver_feature_view(data_source=test_environment.data_source, - infer_features=infer_features), + driver_feature_view( + data_source=test_environment.data_source, infer_features=infer_features + ), ) - import pdb; pdb.set_trace() entity = driver() fs.apply([fv, entity]) diff --git a/sdk/python/tests/integration/feature_repos/test_repo_configuration.py b/sdk/python/tests/integration/feature_repos/test_repo_configuration.py index 5a93daca9d..92e90cb122 100644 --- a/sdk/python/tests/integration/feature_repos/test_repo_configuration.py +++ b/sdk/python/tests/integration/feature_repos/test_repo_configuration.py @@ -239,6 +239,7 @@ def vary_infer_feature(configs: List[TestRepoConfig]) -> List[TestRepoConfig]: new_configs.extend([true_c, false_c]) return new_configs + def vary_providers_for_offline_stores( configs: List[TestRepoConfig], ) -> List[TestRepoConfig]: @@ -267,8 +268,10 @@ class EnvironmentSetupSteps(IntEnum): @contextmanager def construct_universal_test_environment( - test_repo_config: TestRepoConfig, stop_at_step=DEFAULT_STEP, data_source_cache=None, - **kwargs + test_repo_config: TestRepoConfig, + stop_at_step=DEFAULT_STEP, + data_source_cache=None, + **kwargs, ) -> Environment: """ This method should take in the parameters from the test repo config and created a feature repo, apply it, @@ -362,8 +365,10 @@ def construct_universal_test_environment( yield environment finally: - import pdb; pdb.set_trace() - if not data_source_cache and stop_at_step >= EnvironmentSetupSteps.CREATE_OBJECTS: + if ( + not data_source_cache + and stop_at_step >= EnvironmentSetupSteps.CREATE_OBJECTS + ): offline_creator.teardown() fs.teardown() @@ -382,7 +387,9 @@ def parametrize_e2e_test(e2e_test): """ @pytest.mark.integration - @pytest.mark.parametrize("config", vary_infer_feature(FULL_REPO_CONFIGS), ids=lambda v: str(v)) + @pytest.mark.parametrize( + "config", vary_infer_feature(FULL_REPO_CONFIGS), ids=lambda v: str(v) + ) def inner_test(config, data_source_cache): with construct_universal_test_environment( config, data_source_cache=data_source_cache diff --git a/sdk/python/tests/integration/feature_repos/universal/data_sources/file.py b/sdk/python/tests/integration/feature_repos/universal/data_sources/file.py index 046f40afa0..cd86740e6a 100644 --- a/sdk/python/tests/integration/feature_repos/universal/data_sources/file.py +++ b/sdk/python/tests/integration/feature_repos/universal/data_sources/file.py @@ -44,7 +44,7 @@ def create_data_source( self.files.append(f) return FileSource( file_format=ParquetFormat(), - path=f"file://{f.name}", + path=f"{f.name}", event_timestamp_column=event_timestamp_column, 
created_timestamp_column=created_timestamp_column,
             date_partition_column="",
diff --git a/sdk/python/tests/integration/feature_repos/universal/feature_views.py b/sdk/python/tests/integration/feature_repos/universal/feature_views.py
index 176a03f0dc..b57e2988ab 100644
--- a/sdk/python/tests/integration/feature_repos/universal/feature_views.py
+++ b/sdk/python/tests/integration/feature_repos/universal/feature_views.py
@@ -5,8 +5,7 @@
 
 
 def driver_feature_view(
-    data_source: DataSource, name="test_correctness",
-    infer_features: bool = False,
+    data_source: DataSource, name="test_correctness", infer_features: bool = False,
 ) -> FeatureView:
     return FeatureView(
         name=name,
@@ -17,12 +16,13 @@ )
 
 
-def create_driver_hourly_stats_feature_view(source,
-                                            infer_features: bool = False):
+def create_driver_hourly_stats_feature_view(source, infer_features: bool = False):
     driver_stats_feature_view = FeatureView(
         name="driver_stats",
         entities=["driver"],
-        features=None if infer_features else [
+        features=None
+        if infer_features
+        else [
             Feature(name="conv_rate", dtype=ValueType.FLOAT),
             Feature(name="acc_rate", dtype=ValueType.FLOAT),
             Feature(name="avg_daily_trips", dtype=ValueType.INT32),
@@ -31,11 +33,12 @@ def create_driver_hourly_stats_feature_view(source,
     return driver_stats_feature_view
 
 
-def create_customer_daily_profile_feature_view(source,
-                                               infer_features: bool = False):
+def create_customer_daily_profile_feature_view(source, infer_features: bool = False):
     customer_profile_feature_view = FeatureView(
         name="customer_profile",
         entities=["customer_id"],
-        features=None if infer_features else [
+        features=None
+        if infer_features
+        else [
             Feature(name="current_balance", dtype=ValueType.FLOAT),
             Feature(name="avg_passenger_count", dtype=ValueType.FLOAT),
             Feature(name="lifetime_trip_count", dtype=ValueType.INT32),

From 1abbaa84d59aaa1774abffa0fe43dbf7573e9b36 Mon Sep 17 00:00:00 2001
From: Achal Shah
Date: Thu, 26 Aug 2021 14:09:16 -0700
Subject: [PATCH 25/51] C901 '_python_value_to_proto_value' is too complex :(

Signed-off-by: Achal Shah
---
 sdk/python/feast/type_map.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sdk/python/feast/type_map.py b/sdk/python/feast/type_map.py
index 4cc5345262..3ea6cda968 100644
--- a/sdk/python/feast/type_map.py
+++ b/sdk/python/feast/type_map.py
@@ -164,7 +164,7 @@ def _type_err(item, dtype):
     raise ValueError(f'Value "{item}" is of type {type(item)} not of type {dtype}')
 
 
-def _python_value_to_proto_value(feast_value_type, value) -> ProtoValue:
+def _python_value_to_proto_value(feast_value_type, value) -> ProtoValue:  # noqa: C901
     """
     Converts a Python (native, pandas) value to a Feast Proto Value based
     on a provided value type

From 21996b786a00b816ef0d0926798a1d5640b77a06 Mon Sep 17 00:00:00 2001
From: Achal Shah
Date: Thu, 26 Aug 2021 23:20:35 -0700
Subject: [PATCH 26/51] Split out construct_test_environment and
 construct_universal_test_environment

Signed-off-by: Achal Shah
---
 sdk/python/feast/infra/provider.py            |   3 +-
 sdk/python/tests/conftest.py                  |  29 ++--
 .../integration/e2e/test_universal_e2e.py     |  47 ++++--
 .../feature_repos/test_repo_configuration.py  | 134 +++++++++---------
 .../universal/data_sources/bigquery.py        |  23 ++-
 .../universal/data_sources/file.py            |   2 +-
 6 files changed, 129 insertions(+), 109 deletions(-)

diff --git
def _get_column_names( for col in feature_names ] _feature_names = set(feature_names) - set(join_keys) - _feature_names = _feature_names - {event_timestamp_column} - _feature_names = _feature_names - {created_timestamp_column} + _feature_names = _feature_names - {event_timestamp_column, created_timestamp_column} return ( join_keys, list(_feature_names), diff --git a/sdk/python/tests/conftest.py b/sdk/python/tests/conftest.py index 7147e26dee..ee193ea419 100644 --- a/sdk/python/tests/conftest.py +++ b/sdk/python/tests/conftest.py @@ -14,11 +14,13 @@ import multiprocessing from datetime import datetime, timedelta from sys import platform -from typing import Any, Dict +from typing import Any, Callable, Dict, Tuple import pandas as pd import pytest +from feast.data_source import DataSource + def pytest_configure(config): if platform in ["darwin", "windows"]: @@ -90,26 +92,19 @@ def simple_dataset_2() -> pd.DataFrame: return pd.DataFrame.from_dict(data) -class DataSourceCache: - cache: Dict[str, Any] = {} - - def get(self, test_repo_config): - return self.cache.get(test_repo_config.offline_store_creator, None) - - def put( - self, test_repo_config, entites, datasets, data_sources, data_source_creator - ): - self.cache[test_repo_config.offline_store_creator] = ( - entites, - datasets, - data_sources, - data_source_creator, - ) +class DataSourceCache(dict): + def get_or_create(self, key, c: Callable[[], Any]): + if key in self: + return self[key] + else: + v = c() + self[key] = v + return v @pytest.fixture(scope="session", autouse=True) def data_source_cache(): dsc = DataSourceCache() yield dsc - for _, v in dsc.cache.items(): + for _, v in dsc.items(): v[3].teardown() diff --git a/sdk/python/tests/integration/e2e/test_universal_e2e.py b/sdk/python/tests/integration/e2e/test_universal_e2e.py index 0d2a62322a..2992dcc6d7 100644 --- a/sdk/python/tests/integration/e2e/test_universal_e2e.py +++ b/sdk/python/tests/integration/e2e/test_universal_e2e.py @@ -3,30 +3,49 @@ from typing import Optional import pandas as pd +import pytest from pytz import utc from feast import FeatureStore, FeatureView +from tests.conftest import DataSourceCache +from tests.data.data_creator import create_dataset from tests.integration.feature_repos.test_repo_configuration import ( - Environment, - parametrize_e2e_test, + FULL_REPO_CONFIGS, + construct_test_environment, + vary_infer_feature, ) from tests.integration.feature_repos.universal.entities import driver from tests.integration.feature_repos.universal.feature_views import driver_feature_view -@parametrize_e2e_test -def test_e2e_consistency(test_environment: Environment): - infer_features = test_environment.test_repo_config.infer_features - fs, fv = ( - test_environment.feature_store, - driver_feature_view( - data_source=test_environment.data_source, infer_features=infer_features - ), - ) - entity = driver() - fs.apply([fv, entity]) +@pytest.mark.integration +@pytest.mark.parametrize( + "config", vary_infer_feature(FULL_REPO_CONFIGS), ids=lambda v: str(v) +) +def test_e2e_consistency(config, data_source_cache: DataSourceCache): + df = create_dataset() + test_suite_name = "test_e2e_consistency" + with construct_test_environment(test_suite_name, config) as test_environment: + fs = test_environment.feature_store + infer_features = test_environment.test_repo_config.infer_features + key = f"test_e2e_consistency_{test_environment.test_repo_config.offline_store_creator}" + _, _, data_source, _ = data_source_cache.get_or_create( + key, + lambda: ( + None, + df, + 
test_environment.data_source_creator.create_data_source( + df, fs.project, field_mapping={"ts_1": "ts", "id": "driver_id"} + ), + test_environment.data_source_creator, + ), + ) + fv = driver_feature_view(data_source=data_source, infer_features=infer_features) + + entity = driver() + fs.apply([fv, entity]) - run_offline_online_store_consistency_test(fs, fv) + run_offline_online_store_consistency_test(fs, fv) def check_offline_and_online_features( diff --git a/sdk/python/tests/integration/feature_repos/test_repo_configuration.py b/sdk/python/tests/integration/feature_repos/test_repo_configuration.py index 92e90cb122..e6bb9f441b 100644 --- a/sdk/python/tests/integration/feature_repos/test_repo_configuration.py +++ b/sdk/python/tests/integration/feature_repos/test_repo_configuration.py @@ -12,7 +12,6 @@ from feast import FeatureStore, FeatureView, RepoConfig, driver_test_data, importer from feast.data_source import DataSource -from tests.data.data_creator import create_dataset from tests.integration.feature_repos.universal.data_source_creator import ( DataSourceCreator, ) @@ -187,7 +186,6 @@ class Environment: name: str test_repo_config: TestRepoConfig feature_store: FeatureStore - data_source: DataSource data_source_creator: DataSourceCreator entities: Dict[str, List[Any]] = field(default_factory=dict) @@ -267,26 +265,10 @@ class EnvironmentSetupSteps(IntEnum): @contextmanager -def construct_universal_test_environment( - test_repo_config: TestRepoConfig, - stop_at_step=DEFAULT_STEP, - data_source_cache=None, - **kwargs, +def construct_test_environment( + test_suite_name: str, test_repo_config: TestRepoConfig, ) -> Environment: - """ - This method should take in the parameters from the test repo config and created a feature repo, apply it, - and return the constructed feature store object to callers. - - This feature store object can be interacted for the purposes of tests. - The user is *not* expected to perform any clean up actions. - - :param test_repo_config: configuration - :param stop_at_step: The step which should be the last one executed when setting up the test environment. - :param data_source_cache: - :return: A feature store built using the supplied configuration. - """ - - project = f"test_correctness_{str(uuid.uuid4()).replace('-', '')[:8]}" + project = f"{test_suite_name}_{str(uuid.uuid4()).replace('-', '')[:8]}" module_name, config_class_name = test_repo_config.offline_store_creator.rsplit( ".", 1 @@ -296,12 +278,6 @@ def construct_universal_test_environment( module_name, config_class_name, "DataSourceCreator" )(project) - # This needs to be abstracted away for test_e2e_universal which uses a different dataset. - df = create_dataset() - ds = offline_creator.create_data_source( - df, project, field_mapping={"ts_1": "ts", "id": "driver_id"} - ) - offline_store = offline_creator.create_offline_store_config() online_store = test_repo_config.online_store @@ -319,16 +295,44 @@ def construct_universal_test_environment( name=project, test_repo_config=test_repo_config, feature_store=fs, - data_source=ds, data_source_creator=offline_creator, ) + try: + yield environment + finally: + fs.teardown() + + +@contextmanager +def construct_universal_test_environment( + test_suite_name: str, + test_repo_config: TestRepoConfig, + stop_at_step=DEFAULT_STEP, + data_source_cache=None, + **kwargs, +) -> Environment: + """ + This method should take in the parameters from the test repo config and created a feature repo, apply it, + and return the constructed feature store object to callers. 
+ + This feature store object can be interacted for the purposes of tests. + The user is *not* expected to perform any clean up actions. + + :param test_suite_name: A name for the test suite. + :param test_repo_config: configuration + :param stop_at_step: The step which should be the last one executed when setting up the test environment. + :param data_source_cache: + :return: A feature store built using the supplied configuration. + """ + with construct_test_environment(test_suite_name, test_repo_config) as environment: + fs = environment.feature_store fvs = [] entities = [] try: if stop_at_step >= EnvironmentSetupSteps.CREATE_OBJECTS: - if data_source_cache: - fixtures = data_source_cache.get(test_repo_config) + if data_source_cache is not None: + fixtures = data_source_cache.get(test_repo_config, None) if fixtures: environment = setup_entities( environment, entities_override=fixtures[0] @@ -340,20 +344,39 @@ def construct_universal_test_environment( environment, data_sources_override=fixtures[2] ) else: - environment = setup_entities(environment) - environment = setup_datasets(environment) - environment = setup_data_sources(environment) - data_source_cache.put( - test_repo_config, + environment = setup_entities( + environment, + entities_override=kwargs.get("entites_override", None), + ) + environment = setup_datasets( + environment, + datasets_override=kwargs.get("datasets_overrides", None), + ) + environment = setup_data_sources( + environment, + data_sources_override=kwargs.get( + "data_sources_override", None + ), + ) + data_source_cache[test_repo_config] = ( environment.entities, environment.datasets, environment.datasources, - offline_creator, + environment.data_source_creator, ) else: - environment = setup_entities(environment) - environment = setup_datasets(environment) - environment = setup_data_sources(environment) + environment = setup_entities( + environment, + entities_override=kwargs.get("entites_override", None), + ) + environment = setup_datasets( + environment, + datasets_override=kwargs.get("datasets_overrides", None), + ) + environment = setup_data_sources( + environment, + data_sources_override=kwargs.get("data_sources_override", None), + ) environment = setup_feature_views(environment) if stop_at_step >= EnvironmentSetupSteps.APPLY_OBJECTS: @@ -366,37 +389,10 @@ def construct_universal_test_environment( yield environment finally: if ( - not data_source_cache + data_source_cache is None and stop_at_step >= EnvironmentSetupSteps.CREATE_OBJECTS ): - offline_creator.teardown() - fs.teardown() - - -def parametrize_e2e_test(e2e_test): - """ - This decorator should be used for end-to-end tests. These tests are expected to be parameterized, - and receive an empty feature repo created for all supported configurations. - - The decorator also ensures that sample data needed for the test is available in the relevant offline store. - - Decorated tests should create and apply the objects needed by the tests, and perform any operations needed - (such as materialization and looking up feature values). - - The decorator takes care of tearing down the feature store, as well as the sample data. 
- """ - - @pytest.mark.integration - @pytest.mark.parametrize( - "config", vary_infer_feature(FULL_REPO_CONFIGS), ids=lambda v: str(v) - ) - def inner_test(config, data_source_cache): - with construct_universal_test_environment( - config, data_source_cache=data_source_cache - ) as environment: - e2e_test(environment) - - return inner_test + environment.data_source_creator.teardown() def parametrize_offline_retrieval_test(offline_retrieval_test): @@ -419,6 +415,7 @@ def parametrize_offline_retrieval_test(offline_retrieval_test): @pytest.mark.parametrize("config", configs, ids=lambda v: str(v)) def inner_test(config, data_source_cache): with construct_universal_test_environment( + offline_retrieval_test.__name__, config, stop_at_step=EnvironmentSetupSteps.APPLY_OBJECTS, data_source_cache=data_source_cache, @@ -446,6 +443,7 @@ def parametrize_online_test(online_test): @pytest.mark.parametrize("config", configs, ids=lambda v: str(v)) def inner_test(config, data_source_cache): with construct_universal_test_environment( + online_test.__name__, config, stop_at_step=EnvironmentSetupSteps.MATERIALIZE, data_source_cache=data_source_cache, diff --git a/sdk/python/tests/integration/feature_repos/universal/data_sources/bigquery.py b/sdk/python/tests/integration/feature_repos/universal/data_sources/bigquery.py index a38e03143b..5c888ca113 100644 --- a/sdk/python/tests/integration/feature_repos/universal/data_sources/bigquery.py +++ b/sdk/python/tests/integration/feature_repos/universal/data_sources/bigquery.py @@ -2,6 +2,7 @@ import pandas as pd from google.cloud import bigquery +from google.cloud.bigquery import Dataset from feast import BigQuerySource from feast.data_source import DataSource @@ -12,21 +13,27 @@ class BigQueryDataSourceCreator(DataSourceCreator): + dataset: Optional[Dataset] = None + def __init__(self, project_name: str): self.client = bigquery.Client() self.project_name = project_name self.gcp_project = self.client.project self.dataset_id = f"{self.gcp_project}.{project_name}" - self.dataset = bigquery.Dataset(self.dataset_id) - print(f"Creating dataset: {self.dataset_id}") - self.client.create_dataset(self.dataset, exists_ok=True) - self.dataset.default_table_expiration_ms = ( - 1000 * 60 * 60 * 24 * 14 - ) # 2 weeks in milliseconds - self.client.update_dataset(self.dataset, ["default_table_expiration_ms"]) self.tables = [] + def get_dataset(self): + if not self.dataset: + self.dataset = bigquery.Dataset(self.dataset_id) + print(f"Creating dataset: {self.dataset_id}") + self.client.create_dataset(self.dataset, exists_ok=True) + self.dataset.default_table_expiration_ms = ( + 1000 * 60 * 60 * 24 * 14 + ) # 2 weeks in milliseconds + self.client.update_dataset(self.dataset, ["default_table_expiration_ms"]) + return self.dataset + def teardown(self): for table in self.tables: @@ -55,6 +62,8 @@ def create_data_source( if not destination: destination = self.get_prefixed_table_name(suffix) + self.get_dataset() + job_config = bigquery.LoadJobConfig() if self.gcp_project not in destination: destination = f"{self.gcp_project}.{self.project_name}.{destination}" diff --git a/sdk/python/tests/integration/feature_repos/universal/data_sources/file.py b/sdk/python/tests/integration/feature_repos/universal/data_sources/file.py index cd86740e6a..48630dd3f9 100644 --- a/sdk/python/tests/integration/feature_repos/universal/data_sources/file.py +++ b/sdk/python/tests/integration/feature_repos/universal/data_sources/file.py @@ -38,7 +38,7 @@ def create_data_source( destination = 
self.get_prefixed_table_name(suffix) f = tempfile.NamedTemporaryFile( - prefix=self.project_name, suffix=".parquet", delete=False + prefix=f"{self.project_name}_{destination}", suffix=".parquet", delete=False ) df.to_parquet(f.name) self.files.append(f) From 4541561130fda3910ba7a758ac3d56ac2df490d3 Mon Sep 17 00:00:00 2001 From: Achal Shah Date: Thu, 26 Aug 2021 23:23:11 -0700 Subject: [PATCH 27/51] remove import Signed-off-by: Achal Shah --- sdk/python/tests/conftest.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/sdk/python/tests/conftest.py b/sdk/python/tests/conftest.py index ee193ea419..57a47c7f17 100644 --- a/sdk/python/tests/conftest.py +++ b/sdk/python/tests/conftest.py @@ -14,13 +14,11 @@ import multiprocessing from datetime import datetime, timedelta from sys import platform -from typing import Any, Callable, Dict, Tuple +from typing import Any, Callable import pandas as pd import pytest -from feast.data_source import DataSource - def pytest_configure(config): if platform in ["darwin", "windows"]: From 2f20b5c615bfe2a1fbec4938423f5dadf3c00a5a Mon Sep 17 00:00:00 2001 From: Achal Shah Date: Thu, 26 Aug 2021 23:33:01 -0700 Subject: [PATCH 28/51] add unsafe_hash Signed-off-by: Achal Shah --- sdk/python/tests/integration/e2e/test_universal_e2e.py | 2 +- .../tests/integration/feature_repos/test_repo_configuration.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/sdk/python/tests/integration/e2e/test_universal_e2e.py b/sdk/python/tests/integration/e2e/test_universal_e2e.py index 2992dcc6d7..3362b45466 100644 --- a/sdk/python/tests/integration/e2e/test_universal_e2e.py +++ b/sdk/python/tests/integration/e2e/test_universal_e2e.py @@ -28,7 +28,7 @@ def test_e2e_consistency(config, data_source_cache: DataSourceCache): with construct_test_environment(test_suite_name, config) as test_environment: fs = test_environment.feature_store infer_features = test_environment.test_repo_config.infer_features - key = f"test_e2e_consistency_{test_environment.test_repo_config.offline_store_creator}" + key = f"{test_suite_name}_{test_environment.test_repo_config.offline_store_creator}" _, _, data_source, _ = data_source_cache.get_or_create( key, lambda: ( diff --git a/sdk/python/tests/integration/feature_repos/test_repo_configuration.py b/sdk/python/tests/integration/feature_repos/test_repo_configuration.py index e6bb9f441b..52d01bcbf7 100644 --- a/sdk/python/tests/integration/feature_repos/test_repo_configuration.py +++ b/sdk/python/tests/integration/feature_repos/test_repo_configuration.py @@ -22,7 +22,7 @@ ) -@dataclass(frozen=True, repr=True) +@dataclass(frozen=True, repr=True, unsafe_hash=True) class TestRepoConfig: """ This class should hold all possible parameters that may need to be varied by individual tests. 
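The unsafe_hash=True added above matters because TestRepoConfig is being used as a dictionary key for the data source cache: a dataclass-generated __hash__ delegates to the field values, and the online_store field may hold a plain dict, which Python cannot hash. A minimal sketch of the pitfall, with StubConfig as an illustrative stand-in rather than the real TestRepoConfig:

from dataclasses import dataclass
from typing import Dict, Union


# Stand-in for TestRepoConfig; field names mirror the real class, everything
# else here is illustrative.
@dataclass(frozen=True, repr=True, unsafe_hash=True)
class StubConfig:
    provider: str = "local"
    # A Union field that may hold a dict makes instances unhashable at lookup
    # time, even though the dataclass itself defines __hash__.
    online_store: Union[str, Dict] = "sqlite"


cache = {}
cache[StubConfig()] = "cached"  # str-valued field hashes fine

try:
    cache[StubConfig(online_store={"type": "redis"})] = "cached"
except TypeError as err:
    print(err)  # unhashable type: 'dict'

This is presumably why PATCH 30 below ends up keying the cache by the offline_store_creator string rather than by the config object itself.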
From 6cc32b520ad837e1c89f78e8cca0f8f959733e7d Mon Sep 17 00:00:00 2001 From: Achal Shah Date: Thu, 26 Aug 2021 23:48:18 -0700 Subject: [PATCH 29/51] Update testrepoconfig Signed-off-by: Achal Shah --- .../feature_repos/test_repo_configuration.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/sdk/python/tests/integration/feature_repos/test_repo_configuration.py b/sdk/python/tests/integration/feature_repos/test_repo_configuration.py index 52d01bcbf7..ea81bc587a 100644 --- a/sdk/python/tests/integration/feature_repos/test_repo_configuration.py +++ b/sdk/python/tests/integration/feature_repos/test_repo_configuration.py @@ -22,14 +22,14 @@ ) -@dataclass(frozen=True, repr=True, unsafe_hash=True) +@dataclass(repr=True) class TestRepoConfig: """ This class should hold all possible parameters that may need to be varied by individual tests. """ provider: str = "local" - online_store: Union[str, Dict] = "sqlite" + online_store: Union[str, Dict] = field(default="sqlite", hash=False) offline_store_creator: str = "tests.integration.feature_repos.universal.data_sources.file.FileDataSourceCreator" @@ -37,6 +37,13 @@ class TestRepoConfig: infer_event_timestamp_col: bool = True infer_features: bool = False + def __post_init__(self): + self.online_store_type = ( + self.online_store + if isinstance(self.online_store, str) + else self.online_store.get("type", "unknown") + ) + def ds_creator_path(cls: str): return f"tests.integration.feature_repos.universal.data_sources.{cls}" From 68f299706ba11f5b9cf918472758554dd38e8ef0 Mon Sep 17 00:00:00 2001 From: Achal Shah Date: Thu, 26 Aug 2021 23:52:41 -0700 Subject: [PATCH 30/51] Update testrepoconfig Signed-off-by: Achal Shah --- .../feature_repos/test_repo_configuration.py | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/sdk/python/tests/integration/feature_repos/test_repo_configuration.py b/sdk/python/tests/integration/feature_repos/test_repo_configuration.py index ea81bc587a..9ce7c91345 100644 --- a/sdk/python/tests/integration/feature_repos/test_repo_configuration.py +++ b/sdk/python/tests/integration/feature_repos/test_repo_configuration.py @@ -22,14 +22,14 @@ ) -@dataclass(repr=True) +@dataclass(frozen=True, repr=True) class TestRepoConfig: """ This class should hold all possible parameters that may need to be varied by individual tests. 
""" provider: str = "local" - online_store: Union[str, Dict] = field(default="sqlite", hash=False) + online_store: Union[str, Dict] = "sqlite" offline_store_creator: str = "tests.integration.feature_repos.universal.data_sources.file.FileDataSourceCreator" @@ -37,13 +37,6 @@ class TestRepoConfig: infer_event_timestamp_col: bool = True infer_features: bool = False - def __post_init__(self): - self.online_store_type = ( - self.online_store - if isinstance(self.online_store, str) - else self.online_store.get("type", "unknown") - ) - def ds_creator_path(cls: str): return f"tests.integration.feature_repos.universal.data_sources.{cls}" @@ -339,7 +332,9 @@ def construct_universal_test_environment( try: if stop_at_step >= EnvironmentSetupSteps.CREATE_OBJECTS: if data_source_cache is not None: - fixtures = data_source_cache.get(test_repo_config, None) + fixtures = data_source_cache.get( + test_repo_config.offline_store_creator, None + ) if fixtures: environment = setup_entities( environment, entities_override=fixtures[0] @@ -365,7 +360,7 @@ def construct_universal_test_environment( "data_sources_override", None ), ) - data_source_cache[test_repo_config] = ( + data_source_cache[test_repo_config.offline_store_creator] = ( environment.entities, environment.datasets, environment.datasources, From a73868d5ddc68c40731539e4c80053054d4edaf6 Mon Sep 17 00:00:00 2001 From: Achal Shah Date: Fri, 27 Aug 2021 00:01:14 -0700 Subject: [PATCH 31/51] Remove kwargs from construct_universal_test_environment Signed-off-by: Achal Shah --- .../feature_repos/test_repo_configuration.py | 33 ++++--------------- 1 file changed, 6 insertions(+), 27 deletions(-) diff --git a/sdk/python/tests/integration/feature_repos/test_repo_configuration.py b/sdk/python/tests/integration/feature_repos/test_repo_configuration.py index 9ce7c91345..fbee272436 100644 --- a/sdk/python/tests/integration/feature_repos/test_repo_configuration.py +++ b/sdk/python/tests/integration/feature_repos/test_repo_configuration.py @@ -310,7 +310,6 @@ def construct_universal_test_environment( test_repo_config: TestRepoConfig, stop_at_step=DEFAULT_STEP, data_source_cache=None, - **kwargs, ) -> Environment: """ This method should take in the parameters from the test repo config and created a feature repo, apply it, @@ -346,20 +345,9 @@ def construct_universal_test_environment( environment, data_sources_override=fixtures[2] ) else: - environment = setup_entities( - environment, - entities_override=kwargs.get("entites_override", None), - ) - environment = setup_datasets( - environment, - datasets_override=kwargs.get("datasets_overrides", None), - ) - environment = setup_data_sources( - environment, - data_sources_override=kwargs.get( - "data_sources_override", None - ), - ) + environment = setup_entities(environment) + environment = setup_datasets(environment) + environment = setup_data_sources(environment) data_source_cache[test_repo_config.offline_store_creator] = ( environment.entities, environment.datasets, @@ -367,18 +355,9 @@ def construct_universal_test_environment( environment.data_source_creator, ) else: - environment = setup_entities( - environment, - entities_override=kwargs.get("entites_override", None), - ) - environment = setup_datasets( - environment, - datasets_override=kwargs.get("datasets_overrides", None), - ) - environment = setup_data_sources( - environment, - data_sources_override=kwargs.get("data_sources_override", None), - ) + environment = setup_entities(environment) + environment = setup_datasets(environment) + environment = 
setup_data_sources(environment)
 
             environment = setup_feature_views(environment)
         if stop_at_step >= EnvironmentSetupSteps.APPLY_OBJECTS:

From ff9d49dfd2a4330e766955465d08d97ad31fa1d8 Mon Sep 17 00:00:00 2001
From: Achal Shah
Date: Fri, 27 Aug 2021 10:42:08 -0700
Subject: [PATCH 32/51] Remove unneeded method

Signed-off-by: Achal Shah
---
 .../integration/feature_repos/test_repo_configuration.py | 4 ++--
 .../feature_repos/universal/data_source_creator.py       | 4 ----
 2 files changed, 2 insertions(+), 6 deletions(-)

diff --git a/sdk/python/tests/integration/feature_repos/test_repo_configuration.py b/sdk/python/tests/integration/feature_repos/test_repo_configuration.py
index fbee272436..641017ac76 100644
--- a/sdk/python/tests/integration/feature_repos/test_repo_configuration.py
+++ b/sdk/python/tests/integration/feature_repos/test_repo_configuration.py
@@ -278,7 +278,7 @@ def construct_test_environment(
         module_name, config_class_name, "DataSourceCreator"
     )(project)
 
-    offline_store = offline_creator.create_offline_store_config()
+    offline_store_config = offline_creator.create_offline_store_config()
     online_store = test_repo_config.online_store
 
     with tempfile.TemporaryDirectory() as repo_dir_name:
@@ -286,7 +286,7 @@ def construct_test_environment(
             registry=str(Path(repo_dir_name) / "registry.db"),
             project=project,
             provider=test_repo_config.provider,
-            offline_store=offline_store,
+            offline_store=offline_store_config,
             online_store=online_store,
             repo_path=repo_dir_name,
         )
diff --git a/sdk/python/tests/integration/feature_repos/universal/data_source_creator.py b/sdk/python/tests/integration/feature_repos/universal/data_source_creator.py
index feb5ae5e98..4385adab85 100644
--- a/sdk/python/tests/integration/feature_repos/universal/data_source_creator.py
+++ b/sdk/python/tests/integration/feature_repos/universal/data_source_creator.py
@@ -27,7 +27,3 @@ def create_offline_store_config(self) -> FeastConfigBaseModel:
     @abstractmethod
     def teardown(self):
         ...
-
-    @abstractmethod
-    def get_prefixed_table_name(self, table_name: str) -> str:
-        ...

From b95c3ee389018a17e105e21a83999713e98616be Mon Sep 17 00:00:00 2001
From: Achal Shah
Date: Fri, 27 Aug 2021 12:26:46 -0700
Subject: [PATCH 33/51] Docs

Signed-off-by: Achal Shah
---
 sdk/python/tests/conftest.py                  | 15 +++++++++++++++
 .../integration/e2e/test_universal_e2e.py     |  2 +-
 .../feature_repos/test_repo_configuration.py  |  6 +++---
 .../universal/data_source_creator.py          | 19 ++++++++++++++++---
 .../universal/data_sources/bigquery.py        |  5 +----
 .../universal/data_sources/file.py            |  7 ++-----
 .../universal/data_sources/redshift.py        |  6 ++----
 7 files changed, 40 insertions(+), 20 deletions(-)

diff --git a/sdk/python/tests/conftest.py b/sdk/python/tests/conftest.py
index 57a47c7f17..4cc8bcec1a 100644
--- a/sdk/python/tests/conftest.py
+++ b/sdk/python/tests/conftest.py
@@ -91,6 +91,21 @@ def simple_dataset_2() -> pd.DataFrame:
 
 
 class DataSourceCache(dict):
+    """
+    DataSourceCache is meant to cache datasources to be reused across multiple tests.
+
+    Staging data into a remote store is an expensive operation, so we expose a way to share
+    the created datasources between tests.
+    The key for the cache is an arbitrary string, which should be constructed using a combination
+    of the test name and the backing store.
+    The value for the cache is a 4-element tuple:
+    - The entities that are meant to be used with the data source created.
+    - The dataframe that is persisted into the remote store.
+    - The data source object that represents the table/file; created from the dataframe mentioned above.
+    - The data_source_creator object that is used to upload the dataframe and construct the data source object.
+      This object is needed so that we can invoke its teardown method at the end of all our tests.
+    """
+
     def get_or_create(self, key, c: Callable[[], Any]):
         if key in self:
             return self[key]
diff --git a/sdk/python/tests/integration/e2e/test_universal_e2e.py b/sdk/python/tests/integration/e2e/test_universal_e2e.py
index 3362b45466..57d2d361ce 100644
--- a/sdk/python/tests/integration/e2e/test_universal_e2e.py
+++ b/sdk/python/tests/integration/e2e/test_universal_e2e.py
@@ -24,7 +24,7 @@ )
 def test_e2e_consistency(config, data_source_cache: DataSourceCache):
     df = create_dataset()
-    test_suite_name = "test_e2e_consistency"
+    test_suite_name = test_e2e_consistency.__name__
     with construct_test_environment(test_suite_name, config) as test_environment:
         fs = test_environment.feature_store
         infer_features = test_environment.test_repo_config.infer_features
diff --git a/sdk/python/tests/integration/feature_repos/test_repo_configuration.py b/sdk/python/tests/integration/feature_repos/test_repo_configuration.py
index 641017ac76..e5a40c0b0e 100644
--- a/sdk/python/tests/integration/feature_repos/test_repo_configuration.py
+++ b/sdk/python/tests/integration/feature_repos/test_repo_configuration.py
@@ -102,19 +102,19 @@ def construct_universal_data_sources(
 ) -> Dict[str, DataSource]:
     customer_ds = data_source_creator.create_data_source(
         datasets["customer"],
-        suffix="customer_profile",
+        destination="customer_profile",
         event_timestamp_column="event_timestamp",
         created_timestamp_column="created",
     )
     driver_ds = data_source_creator.create_data_source(
         datasets["driver"],
-        suffix="driver_hourly",
+        destination="driver_hourly",
         event_timestamp_column="event_timestamp",
         created_timestamp_column="created",
     )
     orders_ds = data_source_creator.create_data_source(
         datasets["orders"],
-        suffix="orders",
+        destination="orders",
         event_timestamp_column="event_timestamp",
         created_timestamp_column="created",
     )
diff --git a/sdk/python/tests/integration/feature_repos/universal/data_source_creator.py b/sdk/python/tests/integration/feature_repos/universal/data_source_creator.py
index 4385adab85..d65a91bd2f 100644
--- a/sdk/python/tests/integration/feature_repos/universal/data_source_creator.py
+++ b/sdk/python/tests/integration/feature_repos/universal/data_source_creator.py
@@ -1,5 +1,5 @@
 from abc import ABC, abstractmethod
-from typing import Dict, Optional
+from typing import Dict
 
 import pandas as pd
 
@@ -12,12 +12,25 @@ class DataSourceCreator(ABC):
     def create_data_source(
         self,
         df: pd.DataFrame,
-        destination: Optional[str] = None,
-        suffix: Optional[str] = None,
+        destination: str,
        event_timestamp_column="ts",
         created_timestamp_column="created_ts",
         field_mapping: Dict[str, str] = None,
     ) -> DataSource:
+        """
+        Create a data source based on the dataframe. Implementing this method requires the underlying
+        implementation to persist the dataframe in the offline store, using the destination string as
+        a way to differentiate multiple dataframes and data sources.
+
+        :param df: The dataframe to be used to create the data source.
+        :param destination: The destination that this dataframe is meant for. This string is used by the
+            implementing classes to isolate multiple dataframes from each other.
+        :param event_timestamp_column: Pass through for the underlying data source.
diff --git a/sdk/python/tests/integration/feature_repos/universal/data_sources/bigquery.py b/sdk/python/tests/integration/feature_repos/universal/data_sources/bigquery.py
index 5c888ca113..0863d7bd25 100644
--- a/sdk/python/tests/integration/feature_repos/universal/data_sources/bigquery.py
+++ b/sdk/python/tests/integration/feature_repos/universal/data_sources/bigquery.py
@@ -51,16 +51,13 @@ def create_data_source(
         self,
         df: pd.DataFrame,
         destination: Optional[str] = None,
-        suffix: Optional[str] = None,
         event_timestamp_column="ts",
         created_timestamp_column="created_ts",
         field_mapping: Dict[str, str] = None,
         **kwargs,
     ) -> DataSource:
 
-        assert destination or suffix
-        if not destination:
-            destination = self.get_prefixed_table_name(suffix)
+        destination = self.get_prefixed_table_name(destination)
 
         self.get_dataset()
 
diff --git a/sdk/python/tests/integration/feature_repos/universal/data_sources/file.py b/sdk/python/tests/integration/feature_repos/universal/data_sources/file.py
index 48630dd3f9..7d204ed7b1 100644
--- a/sdk/python/tests/integration/feature_repos/universal/data_sources/file.py
+++ b/sdk/python/tests/integration/feature_repos/universal/data_sources/file.py
@@ -26,16 +26,13 @@ def __init__(self, project_name: str):
     def create_data_source(
         self,
         df: pd.DataFrame,
-        destination: Optional[str] = None,
-        suffix: Optional[str] = None,
+        destination: str,
         event_timestamp_column="ts",
         created_timestamp_column="created_ts",
         field_mapping: Dict[str, str] = None,
     ) -> DataSource:
 
-        assert destination or suffix
-        if not destination:
-            destination = self.get_prefixed_table_name(suffix)
+        destination = self.get_prefixed_table_name(destination)
 
         f = tempfile.NamedTemporaryFile(
             prefix=f"{self.project_name}_{destination}", suffix=".parquet", delete=False
diff --git a/sdk/python/tests/integration/feature_repos/universal/data_sources/redshift.py b/sdk/python/tests/integration/feature_repos/universal/data_sources/redshift.py
index 34d4cfd4a5..528e0261f8 100644
--- a/sdk/python/tests/integration/feature_repos/universal/data_sources/redshift.py
+++ b/sdk/python/tests/integration/feature_repos/universal/data_sources/redshift.py
@@ -34,16 +34,14 @@ def __init__(self, project_name: str):
     def create_data_source(
         self,
         df: pd.DataFrame,
-        destination: Optional[str] = None,
+        destination: str,
         suffix: Optional[str] = None,
         event_timestamp_column="ts",
         created_timestamp_column="created_ts",
         field_mapping: Dict[str, str] = None,
     ) -> DataSource:
 
-        assert destination or suffix
-        if not destination:
-            destination = self.get_prefixed_table_name(suffix)
+        destination = self.get_prefixed_table_name(destination)
 
         aws_utils.upload_df_to_redshift(
             self.client,

From 3e5aada09083822ede24d546b86448438f3b92cf Mon Sep 17 00:00:00 2001
From: Achal Shah
Date: Fri, 27 Aug 2021 13:04:34 -0700
Subject: [PATCH 34/51] Kill skipped tests

Signed-off-by: Achal Shah
---
 .../test_offline_online_store_consistency.py |  15 -
 .../test_historical_retrieval.py             | 559 ------------------
 2 files changed, 574 deletions(-)

diff --git a/sdk/python/tests/integration/materialization/test_offline_online_store_consistency.py b/sdk/python/tests/integration/materialization/test_offline_online_store_consistency.py
index ecb4c82bd3..fbca6e9856 100644
---
a/sdk/python/tests/integration/materialization/test_offline_online_store_consistency.py +++ b/sdk/python/tests/integration/materialization/test_offline_online_store_consistency.py @@ -382,21 +382,6 @@ def run_offline_online_store_consistency_test( ) -@pytest.mark.skip( - reason="This is the most expensive test and is subsumed by the universal tests." -) -@pytest.mark.integration -@pytest.mark.parametrize( - "bq_source_type", ["query", "table"], -) -@pytest.mark.parametrize("full_feature_names", [True, False]) -def test_bq_offline_online_store_consistency( - bq_source_type: str, full_feature_names: bool -): - with prep_bq_fs_and_fv(bq_source_type) as (fs, fv): - run_offline_online_store_consistency_test(fs, fv, full_feature_names) - - @pytest.mark.parametrize("full_feature_names", [True, False]) @pytest.mark.integration def test_redis_offline_online_store_consistency(full_feature_names: bool): diff --git a/sdk/python/tests/integration/offline_store/test_historical_retrieval.py b/sdk/python/tests/integration/offline_store/test_historical_retrieval.py index 26b6b335dc..63b720d3df 100644 --- a/sdk/python/tests/integration/offline_store/test_historical_retrieval.py +++ b/sdk/python/tests/integration/offline_store/test_historical_retrieval.py @@ -394,565 +394,6 @@ def test_historical_features_from_parquet_sources( store.teardown() -@pytest.mark.skip( - reason="This is the most expensive test and is subsumed by the universal tests." -) -@pytest.mark.integration -@pytest.mark.parametrize( - "provider_type", ["local", "gcp", "gcp_custom_offline_config"], -) -@pytest.mark.parametrize( - "infer_event_timestamp_col", [False, True], -) -@pytest.mark.parametrize( - "full_feature_names", [False, True], -) -def test_historical_features_from_bigquery_sources( - provider_type, infer_event_timestamp_col, capsys, full_feature_names -): - start_date = datetime.now().replace(microsecond=0, second=0, minute=0) - ( - customer_entities, - driver_entities, - end_date, - orders_df, - start_date, - ) = generate_entities(start_date, infer_event_timestamp_col) - - bigquery_dataset = ( - f"test_hist_retrieval_{int(time.time_ns())}_{random.randint(1000, 9999)}" - ) - - with BigQueryDataSet(bigquery_dataset), TemporaryDirectory() as temp_dir: - gcp_project = bigquery.Client().project - - # Orders Query - table_id = f"{bigquery_dataset}.orders" - stage_orders_bigquery(orders_df, table_id) - entity_df_query = f"SELECT * FROM {gcp_project}.{table_id}" - - # Driver Feature View - driver_df = driver_data.create_driver_hourly_stats_df( - driver_entities, start_date, end_date - ) - driver_table_id = f"{gcp_project}.{bigquery_dataset}.driver_hourly" - stage_driver_hourly_stats_bigquery_source(driver_df, driver_table_id) - driver_source = BigQuerySource( - table_ref=driver_table_id, - event_timestamp_column="event_timestamp", - created_timestamp_column="created", - ) - driver_fv = create_driver_hourly_stats_feature_view(driver_source) - - # Customer Feature View - customer_df = driver_data.create_customer_daily_profile_df( - customer_entities, start_date, end_date - ) - customer_table_id = f"{gcp_project}.{bigquery_dataset}.customer_profile" - - stage_customer_daily_profile_bigquery_source(customer_df, customer_table_id) - customer_source = BigQuerySource( - table_ref=customer_table_id, - event_timestamp_column="event_timestamp", - created_timestamp_column="created", - ) - customer_fv = create_customer_daily_profile_feature_view(customer_source) - - driver = Entity(name="driver", join_key="driver_id", value_type=ValueType.INT64) - 
customer = Entity(name="customer_id", value_type=ValueType.INT64) - - if provider_type == "local": - store = FeatureStore( - config=RepoConfig( - registry=os.path.join(temp_dir, "registry.db"), - project="default", - provider="local", - online_store=SqliteOnlineStoreConfig( - path=os.path.join(temp_dir, "online_store.db"), - ), - offline_store=BigQueryOfflineStoreConfig( - type="bigquery", dataset=bigquery_dataset - ), - ) - ) - elif provider_type == "gcp": - store = FeatureStore( - config=RepoConfig( - registry=os.path.join(temp_dir, "registry.db"), - project="".join( - random.choices(string.ascii_uppercase + string.digits, k=10) - ), - provider="gcp", - offline_store=BigQueryOfflineStoreConfig( - type="bigquery", dataset=bigquery_dataset - ), - ) - ) - elif provider_type == "gcp_custom_offline_config": - store = FeatureStore( - config=RepoConfig( - registry=os.path.join(temp_dir, "registry.db"), - project="".join( - random.choices(string.ascii_uppercase + string.digits, k=10) - ), - provider="gcp", - offline_store=BigQueryOfflineStoreConfig( - type="bigquery", dataset="foo" - ), - ) - ) - else: - raise Exception("Invalid provider used as part of test configuration") - - store.apply([driver, customer, driver_fv, customer_fv]) - - try: - event_timestamp = ( - DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL - if DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL in orders_df.columns - else "e_ts" - ) - expected_df = get_expected_training_df( - customer_df, - customer_fv, - driver_df, - driver_fv, - orders_df, - event_timestamp, - full_feature_names, - ) - - job_from_sql = store.get_historical_features( - entity_df=entity_df_query, - features=[ - "driver_stats:conv_rate", - "driver_stats:avg_daily_trips", - "customer_profile:current_balance", - "customer_profile:avg_passenger_count", - "customer_profile:lifetime_trip_count", - ], - full_feature_names=full_feature_names, - ) - - start_time = datetime.utcnow() - actual_df_from_sql_entities = job_from_sql.to_df() - end_time = datetime.utcnow() - with capsys.disabled(): - print( - str( - f"\nTime to execute job_from_sql.to_df() = '{(end_time - start_time)}'" - ) - ) - - assert sorted(expected_df.columns) == sorted( - actual_df_from_sql_entities.columns - ) - assert_frame_equal( - expected_df.sort_values( - by=[event_timestamp, "order_id", "driver_id", "customer_id"] - ).reset_index(drop=True), - actual_df_from_sql_entities[expected_df.columns] - .sort_values( - by=[event_timestamp, "order_id", "driver_id", "customer_id"] - ) - .reset_index(drop=True), - check_dtype=False, - ) - - table_from_sql_entities = job_from_sql.to_arrow() - assert_frame_equal( - actual_df_from_sql_entities, table_from_sql_entities.to_pandas() - ) - - timestamp_column = ( - "e_ts" - if infer_event_timestamp_col - else DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL - ) - - entity_df_query_with_invalid_join_key = ( - f"select order_id, driver_id, customer_id as customer, " - f"order_is_success, {timestamp_column}, FROM {gcp_project}.{table_id}" - ) - # Rename the join key; this should now raise an error. 
- assertpy.assert_that(store.get_historical_features).raises( - errors.FeastEntityDFMissingColumnsError - ).when_called_with( - entity_df=entity_df_query_with_invalid_join_key, - features=[ - "driver_stats:conv_rate", - "driver_stats:avg_daily_trips", - "customer_profile:current_balance", - "customer_profile:avg_passenger_count", - "customer_profile:lifetime_trip_count", - ], - ) - - job_from_df = store.get_historical_features( - entity_df=orders_df, - features=[ - "driver_stats:conv_rate", - "driver_stats:avg_daily_trips", - "customer_profile:current_balance", - "customer_profile:avg_passenger_count", - "customer_profile:lifetime_trip_count", - ], - full_feature_names=full_feature_names, - ) - - # Rename the join key; this should now raise an error. - orders_df_with_invalid_join_key = orders_df.rename( - {"customer_id": "customer"}, axis="columns" - ) - assertpy.assert_that(store.get_historical_features).raises( - errors.FeastEntityDFMissingColumnsError - ).when_called_with( - entity_df=orders_df_with_invalid_join_key, - features=[ - "driver_stats:conv_rate", - "driver_stats:avg_daily_trips", - "customer_profile:current_balance", - "customer_profile:avg_passenger_count", - "customer_profile:lifetime_trip_count", - ], - ) - - # Make sure that custom dataset name is being used from the offline_store config - if provider_type == "gcp_custom_offline_config": - assertpy.assert_that(job_from_df.query).contains("foo.feast_entity_df") - else: - assertpy.assert_that(job_from_df.query).contains( - f"{bigquery_dataset}.feast_entity_df" - ) - - start_time = datetime.utcnow() - actual_df_from_df_entities = job_from_df.to_df() - end_time = datetime.utcnow() - with capsys.disabled(): - print( - str( - f"Time to execute job_from_df.to_df() = '{(end_time - start_time)}'\n" - ) - ) - - assert sorted(expected_df.columns) == sorted( - actual_df_from_df_entities.columns - ) - assert_frame_equal( - expected_df.sort_values( - by=[event_timestamp, "order_id", "driver_id", "customer_id"] - ).reset_index(drop=True), - actual_df_from_df_entities[expected_df.columns] - .sort_values( - by=[event_timestamp, "order_id", "driver_id", "customer_id"] - ) - .reset_index(drop=True), - check_dtype=False, - ) - - table_from_df_entities = job_from_df.to_arrow() - assert_frame_equal( - actual_df_from_df_entities, table_from_df_entities.to_pandas() - ) - finally: - store.teardown() - - -@pytest.mark.skip( - reason="This is the most expensive test and is subsumed by the universal tests." 
-) -@pytest.mark.integration -@pytest.mark.parametrize( - "provider_type", ["local", "aws"], -) -@pytest.mark.parametrize( - "infer_event_timestamp_col", [False, True], -) -@pytest.mark.parametrize( - "full_feature_names", [False, True], -) -def test_historical_features_from_redshift_sources( - provider_type, infer_event_timestamp_col, capsys, full_feature_names -): - client = aws_utils.get_redshift_data_client("us-west-2") - s3 = aws_utils.get_s3_resource("us-west-2") - - offline_store = RedshiftOfflineStoreConfig( - cluster_id="feast-integration-tests", - region="us-west-2", - user="admin", - database="feast", - s3_staging_location="s3://feast-integration-tests/redshift/tests/ingestion", - iam_role="arn:aws:iam::402087665549:role/redshift_s3_access_role", - ) - - start_date = datetime.now().replace(microsecond=0, second=0, minute=0) - ( - customer_entities, - driver_entities, - end_date, - orders_df, - start_date, - ) = generate_entities(start_date, infer_event_timestamp_col) - - redshift_table_prefix = ( - f"test_hist_retrieval_{int(time.time_ns())}_{random.randint(1000, 9999)}" - ) - - # Stage orders_df to Redshift - table_name = f"{redshift_table_prefix}_orders" - entity_df_query = f"SELECT * FROM {table_name}" - orders_context = aws_utils.temporarily_upload_df_to_redshift( - client, - offline_store.cluster_id, - offline_store.database, - offline_store.user, - s3, - f"{offline_store.s3_staging_location}/copy/{table_name}.parquet", - offline_store.iam_role, - table_name, - orders_df, - ) - - # Stage driver_df to Redshift - driver_df = driver_data.create_driver_hourly_stats_df( - driver_entities, start_date, end_date - ) - driver_table_name = f"{redshift_table_prefix}_driver_hourly" - driver_context = aws_utils.temporarily_upload_df_to_redshift( - client, - offline_store.cluster_id, - offline_store.database, - offline_store.user, - s3, - f"{offline_store.s3_staging_location}/copy/{driver_table_name}.parquet", - offline_store.iam_role, - driver_table_name, - driver_df, - ) - - # Stage customer_df to Redshift - customer_df = driver_data.create_customer_daily_profile_df( - customer_entities, start_date, end_date - ) - customer_table_name = f"{redshift_table_prefix}_customer_profile" - customer_context = aws_utils.temporarily_upload_df_to_redshift( - client, - offline_store.cluster_id, - offline_store.database, - offline_store.user, - s3, - f"{offline_store.s3_staging_location}/copy/{customer_table_name}.parquet", - offline_store.iam_role, - customer_table_name, - customer_df, - ) - - with orders_context, driver_context, customer_context, TemporaryDirectory() as temp_dir: - driver_source = RedshiftSource( - table=driver_table_name, - event_timestamp_column="event_timestamp", - created_timestamp_column="created", - ) - driver_fv = create_driver_hourly_stats_feature_view(driver_source) - - customer_source = RedshiftSource( - table=customer_table_name, - event_timestamp_column="event_timestamp", - created_timestamp_column="created", - ) - customer_fv = create_customer_daily_profile_feature_view(customer_source) - - driver = Entity(name="driver", join_key="driver_id", value_type=ValueType.INT64) - customer = Entity(name="customer_id", value_type=ValueType.INT64) - - if provider_type == "local": - store = FeatureStore( - config=RepoConfig( - registry=os.path.join(temp_dir, "registry.db"), - project="default", - provider="local", - online_store=SqliteOnlineStoreConfig( - path=os.path.join(temp_dir, "online_store.db"), - ), - offline_store=offline_store, - ) - ) - elif provider_type == "aws": - 
store = FeatureStore( - config=RepoConfig( - registry=os.path.join(temp_dir, "registry.db"), - project="".join( - random.choices(string.ascii_uppercase + string.digits, k=10) - ), - provider="aws", - online_store=DynamoDBOnlineStoreConfig(region="us-west-2"), - offline_store=offline_store, - ) - ) - else: - raise Exception("Invalid provider used as part of test configuration") - - store.apply([driver, customer, driver_fv, customer_fv]) - - try: - event_timestamp = ( - DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL - if DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL in orders_df.columns - else "e_ts" - ) - expected_df = get_expected_training_df( - customer_df, - customer_fv, - driver_df, - driver_fv, - orders_df, - event_timestamp, - full_feature_names, - ) - - job_from_sql = store.get_historical_features( - entity_df=entity_df_query, - features=[ - "driver_stats:conv_rate", - "driver_stats:avg_daily_trips", - "customer_profile:current_balance", - "customer_profile:avg_passenger_count", - "customer_profile:lifetime_trip_count", - ], - full_feature_names=full_feature_names, - ) - - start_time = datetime.utcnow() - actual_df_from_sql_entities = job_from_sql.to_df() - end_time = datetime.utcnow() - with capsys.disabled(): - print( - str( - f"\nTime to execute job_from_sql.to_df() = '{(end_time - start_time)}'" - ) - ) - - assert sorted(expected_df.columns) == sorted( - actual_df_from_sql_entities.columns - ) - assert_frame_equal( - expected_df.sort_values( - by=[event_timestamp, "order_id", "driver_id", "customer_id"] - ).reset_index(drop=True), - actual_df_from_sql_entities[expected_df.columns] - .sort_values( - by=[event_timestamp, "order_id", "driver_id", "customer_id"] - ) - .reset_index(drop=True), - check_dtype=False, - ) - - table_from_sql_entities = job_from_sql.to_arrow() - assert_frame_equal( - actual_df_from_sql_entities.sort_values( - by=[event_timestamp, "order_id", "driver_id", "customer_id"] - ).reset_index(drop=True), - table_from_sql_entities.to_pandas() - .sort_values( - by=[event_timestamp, "order_id", "driver_id", "customer_id"] - ) - .reset_index(drop=True), - ) - - timestamp_column = ( - "e_ts" - if infer_event_timestamp_col - else DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL - ) - - entity_df_query_with_invalid_join_key = ( - f"select order_id, driver_id, customer_id as customer, " - f"order_is_success, {timestamp_column} FROM {table_name}" - ) - # Rename the join key; this should now raise an error. - assertpy.assert_that( - store.get_historical_features( - entity_df=entity_df_query_with_invalid_join_key, - features=[ - "driver_stats:conv_rate", - "driver_stats:avg_daily_trips", - "customer_profile:current_balance", - "customer_profile:avg_passenger_count", - "customer_profile:lifetime_trip_count", - ], - ).to_df - ).raises(errors.FeastEntityDFMissingColumnsError).when_called_with() - - job_from_df = store.get_historical_features( - entity_df=orders_df, - features=[ - "driver_stats:conv_rate", - "driver_stats:avg_daily_trips", - "customer_profile:current_balance", - "customer_profile:avg_passenger_count", - "customer_profile:lifetime_trip_count", - ], - full_feature_names=full_feature_names, - ) - - # Rename the join key; this should now raise an error. 
- orders_df_with_invalid_join_key = orders_df.rename( - {"customer_id": "customer"}, axis="columns" - ) - assertpy.assert_that( - store.get_historical_features( - entity_df=orders_df_with_invalid_join_key, - features=[ - "driver_stats:conv_rate", - "driver_stats:avg_daily_trips", - "customer_profile:current_balance", - "customer_profile:avg_passenger_count", - "customer_profile:lifetime_trip_count", - ], - ).to_df - ).raises(errors.FeastEntityDFMissingColumnsError).when_called_with() - - start_time = datetime.utcnow() - actual_df_from_df_entities = job_from_df.to_df() - end_time = datetime.utcnow() - with capsys.disabled(): - print( - str( - f"Time to execute job_from_df.to_df() = '{(end_time - start_time)}'\n" - ) - ) - - assert sorted(expected_df.columns) == sorted( - actual_df_from_df_entities.columns - ) - assert_frame_equal( - expected_df.sort_values( - by=[event_timestamp, "order_id", "driver_id", "customer_id"] - ).reset_index(drop=True), - actual_df_from_df_entities[expected_df.columns] - .sort_values( - by=[event_timestamp, "order_id", "driver_id", "customer_id"] - ) - .reset_index(drop=True), - check_dtype=False, - ) - - table_from_df_entities = job_from_df.to_arrow() - assert_frame_equal( - actual_df_from_df_entities.sort_values( - by=[event_timestamp, "order_id", "driver_id", "customer_id"] - ).reset_index(drop=True), - table_from_df_entities.to_pandas() - .sort_values( - by=[event_timestamp, "order_id", "driver_id", "customer_id"] - ) - .reset_index(drop=True), - ) - finally: - store.teardown() - - def test_feature_name_collision_on_historical_retrieval(): # _validate_feature_refs is the function that checks for colliding feature names From b6abcaa13b05f17fd91397c25477ae5eb7b7d9dd Mon Sep 17 00:00:00 2001 From: Achal Shah Date: Fri, 27 Aug 2021 13:09:39 -0700 Subject: [PATCH 35/51] reorder Signed-off-by: Achal Shah --- .../offline_store/test_historical_retrieval.py | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/sdk/python/tests/integration/offline_store/test_historical_retrieval.py b/sdk/python/tests/integration/offline_store/test_historical_retrieval.py index 63b720d3df..44f9e595e3 100644 --- a/sdk/python/tests/integration/offline_store/test_historical_retrieval.py +++ b/sdk/python/tests/integration/offline_store/test_historical_retrieval.py @@ -5,7 +5,6 @@ from datetime import datetime, timedelta from tempfile import TemporaryDirectory -import assertpy import numpy as np import pandas as pd import pytest @@ -14,15 +13,7 @@ from pytz import utc import feast.driver_test_data as driver_data -from feast import ( - BigQuerySource, - FeatureService, - FileSource, - RedshiftSource, - RepoConfig, - errors, - utils, -) +from feast import BigQuerySource, FeatureService, FileSource, RepoConfig, utils from feast.entity import Entity from feast.errors import FeatureNameCollisionError from feast.feature import Feature @@ -32,10 +23,7 @@ from feast.infra.offline_stores.offline_utils import ( DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL, ) -from feast.infra.offline_stores.redshift import RedshiftOfflineStoreConfig -from feast.infra.online_stores.dynamodb import DynamoDBOnlineStoreConfig from feast.infra.online_stores.sqlite import SqliteOnlineStoreConfig -from feast.infra.utils import aws_utils from feast.value_type import ValueType np.random.seed(0) From 95806545ed49dbbb90765191c04a2e801be1b1ff Mon Sep 17 00:00:00 2001 From: Achal Shah Date: Fri, 27 Aug 2021 13:33:01 -0700 Subject: [PATCH 36/51] add todo Signed-off-by: Achal Shah --- sdk/python/feast/type_map.py 
| 1 +
 1 file changed, 1 insertion(+)

diff --git a/sdk/python/feast/type_map.py b/sdk/python/feast/type_map.py
index 3ea6cda968..c65ec6e14c 100644
--- a/sdk/python/feast/type_map.py
+++ b/sdk/python/feast/type_map.py
@@ -164,6 +164,7 @@ def _type_err(item, dtype):
     raise ValueError(f'Value "{item}" is of type {type(item)} not of type {dtype}')
 
 
+# TODO(achals): Simplify this method and remove the noqa.
 def _python_value_to_proto_value(feast_value_type, value) -> ProtoValue:  # noqa: C901
     """
     Converts a Python (native, pandas) value to a Feast Proto Value based

From 171a1efbe15c650cbc68919f45a2fc9683d72766 Mon Sep 17 00:00:00 2001
From: Achal Shah
Date: Fri, 27 Aug 2021 14:10:15 -0700
Subject: [PATCH 37/51] Split universal vs non data_source_cache

Signed-off-by: Achal Shah
---
 sdk/python/tests/conftest.py                 | 11 ++++++++++-
 .../feature_repos/test_repo_configuration.py |  8 ++++----
 2 files changed, 14 insertions(+), 5 deletions(-)

diff --git a/sdk/python/tests/conftest.py b/sdk/python/tests/conftest.py
index 4cc8bcec1a..fce15e987d 100644
--- a/sdk/python/tests/conftest.py
+++ b/sdk/python/tests/conftest.py
@@ -115,9 +115,18 @@ def get_or_create(self, key, c: Callable[[], Any]):
         return v
 
 
-@pytest.fixture(scope="session", autouse=True)
+@pytest.fixture(scope="function", autouse=True)
 def data_source_cache():
     dsc = DataSourceCache()
     yield dsc
     for _, v in dsc.items():
         v[3].teardown()
+
+
+@pytest.fixture(scope="session", autouse=True)
+def universal_data_source_cache():
+    dsc = DataSourceCache()
+    yield dsc
+    for _, v in dsc.items():
+        v[3].teardown()
+
diff --git a/sdk/python/tests/integration/feature_repos/test_repo_configuration.py b/sdk/python/tests/integration/feature_repos/test_repo_configuration.py
index e5a40c0b0e..4679763d1d 100644
--- a/sdk/python/tests/integration/feature_repos/test_repo_configuration.py
+++ b/sdk/python/tests/integration/feature_repos/test_repo_configuration.py
@@ -394,12 +394,12 @@ def parametrize_offline_retrieval_test(offline_retrieval_test):
 
     @pytest.mark.integration
     @pytest.mark.parametrize("config", configs, ids=lambda v: str(v))
-    def inner_test(config, data_source_cache):
+    def inner_test(config, universal_data_source_cache):
         with construct_universal_test_environment(
             offline_retrieval_test.__name__,
             config,
             stop_at_step=EnvironmentSetupSteps.APPLY_OBJECTS,
-            data_source_cache=data_source_cache,
+            data_source_cache=universal_data_source_cache,
         ) as environment:
             offline_retrieval_test(environment)
 
@@ -422,12 +422,12 @@ def parametrize_online_test(online_test):
 
     @pytest.mark.integration
     @pytest.mark.parametrize("config", configs, ids=lambda v: str(v))
-    def inner_test(config, data_source_cache):
+    def inner_test(config, universal_data_source_cache):
         with construct_universal_test_environment(
             online_test.__name__,
             config,
             stop_at_step=EnvironmentSetupSteps.MATERIALIZE,
-            data_source_cache=data_source_cache,
+            data_source_cache=universal_data_source_cache,
         ) as environment:
             online_test(environment)

From de9d8aae39920f0085e1de4bdccdaa1df000dc4f Mon Sep 17 00:00:00 2001
From: Achal Shah
Date: Fri, 27 Aug 2021 14:13:31 -0700
Subject: [PATCH 38/51] make format

Signed-off-by: Achal Shah
---
 sdk/python/tests/conftest.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/sdk/python/tests/conftest.py b/sdk/python/tests/conftest.py
index fce15e987d..3647124864 100644
--- a/sdk/python/tests/conftest.py
+++ b/sdk/python/tests/conftest.py
@@ -129,4 +129,3 @@ def universal_data_source_cache():
     yield dsc
     for _, v in dsc.items():
         v[3].teardown()
-
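The split above relies on pytest's fixture scoping: a function-scoped fixture is rebuilt and finalized around every test, while a session-scoped fixture is created once and finalized after the last test in the run. A minimal, self-contained sketch of that pattern (the cache names here are illustrative, not part of the repo):

import pytest


@pytest.fixture(scope="session", autouse=True)
def session_cache():
    cache = {}  # created once, shared by every test in the session
    yield cache
    cache.clear()  # finalizer runs once, after the final test


@pytest.fixture(scope="function", autouse=True)
def per_test_cache():
    cache = {}  # rebuilt for each individual test
    yield cache
    cache.clear()  # finalizer runs after every single test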
From 6454775341efcf19d535c41aec3033466e84099b Mon Sep 17 00:00:00 2001
From: Achal Shah
Date: Tue, 31 Aug 2021 16:42:44 -0700
Subject: [PATCH 39/51] WIP fixtures

Signed-off-by: Achal Shah
---
 sdk/python/tests/conftest.py                 | 26 +++++++++++++
 .../integration/e2e/test_universal_e2e.py    |  5 +--
 .../feature_repos/test_repo_configuration.py |  3 +-
 .../online_store/test_universal_online.py    | 32 +++++++++++--------
 4 files changed, 50 insertions(+), 16 deletions(-)

diff --git a/sdk/python/tests/conftest.py b/sdk/python/tests/conftest.py
index 3647124864..414cad492e 100644
--- a/sdk/python/tests/conftest.py
+++ b/sdk/python/tests/conftest.py
@@ -18,6 +18,8 @@
 import pandas as pd
 import pytest
+from tests.integration.feature_repos.test_repo_configuration import FULL_REPO_CONFIGS, construct_test_environment, \
+    construct_universal_entities, construct_universal_datasets, construct_universal_data_sources
 
 
 def pytest_configure(config):
@@ -129,3 +131,27 @@ def universal_data_source_cache():
     yield dsc
     for _, v in dsc.items():
         v[3].teardown()
+
+
+@pytest.fixture(params=FULL_REPO_CONFIGS, scope="session")
+def environment(request):
+    with construct_test_environment(request.param) as e:
+        yield e
+
+
+@pytest.fixture(scope="session")
+def universal_data_sources(environment):
+
+    with environment as environment:
+
+        entities = construct_universal_entities()
+        datasets = construct_universal_datasets(
+            entities, environment.start_date, environment.end_date
+        )
+        datasources = construct_universal_data_sources(
+            datasets, environment.data_source_creator
+        )
+
+        yield (entities, datasets, datasources)
+
+        environment.data_source_creator.teardown()
diff --git a/sdk/python/tests/integration/e2e/test_universal_e2e.py b/sdk/python/tests/integration/e2e/test_universal_e2e.py
index 57d2d361ce..c2b1a77d32 100644
--- a/sdk/python/tests/integration/e2e/test_universal_e2e.py
+++ b/sdk/python/tests/integration/e2e/test_universal_e2e.py
@@ -22,10 +22,11 @@
 @pytest.mark.parametrize(
     "config", vary_infer_feature(FULL_REPO_CONFIGS), ids=lambda v: str(v)
 )
-def test_e2e_consistency(config, data_source_cache: DataSourceCache):
+def test_e2e_consistency(config, data_source_cache: DataSourceCache, environment,
+                         universal_data_sources):
     df = create_dataset()
     test_suite_name = test_e2e_consistency.__name__
-    with construct_test_environment(test_suite_name, config) as test_environment:
+    with construct_test_environment(config, test_suite_name) as test_environment:
         fs = test_environment.feature_store
         infer_features = test_environment.test_repo_config.infer_features
         key = f"{test_suite_name}_{test_environment.test_repo_config.offline_store_creator}"
diff --git a/sdk/python/tests/integration/feature_repos/test_repo_configuration.py b/sdk/python/tests/integration/feature_repos/test_repo_configuration.py
index 4679763d1d..4975285556 100644
--- a/sdk/python/tests/integration/feature_repos/test_repo_configuration.py
+++ b/sdk/python/tests/integration/feature_repos/test_repo_configuration.py
@@ -266,7 +266,8 @@ class EnvironmentSetupSteps(IntEnum):
 
 @contextmanager
 def construct_test_environment(
-    test_suite_name: str, test_repo_config: TestRepoConfig,
+    test_repo_config: TestRepoConfig,
+    test_suite_name: str = "integration_test"
 ) -> Environment:
 
     project = f"{test_suite_name}_{str(uuid.uuid4()).replace('-', '')[:8]}"
diff --git a/sdk/python/tests/integration/online_store/test_universal_online.py b/sdk/python/tests/integration/online_store/test_universal_online.py
index 46b84130b8..b2a88a4f79 100644
---
a/sdk/python/tests/integration/online_store/test_universal_online.py +++ b/sdk/python/tests/integration/online_store/test_universal_online.py @@ -2,26 +2,32 @@ import unittest import pandas as pd +import pytest -from tests.integration.feature_repos.test_repo_configuration import ( - Environment, - parametrize_online_test, -) +from tests.integration.feature_repos.test_repo_configuration import construct_universal_feature_views +from tests.integration.feature_repos.universal.entities import driver, customer -@parametrize_online_test -def test_online_retrieval(environment: Environment): +@pytest.mark.integration +@pytest.mark.parametrize("full_feature_names", [True, False], ids=lambda v: str(v)) +def test_online_retrieval(environment, universal_data_sources, full_feature_names): fs = environment.feature_store - full_feature_names = environment.test_repo_config.full_feature_names + entities, datasets, data_sources = universal_data_sources + feature_views = construct_universal_feature_views(data_sources) - sample_drivers = random.sample(environment.entities["driver"], 10) - drivers_df = environment.datasets["driver"][ - environment.datasets["driver"]["driver_id"].isin(sample_drivers) + print(f"Created universal feature views {feature_views}") + + fs.apply([feature_views.values()] + [driver(), customer()]) + fs.materialize(environment.start_date, environment.end_date) + + sample_drivers = random.sample(entities["driver"], 10) + drivers_df = datasets["driver"][ + datasets["driver"]["driver_id"].isin(sample_drivers) ] - sample_customers = random.sample(environment.entities["customer"], 10) - customers_df = environment.datasets["customer"][ - environment.datasets["customer"]["customer_id"].isin(sample_customers) + sample_customers = random.sample(entities["customer"], 10) + customers_df = datasets["customer"][ + datasets["customer"]["customer_id"].isin(sample_customers) ] entity_rows = [ From 3fbb2e4f4bdd7a4e083d7608b997faf65da09aa7 Mon Sep 17 00:00:00 2001 From: Achal Shah Date: Tue, 31 Aug 2021 17:17:46 -0700 Subject: [PATCH 40/51] WIP Trying fixtures more effectively Signed-off-by: Achal Shah --- sdk/python/tests/conftest.py | 73 ++----- .../integration/e2e/test_universal_e2e.py | 5 +- .../feature_repos/test_repo_configuration.py | 202 +----------------- .../feature_repos/universal/feature_views.py | 2 +- .../test_universal_historical_retrieval.py | 22 +- .../online_store/test_universal_online.py | 14 +- 6 files changed, 45 insertions(+), 273 deletions(-) diff --git a/sdk/python/tests/conftest.py b/sdk/python/tests/conftest.py index 414cad492e..8371ba202f 100644 --- a/sdk/python/tests/conftest.py +++ b/sdk/python/tests/conftest.py @@ -14,12 +14,17 @@ import multiprocessing from datetime import datetime, timedelta from sys import platform -from typing import Any, Callable import pandas as pd import pytest -from tests.integration.feature_repos.test_repo_configuration import FULL_REPO_CONFIGS, construct_test_environment, \ - construct_universal_entities, construct_universal_datasets, construct_universal_data_sources + +from tests.integration.feature_repos.test_repo_configuration import ( + FULL_REPO_CONFIGS, + construct_test_environment, + construct_universal_data_sources, + construct_universal_datasets, + construct_universal_entities, +) def pytest_configure(config): @@ -92,47 +97,6 @@ def simple_dataset_2() -> pd.DataFrame: return pd.DataFrame.from_dict(data) -class DataSourceCache(dict): - """ - DataSourceCache is meant to cache datasources to be reused across multiple tests. 
-
-    Staging data into a remote store is an expensive operation, so we expose a way to share the created
-    data sources between tests.
-    The key for the cache is an arbitrary string, typically built from the test name combined with the
-    backing store.
-    The value for the cache is a 4-tuple:
-    - The entities that are meant to be used with the created data source.
-    - The dataframe that is persisted into the remote store.
-    - The data source object that represents the table/file; created from the dataframe mentioned above.
-    - The data_source_creator object that is used to upload the data frame and construct the data source object.
-    This object is needed so that we can invoke the teardown method at the end of all our tests.
-    """
-
-    def get_or_create(self, key, c: Callable[[], Any]):
-        if key in self:
-            return self[key]
-        else:
-            v = c()
-            self[key] = v
-            return v
-
-
-@pytest.fixture(scope="function", autouse=True)
-def data_source_cache():
-    dsc = DataSourceCache()
-    yield dsc
-    for _, v in dsc.items():
-        v[3].teardown()
-
-
-@pytest.fixture(scope="session", autouse=True)
-def universal_data_source_cache():
-    dsc = DataSourceCache()
-    yield dsc
-    for _, v in dsc.items():
-        v[3].teardown()
 
 
 @pytest.fixture(params=FULL_REPO_CONFIGS, scope="session")
 def environment(request):
     with construct_test_environment(request.param) as e:
         yield e
 
 
 @pytest.fixture(scope="session")
 def universal_data_sources(environment):
-
-    with environment as environment:
-
-        entities = construct_universal_entities()
-        datasets = construct_universal_datasets(
-            entities, environment.start_date, environment.end_date
-        )
-        datasources = construct_universal_data_sources(
-            datasets, environment.data_source_creator
-        )
-
-        yield (entities, datasets, datasources)
-
-        environment.data_source_creator.teardown()
+    entities = construct_universal_entities()
+    datasets = construct_universal_datasets(
+        entities, environment.start_date, environment.end_date
+    )
+    datasources = construct_universal_data_sources(
+        datasets, environment.data_source_creator
+    )
+
+    yield entities, datasets, datasources
+
+    environment.data_source_creator.teardown()
diff --git a/sdk/python/tests/integration/e2e/test_universal_e2e.py b/sdk/python/tests/integration/e2e/test_universal_e2e.py
index c2b1a77d32..7f462db722 100644
--- a/sdk/python/tests/integration/e2e/test_universal_e2e.py
+++ b/sdk/python/tests/integration/e2e/test_universal_e2e.py
@@ -22,8 +22,9 @@
 @pytest.mark.parametrize(
     "config", vary_infer_feature(FULL_REPO_CONFIGS), ids=lambda v: str(v)
 )
-def test_e2e_consistency(config, data_source_cache: DataSourceCache, environment,
-                         universal_data_sources):
+def test_e2e_consistency(
+    config, data_source_cache: DataSourceCache, environment, universal_data_sources
+):
     df = create_dataset()
     test_suite_name = test_e2e_consistency.__name__
     with construct_test_environment(config, test_suite_name) as test_environment:
         fs = test_environment.feature_store
         infer_features = test_environment.test_repo_config.infer_features
diff --git a/sdk/python/tests/integration/feature_repos/test_repo_configuration.py b/sdk/python/tests/integration/feature_repos/test_repo_configuration.py
index 4975285556..17aecd1909 100644
--- a/sdk/python/tests/integration/feature_repos/test_repo_configuration.py
+++ b/sdk/python/tests/integration/feature_repos/test_repo_configuration.py
@@ -3,19 +3,16 @@
 from contextlib import contextmanager
 from dataclasses import dataclass, field, replace
 from datetime import datetime, timedelta
-from enum import IntEnum
 from pathlib import Path
 from
typing import Any, Dict, List, Optional, Union import pandas as pd -import pytest from feast import FeatureStore, FeatureView, RepoConfig, driver_test_data, importer from feast.data_source import DataSource from tests.integration.feature_repos.universal.data_source_creator import ( DataSourceCreator, ) -from tests.integration.feature_repos.universal.entities import customer, driver from tests.integration.feature_repos.universal.feature_views import ( create_customer_daily_profile_feature_view, create_driver_hourly_stats_feature_view, @@ -122,7 +119,7 @@ def construct_universal_data_sources( def construct_universal_feature_views( - data_sources: Dict[str, DataSource] + data_sources: Dict[str, DataSource], ) -> Dict[str, FeatureView]: return { "customer": create_customer_daily_profile_feature_view( @@ -132,55 +129,6 @@ def construct_universal_feature_views( } -def setup_entities( - environment: "Environment", entities_override: Optional[Dict[str, List[Any]]] = None -) -> "Environment": - environment.entities = ( - entities_override if entities_override else construct_universal_entities() - ) - return environment - - -def setup_datasets( - environment: "Environment", - datasets_override: Optional[Dict[str, pd.DataFrame]] = None, -) -> "Environment": - environment.datasets = ( - datasets_override - if datasets_override - else construct_universal_datasets( - environment.entities, environment.start_date, environment.end_date - ) - ) - return environment - - -def setup_data_sources( - environment: "Environment", - data_sources_override: Optional[Dict[str, DataSource]] = None, -) -> "Environment": - environment.datasources = ( - data_sources_override - if data_sources_override - else construct_universal_data_sources( - environment.datasets, environment.data_source_creator - ) - ) - return environment - - -def setup_feature_views( - environment: "Environment", - feature_views_override: Optional[Dict[str, FeatureView]] = None, -) -> "Environment": - environment.feature_views = ( - feature_views_override - if feature_views_override - else construct_universal_feature_views(environment.datasources) - ) - return environment - - @dataclass class Environment: name: str @@ -188,11 +136,6 @@ class Environment: feature_store: FeatureStore data_source_creator: DataSourceCreator - entities: Dict[str, List[Any]] = field(default_factory=dict) - datasets: Dict[str, pd.DataFrame] = field(default_factory=dict) - datasources: Dict[str, DataSource] = field(default_factory=dict) - feature_views: Dict[str, FeatureView] = field(default_factory=list) - end_date: datetime = field( default=datetime.now().replace(microsecond=0, second=0, minute=0) ) @@ -254,20 +197,9 @@ def vary_providers_for_offline_stores( return new_configs -class EnvironmentSetupSteps(IntEnum): - INIT = 0 - CREATE_OBJECTS = 1 - APPLY_OBJECTS = 2 - MATERIALIZE = 3 - - -DEFAULT_STEP = EnvironmentSetupSteps.INIT - - @contextmanager def construct_test_environment( - test_repo_config: TestRepoConfig, - test_suite_name: str = "integration_test" + test_repo_config: TestRepoConfig, test_suite_name: str = "integration_test" ) -> Environment: project = f"{test_suite_name}_{str(uuid.uuid4()).replace('-', '')[:8]}" @@ -303,133 +235,3 @@ def construct_test_environment( yield environment finally: fs.teardown() - - -@contextmanager -def construct_universal_test_environment( - test_suite_name: str, - test_repo_config: TestRepoConfig, - stop_at_step=DEFAULT_STEP, - data_source_cache=None, -) -> Environment: - """ - This method should take in the parameters from the test 
repo config and created a feature repo, apply it, - and return the constructed feature store object to callers. - - This feature store object can be interacted for the purposes of tests. - The user is *not* expected to perform any clean up actions. - - :param test_suite_name: A name for the test suite. - :param test_repo_config: configuration - :param stop_at_step: The step which should be the last one executed when setting up the test environment. - :param data_source_cache: - :return: A feature store built using the supplied configuration. - """ - with construct_test_environment(test_suite_name, test_repo_config) as environment: - fs = environment.feature_store - fvs = [] - entities = [] - try: - if stop_at_step >= EnvironmentSetupSteps.CREATE_OBJECTS: - if data_source_cache is not None: - fixtures = data_source_cache.get( - test_repo_config.offline_store_creator, None - ) - if fixtures: - environment = setup_entities( - environment, entities_override=fixtures[0] - ) - environment = setup_datasets( - environment, datasets_override=fixtures[1] - ) - environment = setup_data_sources( - environment, data_sources_override=fixtures[2] - ) - else: - environment = setup_entities(environment) - environment = setup_datasets(environment) - environment = setup_data_sources(environment) - data_source_cache[test_repo_config.offline_store_creator] = ( - environment.entities, - environment.datasets, - environment.datasources, - environment.data_source_creator, - ) - else: - environment = setup_entities(environment) - environment = setup_datasets(environment) - environment = setup_data_sources(environment) - - environment = setup_feature_views(environment) - if stop_at_step >= EnvironmentSetupSteps.APPLY_OBJECTS: - entities.extend([driver(), customer()]) - fvs.extend(environment.feature_views.values()) - fs.apply(fvs + entities) - if stop_at_step >= EnvironmentSetupSteps.MATERIALIZE: - fs.materialize(environment.start_date, environment.end_date) - - yield environment - finally: - if ( - data_source_cache is None - and stop_at_step >= EnvironmentSetupSteps.CREATE_OBJECTS - ): - environment.data_source_creator.teardown() - - -def parametrize_offline_retrieval_test(offline_retrieval_test): - """ - This decorator should be used by tests that rely on the offline store. These tests are expected to be parameterized, - and receive an Environment object that contains a reference to a Feature Store with pre-applied - entities and feature views. - - The decorator also ensures that sample data needed for the test is available in the relevant offline store. - - Decorated tests should interact with the offline store, via the FeatureStore.get_historical_features method. They - may perform more operations as needed. - - The decorator takes care of tearing down the feature store, as well as the sample data. - """ - - configs = vary_full_feature_names(FULL_REPO_CONFIGS) - - @pytest.mark.integration - @pytest.mark.parametrize("config", configs, ids=lambda v: str(v)) - def inner_test(config, universal_data_source_cache): - with construct_universal_test_environment( - offline_retrieval_test.__name__, - config, - stop_at_step=EnvironmentSetupSteps.APPLY_OBJECTS, - data_source_cache=universal_data_source_cache, - ) as environment: - offline_retrieval_test(environment) - - return inner_test - - -def parametrize_online_test(online_test): - """ - This decorator should be used by tests that rely on the offline store. 
These tests are expected to be parameterized, - and receive an Environment object that contains a reference to a Feature Store with pre-applied - entities and feature views. - - The decorator also ensures that sample data needed for the test is available in the relevant offline store. This - data is also materialized into the online store. - - The decorator takes care of tearing down the feature store, as well as the sample data. - """ - - configs = vary_full_feature_names(FULL_REPO_CONFIGS) - - @pytest.mark.integration - @pytest.mark.parametrize("config", configs, ids=lambda v: str(v)) - def inner_test(config, universal_data_source_cache): - with construct_universal_test_environment( - online_test.__name__, - config, - stop_at_step=EnvironmentSetupSteps.MATERIALIZE, - data_source_cache=universal_data_source_cache, - ) as environment: - online_test(environment) - - return inner_test diff --git a/sdk/python/tests/integration/feature_repos/universal/feature_views.py b/sdk/python/tests/integration/feature_repos/universal/feature_views.py index b57e2988ab..e8bc2e0911 100644 --- a/sdk/python/tests/integration/feature_repos/universal/feature_views.py +++ b/sdk/python/tests/integration/feature_repos/universal/feature_views.py @@ -16,7 +16,7 @@ def driver_feature_view( ) -def create_driver_hourly_stats_feature_view(source, infer_features: bool = False): +def create_driver_hourly_stats_feature_view(source, infer_features: bool = True): driver_stats_feature_view = FeatureView( name="driver_stats", entities=["driver"], diff --git a/sdk/python/tests/integration/offline_store/test_universal_historical_retrieval.py b/sdk/python/tests/integration/offline_store/test_universal_historical_retrieval.py index b00818a078..c23d3b3eea 100644 --- a/sdk/python/tests/integration/offline_store/test_universal_historical_retrieval.py +++ b/sdk/python/tests/integration/offline_store/test_universal_historical_retrieval.py @@ -3,6 +3,7 @@ import numpy as np import pandas as pd +import pytest from pandas.testing import assert_frame_equal from pytz import utc @@ -12,8 +13,7 @@ DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL, ) from tests.integration.feature_repos.test_repo_configuration import ( - Environment, - parametrize_offline_retrieval_test, + construct_universal_feature_views, table_name_from_data_source, ) @@ -136,18 +136,22 @@ def get_expected_training_df( return expected_df -@parametrize_offline_retrieval_test -def test_historical_features(environment: Environment): +@pytest.mark.integration +@pytest.mark.parametrize("full_feature_names", [True, False], ids=lambda v: str(v)) +def test_historical_features(environment, universal_data_sources, full_feature_names): store = environment.feature_store + (entities, datasets, data_sources) = universal_data_sources + feature_views = construct_universal_feature_views(data_sources) + customer_df, driver_df, orders_df = ( - environment.datasets["customer"], - environment.datasets["driver"], - environment.datasets["orders"], + datasets["customer"], + datasets["driver"], + datasets["orders"], ) customer_fv, driver_fv = ( - environment.feature_views["customer"], - environment.feature_views["driver"], + feature_views["customer"], + feature_views["driver"], ) full_feature_names = environment.test_repo_config.full_feature_names diff --git a/sdk/python/tests/integration/online_store/test_universal_online.py b/sdk/python/tests/integration/online_store/test_universal_online.py index b2a88a4f79..3aca0d73d3 100644 --- a/sdk/python/tests/integration/online_store/test_universal_online.py +++ 
b/sdk/python/tests/integration/online_store/test_universal_online.py @@ -4,20 +4,24 @@ import pandas as pd import pytest -from tests.integration.feature_repos.test_repo_configuration import construct_universal_feature_views -from tests.integration.feature_repos.universal.entities import driver, customer +from tests.integration.feature_repos.test_repo_configuration import ( + construct_universal_feature_views, +) +from tests.integration.feature_repos.universal.entities import customer, driver @pytest.mark.integration @pytest.mark.parametrize("full_feature_names", [True, False], ids=lambda v: str(v)) def test_online_retrieval(environment, universal_data_sources, full_feature_names): + fs = environment.feature_store entities, datasets, data_sources = universal_data_sources feature_views = construct_universal_feature_views(data_sources) - print(f"Created universal feature views {feature_views}") - - fs.apply([feature_views.values()] + [driver(), customer()]) + feast_objects = [] + feast_objects.extend(feature_views.values()) + feast_objects.extend([driver(), customer()]) + fs.apply(feast_objects) fs.materialize(environment.start_date, environment.end_date) sample_drivers = random.sample(entities["driver"], 10) From 2961d58c430829100b474e3e45279d99339b0067 Mon Sep 17 00:00:00 2001 From: Achal Shah Date: Tue, 31 Aug 2021 21:44:19 -0700 Subject: [PATCH 41/51] fix refs Signed-off-by: Achal Shah --- .../offline_store/test_universal_historical_retrieval.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sdk/python/tests/integration/offline_store/test_universal_historical_retrieval.py b/sdk/python/tests/integration/offline_store/test_universal_historical_retrieval.py index c23d3b3eea..be2467fb81 100644 --- a/sdk/python/tests/integration/offline_store/test_universal_historical_retrieval.py +++ b/sdk/python/tests/integration/offline_store/test_universal_historical_retrieval.py @@ -153,10 +153,9 @@ def test_historical_features(environment, universal_data_sources, full_feature_n feature_views["customer"], feature_views["driver"], ) - full_feature_names = environment.test_repo_config.full_feature_names entity_df_query = None - orders_table = table_name_from_data_source(environment.datasources["orders"]) + orders_table = table_name_from_data_source(data_sources["orders"]) if orders_table: entity_df_query = f"SELECT * FROM {orders_table}" From 173981571ec2844f14d995b89acb7c5a0839cdd8 Mon Sep 17 00:00:00 2001 From: Achal Shah Date: Tue, 31 Aug 2021 22:04:59 -0700 Subject: [PATCH 42/51] Fix refs Signed-off-by: Achal Shah --- .../integration/e2e/test_universal_e2e.py | 1 - .../offline_store/test_s3_custom_endpoint.py | 20 ++++++++++++++----- 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/sdk/python/tests/integration/e2e/test_universal_e2e.py b/sdk/python/tests/integration/e2e/test_universal_e2e.py index 7f462db722..424f85468f 100644 --- a/sdk/python/tests/integration/e2e/test_universal_e2e.py +++ b/sdk/python/tests/integration/e2e/test_universal_e2e.py @@ -7,7 +7,6 @@ from pytz import utc from feast import FeatureStore, FeatureView -from tests.conftest import DataSourceCache from tests.data.data_creator import create_dataset from tests.integration.feature_repos.test_repo_configuration import ( FULL_REPO_CONFIGS, diff --git a/sdk/python/tests/integration/offline_store/test_s3_custom_endpoint.py b/sdk/python/tests/integration/offline_store/test_s3_custom_endpoint.py index e5bff828b3..7c94d003c3 100644 --- a/sdk/python/tests/integration/offline_store/test_s3_custom_endpoint.py 
+++ b/sdk/python/tests/integration/offline_store/test_s3_custom_endpoint.py @@ -2,17 +2,19 @@ from tests.integration.feature_repos.test_repo_configuration import ( TestRepoConfig, - construct_universal_test_environment, + construct_test_environment, + construct_universal_feature_views, ) # TODO: Allow integration tests to run using different credentials. +from tests.integration.feature_repos.universal.entities import customer, driver @pytest.mark.integration @pytest.mark.skip( reason="No way to run this test today. Credentials conflict with real AWS credentials in CI" ) -def test_registration_and_retrieval_from_custom_s3_endpoint(): +def test_registration_and_retrieval_from_custom_s3_endpoint(universal_data_sources): config = TestRepoConfig( offline_store_creator="tests.integration.feature_repos.universal.data_sources.file.S3FileDataSourceCreator" ) @@ -27,10 +29,18 @@ def test_registration_and_retrieval_from_custom_s3_endpoint(): os.environ["AWS_ACCESS_KEY_ID"] = "AKIAIOSFODNN7EXAMPLE" os.environ["AWS_SECRET_ACCESS_KEY"] = "wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY" - with construct_universal_test_environment( - config, create_and_apply=True, materialize=True - ) as environment: + with construct_test_environment(config) as environment: fs = environment.feature_store + + entities, datasets, data_sources = universal_data_sources + feature_views = construct_universal_feature_views(data_sources) + + feast_objects = [] + feast_objects.extend(feature_views.values()) + feast_objects.extend([driver(), customer()]) + fs.apply(feast_objects) + fs.materialize(environment.start_date, environment.end_date) + out = fs.get_online_features( features=["driver_stats:conv_rate"], entity_rows=[{"driver": 5001}] ).to_dict() From fe14ba3b8cd3c49b7c1be2b82eb174ad6c05072c Mon Sep 17 00:00:00 2001 From: Achal Shah Date: Tue, 31 Aug 2021 22:07:41 -0700 Subject: [PATCH 43/51] Fix refs Signed-off-by: Achal Shah --- .../tests/integration/offline_store/test_s3_custom_endpoint.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdk/python/tests/integration/offline_store/test_s3_custom_endpoint.py b/sdk/python/tests/integration/offline_store/test_s3_custom_endpoint.py index 7c94d003c3..8f85efd23f 100644 --- a/sdk/python/tests/integration/offline_store/test_s3_custom_endpoint.py +++ b/sdk/python/tests/integration/offline_store/test_s3_custom_endpoint.py @@ -5,9 +5,9 @@ construct_test_environment, construct_universal_feature_views, ) +from tests.integration.feature_repos.universal.entities import customer, driver # TODO: Allow integration tests to run using different credentials. 
-from tests.integration.feature_repos.universal.entities import customer, driver @pytest.mark.integration From 43d8e270b808585ac37b39a74e9bf44e474f4244 Mon Sep 17 00:00:00 2001 From: Achal Shah Date: Tue, 31 Aug 2021 22:09:01 -0700 Subject: [PATCH 44/51] Fix refs Signed-off-by: Achal Shah --- .../integration/e2e/test_universal_e2e.py | 21 +++++++------------ 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/sdk/python/tests/integration/e2e/test_universal_e2e.py b/sdk/python/tests/integration/e2e/test_universal_e2e.py index 424f85468f..6f1805a840 100644 --- a/sdk/python/tests/integration/e2e/test_universal_e2e.py +++ b/sdk/python/tests/integration/e2e/test_universal_e2e.py @@ -21,26 +21,21 @@ @pytest.mark.parametrize( "config", vary_infer_feature(FULL_REPO_CONFIGS), ids=lambda v: str(v) ) -def test_e2e_consistency( - config, data_source_cache: DataSourceCache, environment, universal_data_sources -): +def test_e2e_consistency(config): df = create_dataset() test_suite_name = test_e2e_consistency.__name__ with construct_test_environment(config, test_suite_name) as test_environment: fs = test_environment.feature_store infer_features = test_environment.test_repo_config.infer_features - key = f"{test_suite_name}_{test_environment.test_repo_config.offline_store_creator}" - _, _, data_source, _ = data_source_cache.get_or_create( - key, - lambda: ( - None, - df, - test_environment.data_source_creator.create_data_source( - df, fs.project, field_mapping={"ts_1": "ts", "id": "driver_id"} - ), - test_environment.data_source_creator, + _, _, data_source, _ = ( + None, + df, + test_environment.data_source_creator.create_data_source( + df, fs.project, field_mapping={"ts_1": "ts", "id": "driver_id"} ), + test_environment.data_source_creator, ) + fv = driver_feature_view(data_source=data_source, infer_features=infer_features) entity = driver() From 58c75b04bd0e8b7f1df9051081e9ee7b601689ad Mon Sep 17 00:00:00 2001 From: Achal Shah Date: Tue, 31 Aug 2021 22:20:26 -0700 Subject: [PATCH 45/51] fix historical tests Signed-off-by: Achal Shah --- .../offline_store/test_universal_historical_retrieval.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/sdk/python/tests/integration/offline_store/test_universal_historical_retrieval.py b/sdk/python/tests/integration/offline_store/test_universal_historical_retrieval.py index be2467fb81..91e3f5edea 100644 --- a/sdk/python/tests/integration/offline_store/test_universal_historical_retrieval.py +++ b/sdk/python/tests/integration/offline_store/test_universal_historical_retrieval.py @@ -16,6 +16,7 @@ construct_universal_feature_views, table_name_from_data_source, ) +from tests.integration.feature_repos.universal.entities import customer, driver np.random.seed(0) @@ -154,6 +155,10 @@ def test_historical_features(environment, universal_data_sources, full_feature_n feature_views["driver"], ) + feast_objects = [] + feast_objects.extend([customer_fv, driver_fv, driver(), customer()]) + store.apply(feast_objects) + entity_df_query = None orders_table = table_name_from_data_source(data_sources["orders"]) if orders_table: From 8b5ba5a0cca2a203258a451a6a2b0e3ac81ddfa7 Mon Sep 17 00:00:00 2001 From: Achal Shah Date: Tue, 31 Aug 2021 22:31:10 -0700 Subject: [PATCH 46/51] renames Signed-off-by: Achal Shah --- sdk/python/tests/conftest.py | 4 +- .../integration/e2e/test_universal_e2e.py | 2 +- ...configuration.py => repo_configuration.py} | 37 +++++++++++-------- .../offline_store/test_s3_custom_endpoint.py | 6 +-- .../test_universal_historical_retrieval.py | 2 +- 
From 8b7eab57e994390949820b575efaa28807d03c5b Mon Sep 17 00:00:00 2001
From: Achal Shah
Date: Tue, 31 Aug 2021 22:31:10 -0700
Subject: [PATCH 46/51] renames

Signed-off-by: Achal Shah
---
 sdk/python/tests/conftest.py                  |  4 +-
 .../integration/e2e/test_universal_e2e.py     |  2 +-
 ...configuration.py => repo_configuration.py} | 37 +++++++++++--------
 .../offline_store/test_s3_custom_endpoint.py  |  6 +--
 .../test_universal_historical_retrieval.py    |  2 +-
 .../online_store/test_universal_online.py     |  2 +-
 6 files changed, 29 insertions(+), 24 deletions(-)
 rename sdk/python/tests/integration/feature_repos/{test_repo_configuration.py => repo_configuration.py} (89%)

diff --git a/sdk/python/tests/conftest.py b/sdk/python/tests/conftest.py
index 8371ba202f..d8f34fcb3f 100644
--- a/sdk/python/tests/conftest.py
+++ b/sdk/python/tests/conftest.py
@@ -18,7 +18,7 @@
 import pandas as pd
 import pytest
 
-from tests.integration.feature_repos.test_repo_configuration import (
+from tests.integration.feature_repos.repo_configuration import (
     FULL_REPO_CONFIGS,
     construct_test_environment,
     construct_universal_data_sources,
@@ -113,6 +113,6 @@ def universal_data_sources(environment):
         datasets, environment.data_source_creator
     )
 
-    yield (entities, datasets, datasources)
+    yield entities, datasets, datasources
 
     environment.data_source_creator.teardown()

diff --git a/sdk/python/tests/integration/e2e/test_universal_e2e.py b/sdk/python/tests/integration/e2e/test_universal_e2e.py
index 6f1805a840..c4c2294a18 100644
--- a/sdk/python/tests/integration/e2e/test_universal_e2e.py
+++ b/sdk/python/tests/integration/e2e/test_universal_e2e.py
@@ -8,7 +8,7 @@
 from feast import FeatureStore, FeatureView
 from tests.data.data_creator import create_dataset
-from tests.integration.feature_repos.test_repo_configuration import (
+from tests.integration.feature_repos.repo_configuration import (
     FULL_REPO_CONFIGS,
     construct_test_environment,
     vary_infer_feature,

diff --git a/sdk/python/tests/integration/feature_repos/test_repo_configuration.py b/sdk/python/tests/integration/feature_repos/repo_configuration.py
similarity index 89%
rename from sdk/python/tests/integration/feature_repos/test_repo_configuration.py
rename to sdk/python/tests/integration/feature_repos/repo_configuration.py
index 17aecd1909..1acb11ae1c 100644
--- a/sdk/python/tests/integration/feature_repos/test_repo_configuration.py
+++ b/sdk/python/tests/integration/feature_repos/repo_configuration.py
@@ -20,7 +20,7 @@
 
 
 @dataclass(frozen=True, repr=True)
-class TestRepoConfig:
+class IntegrationTestRepoConfig:
     """
     This class should hold all possible parameters that may need to be varied by individual tests.
     """
@@ -41,28 +41,28 @@ def ds_creator_path(cls: str):
 DYNAMO_CONFIG = {"type": "dynamodb", "region": "us-west-2"}
 REDIS_CONFIG = {"type": "redis", "connection_string": "localhost:6379,db=0"}
 
-FULL_REPO_CONFIGS: List[TestRepoConfig] = [
+FULL_REPO_CONFIGS: List[IntegrationTestRepoConfig] = [
     # Local configurations
-    TestRepoConfig(),
-    TestRepoConfig(online_store=REDIS_CONFIG),
+    IntegrationTestRepoConfig(),
+    IntegrationTestRepoConfig(online_store=REDIS_CONFIG),
     # GCP configurations
-    TestRepoConfig(
+    IntegrationTestRepoConfig(
         provider="gcp",
         offline_store_creator=ds_creator_path("bigquery.BigQueryDataSourceCreator"),
         online_store="datastore",
     ),
-    TestRepoConfig(
+    IntegrationTestRepoConfig(
         provider="gcp",
         offline_store_creator=ds_creator_path("bigquery.BigQueryDataSourceCreator"),
         online_store=REDIS_CONFIG,
     ),
     # AWS configurations
-    TestRepoConfig(
+    IntegrationTestRepoConfig(
         provider="aws",
         offline_store_creator=ds_creator_path("redshift.RedshiftDataSourceCreator"),
         online_store=DYNAMO_CONFIG,
     ),
-    TestRepoConfig(
+    IntegrationTestRepoConfig(
         provider="aws",
         offline_store_creator=ds_creator_path("redshift.RedshiftDataSourceCreator"),
         online_store=REDIS_CONFIG,
@@ -132,7 +132,7 @@ def construct_universal_feature_views(
 
 @dataclass
 class Environment:
     name: str
-    test_repo_config: TestRepoConfig
+    test_repo_config: IntegrationTestRepoConfig
     feature_store: FeatureStore
     data_source_creator: DataSourceCreator
 
@@ -152,7 +152,9 @@ def table_name_from_data_source(ds: DataSource) -> Optional[str]:
     return None
 
 
-def vary_full_feature_names(configs: List[TestRepoConfig]) -> List[TestRepoConfig]:
+def vary_full_feature_names(
+    configs: List[IntegrationTestRepoConfig],
+) -> List[IntegrationTestRepoConfig]:
     new_configs = []
     for c in configs:
         true_c = replace(c, full_feature_names=True)
         false_c = replace(c, full_feature_names=False)
         new_configs.extend([true_c, false_c])
     return new_configs
 
 
 def vary_infer_event_timestamp_col(
-    configs: List[TestRepoConfig],
-) -> List[TestRepoConfig]:
+    configs: List[IntegrationTestRepoConfig],
+) -> List[IntegrationTestRepoConfig]:
     new_configs = []
     for c in configs:
         true_c = replace(c, infer_event_timestamp_col=True)
         false_c = replace(c, infer_event_timestamp_col=False)
         new_configs.extend([true_c, false_c])
     return new_configs
 
 
-def vary_infer_feature(configs: List[TestRepoConfig]) -> List[TestRepoConfig]:
+def vary_infer_feature(
+    configs: List[IntegrationTestRepoConfig],
+) -> List[IntegrationTestRepoConfig]:
     new_configs = []
     for c in configs:
         true_c = replace(c, infer_features=True)
         false_c = replace(c, infer_features=False)
         new_configs.extend([true_c, false_c])
     return new_configs
 
 
 def vary_providers_for_offline_stores(
-    configs: List[TestRepoConfig],
-) -> List[TestRepoConfig]:
+    configs: List[IntegrationTestRepoConfig],
+) -> List[IntegrationTestRepoConfig]:
     new_configs = []
     for c in configs:
         if "FileDataSourceCreator" in c.offline_store_creator:
             new_configs.append(c)
         elif "RedshiftDataSourceCreator" in c.offline_store_creator:
             for p in ["local", "aws"]:
                 new_configs.append(replace(c, provider=p))
         elif "BigQueryDataSourceCreator" in c.offline_store_creator:
             for p in ["local", "gcp"]:
                 new_configs.append(replace(c, provider=p))
     return new_configs
 
 
 @contextmanager
 def construct_test_environment(
-    test_repo_config: TestRepoConfig, test_suite_name: str = "integration_test"
+    test_repo_config: IntegrationTestRepoConfig,
+    test_suite_name: str = "integration_test",
 ) -> Environment:

diff --git a/sdk/python/tests/integration/offline_store/test_s3_custom_endpoint.py b/sdk/python/tests/integration/offline_store/test_s3_custom_endpoint.py
index 8f85efd23f..3eb3f4da6f 100644
--- a/sdk/python/tests/integration/offline_store/test_s3_custom_endpoint.py
+++ b/sdk/python/tests/integration/offline_store/test_s3_custom_endpoint.py
@@ -1,7 +1,7 @@
 import pytest
 
-from tests.integration.feature_repos.test_repo_configuration import (
-    TestRepoConfig,
+from tests.integration.feature_repos.repo_configuration import (
+    IntegrationTestRepoConfig,
     construct_test_environment,
     construct_universal_feature_views,
 )
@@ -15,7 +15,7 @@
     reason="No way to run this test today. Credentials conflict with real AWS credentials in CI"
 )
 def test_registration_and_retrieval_from_custom_s3_endpoint(universal_data_sources):
-    config = TestRepoConfig(
+    config = IntegrationTestRepoConfig(
         offline_store_creator="tests.integration.feature_repos.universal.data_sources.file.S3FileDataSourceCreator"
     )
     import os

diff --git a/sdk/python/tests/integration/offline_store/test_universal_historical_retrieval.py b/sdk/python/tests/integration/offline_store/test_universal_historical_retrieval.py
index 91e3f5edea..75cd5bbf70 100644
--- a/sdk/python/tests/integration/offline_store/test_universal_historical_retrieval.py
+++ b/sdk/python/tests/integration/offline_store/test_universal_historical_retrieval.py
@@ -12,7 +12,7 @@
 from feast.infra.offline_stores.offline_utils import (
     DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL,
 )
-from tests.integration.feature_repos.test_repo_configuration import (
+from tests.integration.feature_repos.repo_configuration import (
     construct_universal_feature_views,
     table_name_from_data_source,
 )

diff --git a/sdk/python/tests/integration/online_store/test_universal_online.py b/sdk/python/tests/integration/online_store/test_universal_online.py
index 3aca0d73d3..b3bcf688bd 100644
--- a/sdk/python/tests/integration/online_store/test_universal_online.py
+++ b/sdk/python/tests/integration/online_store/test_universal_online.py
@@ -4,7 +4,7 @@
 import pandas as pd
 import pytest
 
-from tests.integration.feature_repos.test_repo_configuration import (
+from tests.integration.feature_repos.repo_configuration import (
     construct_universal_feature_views,
 )
 from tests.integration.feature_repos.universal.entities import customer, driver
From c445f5384e56ccdf7b6f9225dc49d29cc90eef52 Mon Sep 17 00:00:00 2001
From: Achal Shah
Date: Wed, 1 Sep 2021 10:47:12 -0700
Subject: [PATCH 47/51] CR updates

Signed-off-by: Achal Shah
---
 sdk/python/feast/infra/provider.py            |  6 +-
 sdk/python/tests/conftest.py                  | 16 ++++++
 .../integration/e2e/test_universal_e2e.py     | 36 +++---------
 .../feature_repos/repo_configuration.py       | 57 ++-----------------
 .../universal/data_source_creator.py          | 19 ++++---
 .../universal/data_sources/bigquery.py        | 21 +++----
 .../universal/data_sources/file.py            | 12 ++--
 .../universal/data_sources/redshift.py        | 12 ++--
 8 files changed, 68 insertions(+), 111 deletions(-)

diff --git a/sdk/python/feast/infra/provider.py b/sdk/python/feast/infra/provider.py
index e5a03d7ca8..b040b512a0 100644
--- a/sdk/python/feast/infra/provider.py
+++ b/sdk/python/feast/infra/provider.py
@@ -238,11 +238,15 @@ def _get_column_names(
         reverse_field_mapping[col] if col in reverse_field_mapping.keys() else col
         for col in feature_names
     ]
+
+    # We need to exclude join keys and timestamp columns from the list of features, after they are mapped to
+    # their final column names via the `field_mapping` field of the source.
     _feature_names = set(feature_names) - set(join_keys)
     _feature_names = _feature_names - {event_timestamp_column, created_timestamp_column}
+    feature_names = list(_feature_names)
     return (
         join_keys,
-        list(_feature_names),
+        feature_names,
         event_timestamp_column,
         created_timestamp_column,
     )
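One clarification on the comment added in the hunk above, since the ordering is easy to miss: the join-key and timestamp exclusion only matches once feature names have been mapped back to source column names. A toy, self-contained rendering of that logic (the column values are invented; the real code lives in _get_column_names):

    # field_mapping maps source column -> feature view column ("ts_1" -> "ts").
    field_mapping = {"ts_1": "ts"}
    reverse_field_mapping = {v: k for k, v in field_mapping.items()}

    join_keys = ["driver_id"]
    event_timestamp_column, created_timestamp_column = "ts_1", "created_ts"

    feature_names = ["conv_rate", "driver_id", "ts", "created_ts"]
    # Map view-level names back to source column names first...
    feature_names = [reverse_field_mapping.get(col, col) for col in feature_names]
    # ...so that "ts" becomes "ts_1" and the exclusion below actually matches.
    features = set(feature_names) - set(join_keys)
    features = features - {event_timestamp_column, created_timestamp_column}
    print(sorted(features))  # ['conv_rate']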
diff --git a/sdk/python/tests/conftest.py b/sdk/python/tests/conftest.py
index d8f34fcb3f..742b572f8e 100644
--- a/sdk/python/tests/conftest.py
+++ b/sdk/python/tests/conftest.py
@@ -18,8 +18,10 @@
 import pandas as pd
 import pytest
 
+from tests.data.data_creator import create_dataset
 from tests.integration.feature_repos.repo_configuration import (
     FULL_REPO_CONFIGS,
+    Environment,
     construct_test_environment,
     construct_universal_data_sources,
     construct_universal_datasets,
@@ -116,3 +118,17 @@
     yield entities, datasets, datasources
 
     environment.data_source_creator.teardown()
+
+
+@pytest.fixture(scope="session")
+def e2e_data_sources(environment: Environment):
+    df = create_dataset()
+    data_source = environment.data_source_creator.create_data_source(
+        df,
+        environment.feature_store.project,
+        field_mapping={"ts_1": "ts", "id": "driver_id"},
+    )
+
+    yield df, data_source
+
+    environment.data_source_creator.teardown()

diff --git a/sdk/python/tests/integration/e2e/test_universal_e2e.py b/sdk/python/tests/integration/e2e/test_universal_e2e.py
index c4c2294a18..dd09aef19c 100644
--- a/sdk/python/tests/integration/e2e/test_universal_e2e.py
+++ b/sdk/python/tests/integration/e2e/test_universal_e2e.py
@@ -7,41 +7,21 @@
 from pytz import utc
 
 from feast import FeatureStore, FeatureView
-from tests.data.data_creator import create_dataset
-from tests.integration.feature_repos.repo_configuration import (
-    FULL_REPO_CONFIGS,
-    construct_test_environment,
-    vary_infer_feature,
-)
 from tests.integration.feature_repos.universal.entities import driver
 from tests.integration.feature_repos.universal.feature_views import driver_feature_view
 
 
 @pytest.mark.integration
-@pytest.mark.parametrize(
-    "config", vary_infer_feature(FULL_REPO_CONFIGS), ids=lambda v: str(v)
-)
-def test_e2e_consistency(config):
-    df = create_dataset()
-    test_suite_name = test_e2e_consistency.__name__
-    with construct_test_environment(config, test_suite_name) as test_environment:
-        fs = test_environment.feature_store
-        infer_features = test_environment.test_repo_config.infer_features
-        _, _, data_source, _ = (
-            None,
-            df,
-            test_environment.data_source_creator.create_data_source(
-                df, fs.project, field_mapping={"ts_1": "ts", "id": "driver_id"}
-            ),
-            test_environment.data_source_creator,
-        )
-
-        fv = driver_feature_view(data_source=data_source, infer_features=infer_features)
+@pytest.mark.parametrize("infer_features", [True, False])
+def test_e2e_consistency(environment, e2e_data_sources, infer_features):
+    fs = environment.feature_store
+    df, data_source = e2e_data_sources
+    fv = driver_feature_view(data_source=data_source, infer_features=infer_features)
 
-        entity = driver()
-        fs.apply([fv, entity])
+    entity = driver()
+    fs.apply([fv, entity])
 
-        run_offline_online_store_consistency_test(fs, fv)
+    run_offline_online_store_consistency_test(fs, fv)
 
 
 def check_offline_and_online_features(

diff --git a/sdk/python/tests/integration/feature_repos/repo_configuration.py b/sdk/python/tests/integration/feature_repos/repo_configuration.py
index 1acb11ae1c..8a90fd54d6 100644
--- a/sdk/python/tests/integration/feature_repos/repo_configuration.py
+++ b/sdk/python/tests/integration/feature_repos/repo_configuration.py
@@ -1,7 +1,7 @@
 import tempfile
 import uuid
 from contextlib import contextmanager
-from dataclasses import dataclass, field, replace
+from dataclasses import dataclass, field
 from datetime import datetime, timedelta
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Union
@@ -99,19 +99,19 @@
 ) -> Dict[str, DataSource]:
     customer_ds = data_source_creator.create_data_source(
         datasets["customer"],
-        destination="customer_profile",
+        destination_name="customer_profile",
         event_timestamp_column="event_timestamp",
         created_timestamp_column="created",
     )
     driver_ds = data_source_creator.create_data_source(
         datasets["driver"],
-        destination="driver_hourly",
+        destination_name="driver_hourly",
         event_timestamp_column="event_timestamp",
         created_timestamp_column="created",
     )
     orders_ds = data_source_creator.create_data_source(
         datasets["orders"],
-        destination="orders",
+        destination_name="orders",
         event_timestamp_column="event_timestamp",
         created_timestamp_column="created",
     )
@@ -152,55 +152,6 @@
     return None
 
 
-def vary_full_feature_names(
-    configs: List[IntegrationTestRepoConfig],
-) -> List[IntegrationTestRepoConfig]:
-    new_configs = []
-    for c in configs:
-        true_c = replace(c, full_feature_names=True)
-        false_c = replace(c, full_feature_names=False)
-        new_configs.extend([true_c, false_c])
-    return new_configs
-
-
-def vary_infer_event_timestamp_col(
-    configs: List[IntegrationTestRepoConfig],
-) -> List[IntegrationTestRepoConfig]:
-    new_configs = []
-    for c in configs:
-        true_c = replace(c, infer_event_timestamp_col=True)
-        false_c = replace(c, infer_event_timestamp_col=False)
-        new_configs.extend([true_c, false_c])
-    return new_configs
-
-
-def vary_infer_feature(
-    configs: List[IntegrationTestRepoConfig],
-) -> List[IntegrationTestRepoConfig]:
-    new_configs = []
-    for c in configs:
-        true_c = replace(c, infer_features=True)
-        false_c = replace(c, infer_features=False)
-        new_configs.extend([true_c, false_c])
-    return new_configs
-
-
-def vary_providers_for_offline_stores(
-    configs: List[IntegrationTestRepoConfig],
-) -> List[IntegrationTestRepoConfig]:
-    new_configs = []
-    for c in configs:
-        if "FileDataSourceCreator" in c.offline_store_creator:
-            new_configs.append(c)
-        elif "RedshiftDataSourceCreator" in c.offline_store_creator:
-            for p in ["local", "aws"]:
-                new_configs.append(replace(c, provider=p))
-        elif "BigQueryDataSourceCreator" in c.offline_store_creator:
-            for p in ["local", "gcp"]:
-                new_configs.append(replace(c, provider=p))
-    return new_configs
-
-
 @contextmanager
 def construct_test_environment(
     test_repo_config: IntegrationTestRepoConfig,

diff --git a/sdk/python/tests/integration/feature_repos/universal/data_source_creator.py b/sdk/python/tests/integration/feature_repos/universal/data_source_creator.py
index d65a91bd2f..e0d6983bf1 100644
--- a/sdk/python/tests/integration/feature_repos/universal/data_source_creator.py
+++ b/sdk/python/tests/integration/feature_repos/universal/data_source_creator.py
@@ -12,7 +12,7 @@
     def create_data_source(
         self,
         df: pd.DataFrame,
-        destination: str,
+        destination_name: str,
         event_timestamp_column="ts",
         created_timestamp_column="created_ts",
         field_mapping: Dict[str, str] = None,
@@ -22,13 +22,16 @@
         persist the dataframe in offline store, using the destination string as a way to differentiate
         multiple dataframes and data sources.
 
-        :param df: The dataframe to be used to create the data source.
-        :param destination: The destination that this data frame is meant for. This str is used by the implementing
-        classes to isolate the multiple dataframes from each other.
-        :param event_timestamp_column: Pass through for the underlying data source.
-        :param created_timestamp_column: Pass through for the underlying data source.
-        :param field_mapping: Pass through for the underlying data source.
-        :return: A Data source object, pointing to a table or file that is uploaded/persisted for the purpose of the
+        Args:
+            df: The dataframe to be used to create the data source.
+            destination_name: This str is used by the implementing classes to
+                isolate the multiple dataframes from each other.
+            event_timestamp_column: Pass through for the underlying data source.
+            created_timestamp_column: Pass through for the underlying data source.
+            field_mapping: Pass through for the underlying data source.
+
+        Returns:
+            A Data source object, pointing to a table or file that is uploaded/persisted for the purpose of the
             test.
         """
         ...

diff --git a/sdk/python/tests/integration/feature_repos/universal/data_sources/bigquery.py b/sdk/python/tests/integration/feature_repos/universal/data_sources/bigquery.py
index 0863d7bd25..9b702ebe6c 100644
--- a/sdk/python/tests/integration/feature_repos/universal/data_sources/bigquery.py
+++ b/sdk/python/tests/integration/feature_repos/universal/data_sources/bigquery.py
@@ -23,7 +23,7 @@
 
         self.tables = []
 
-    def get_dataset(self):
+    def create_dataset(self):
         if not self.dataset:
             self.dataset = bigquery.Dataset(self.dataset_id)
             print(f"Creating dataset: {self.dataset_id}")
@@ -32,7 +32,6 @@
                 1000 * 60 * 60 * 24 * 14
             )  # 2 weeks in milliseconds
             self.client.update_dataset(self.dataset, ["default_table_expiration_ms"])
-        return self.dataset
 
     def teardown(self):
@@ -50,30 +49,32 @@
     def create_data_source(
         self,
         df: pd.DataFrame,
-        destination: Optional[str] = None,
+        destination_name: Optional[str] = None,
         event_timestamp_column="ts",
         created_timestamp_column="created_ts",
         field_mapping: Dict[str, str] = None,
         **kwargs,
     ) -> DataSource:
 
-        destination = self.get_prefixed_table_name(destination)
+        destination_name = self.get_prefixed_table_name(destination_name)
 
-        self.get_dataset()
+        self.create_dataset()
 
         job_config = bigquery.LoadJobConfig()
-        if self.gcp_project not in destination:
-            destination = f"{self.gcp_project}.{self.project_name}.{destination}"
+        if self.gcp_project not in destination_name:
+            destination_name = (
+                f"{self.gcp_project}.{self.project_name}.{destination_name}"
+            )
 
         job = self.client.load_table_from_dataframe(
-            df, destination, job_config=job_config
+            df, destination_name, job_config=job_config
         )
         job.result()
 
-        self.tables.append(destination)
+        self.tables.append(destination_name)
 
         return BigQuerySource(
-            table_ref=destination,
+            table_ref=destination_name,
             event_timestamp_column=event_timestamp_column,
             created_timestamp_column=created_timestamp_column,
             date_partition_column="",

diff --git a/sdk/python/tests/integration/feature_repos/universal/data_sources/file.py b/sdk/python/tests/integration/feature_repos/universal/data_sources/file.py
index 7d204ed7b1..0d402b2314 100644
--- a/sdk/python/tests/integration/feature_repos/universal/data_sources/file.py
+++ b/sdk/python/tests/integration/feature_repos/universal/data_sources/file.py
@@ -26,16 +26,18 @@
     def create_data_source(
         self,
         df: pd.DataFrame,
-        destination: str,
+        destination_name: str,
         event_timestamp_column="ts",
         created_timestamp_column="created_ts",
         field_mapping: Dict[str, str] = None,
     ) -> DataSource:
 
-        destination = self.get_prefixed_table_name(destination)
+        destination_name = self.get_prefixed_table_name(destination_name)
 
         f = tempfile.NamedTemporaryFile(
-            prefix=f"{self.project_name}_{destination}", suffix=".parquet", delete=False
+            prefix=f"{self.project_name}_{destination_name}",
+            suffix=".parquet",
+            delete=False,
         )
 
         df.to_parquet(f.name)
         self.files.append(f)
@@ -102,13 +104,13 @@ def _upload_parquet_file(self, df, file_name, minio_endpoint):
     def create_data_source(
         self,
         df: pd.DataFrame,
-        destination: Optional[str] = None,
+        destination_name: Optional[str] = None,
         suffix: Optional[str] = None,
         event_timestamp_column="ts",
         created_timestamp_column="created_ts",
         field_mapping: Dict[str, str] = None,
     ) -> DataSource:
-        filename = f"{destination}.parquet"
+        filename = f"{destination_name}.parquet"
         port = self.minio.get_exposed_port("9000")
         host = self.minio.get_container_host_ip()
         minio_endpoint = f"{host}:{port}"

diff --git a/sdk/python/tests/integration/feature_repos/universal/data_sources/redshift.py b/sdk/python/tests/integration/feature_repos/universal/data_sources/redshift.py
index 528e0261f8..88780f07a0 100644
--- a/sdk/python/tests/integration/feature_repos/universal/data_sources/redshift.py
+++ b/sdk/python/tests/integration/feature_repos/universal/data_sources/redshift.py
@@ -34,14 +34,14 @@
     def create_data_source(
         self,
         df: pd.DataFrame,
-        destination: str,
+        destination_name: str,
         suffix: Optional[str] = None,
         event_timestamp_column="ts",
         created_timestamp_column="created_ts",
         field_mapping: Dict[str, str] = None,
     ) -> DataSource:
 
-        destination = self.get_prefixed_table_name(destination)
+        destination_name = self.get_prefixed_table_name(destination_name)
 
         aws_utils.upload_df_to_redshift(
             self.client,
@@ -49,16 +49,16 @@
             self.offline_store_config.database,
             self.offline_store_config.user,
             self.s3,
-            f"{self.offline_store_config.s3_staging_location}/copy/{destination}.parquet",
+            f"{self.offline_store_config.s3_staging_location}/copy/{destination_name}.parquet",
             self.offline_store_config.iam_role,
-            destination,
+            destination_name,
             df,
         )
 
-        self.tables.append(destination)
+        self.tables.append(destination_name)
 
         return RedshiftSource(
-            table=destination,
+            table=destination_name,
             event_timestamp_column=event_timestamp_column,
             created_timestamp_column=created_timestamp_column,
             date_partition_column="",

From e0ccd9b8babd0f77fee07f25f1760a11a5267c8e Mon Sep 17 00:00:00 2001
From: Achal Shah
Date: Wed, 1 Sep 2021 11:15:23 -0700
Subject: [PATCH 48/51] use the actual ref to data source creators

Signed-off-by: Achal Shah
---
 .../feature_repos/repo_configuration.py       | 35 +++++++++----------
 1 file changed, 17 insertions(+), 18 deletions(-)

diff --git a/sdk/python/tests/integration/feature_repos/repo_configuration.py b/sdk/python/tests/integration/feature_repos/repo_configuration.py
index 8a90fd54d6..00c439ca89 100644
--- a/sdk/python/tests/integration/feature_repos/repo_configuration.py
+++ b/sdk/python/tests/integration/feature_repos/repo_configuration.py
@@ -4,15 +4,24 @@
 from dataclasses import dataclass, field
 from datetime import datetime, timedelta
 from pathlib import Path
-from typing import Any, Dict, List, Optional, Union
+from typing import Any, Dict, List, Optional, Type, Union
 
 import pandas as pd
 
-from feast import FeatureStore, FeatureView, RepoConfig, driver_test_data, importer
+from feast import FeatureStore, FeatureView, RepoConfig, driver_test_data
 from feast.data_source import DataSource
 from tests.integration.feature_repos.universal.data_source_creator import (
     DataSourceCreator,
 )
+from tests.integration.feature_repos.universal.data_sources.bigquery import (
+    BigQueryDataSourceCreator,
+)
+from tests.integration.feature_repos.universal.data_sources.file import (
+    FileDataSourceCreator,
+)
+from tests.integration.feature_repos.universal.data_sources.redshift import (
+    RedshiftDataSourceCreator,
+)
 from tests.integration.feature_repos.universal.feature_views import (
     create_customer_daily_profile_feature_view,
     create_driver_hourly_stats_feature_view,
@@ -28,17 +37,13 @@ class IntegrationTestRepoConfig:
 
     provider: str = "local"
     online_store: Union[str, Dict] = "sqlite"
 
-    offline_store_creator: str = "tests.integration.feature_repos.universal.data_sources.file.FileDataSourceCreator"
+    offline_store_creator: Type[DataSourceCreator] = FileDataSourceCreator
 
     full_feature_names: bool = True
     infer_event_timestamp_col: bool = True
     infer_features: bool = False
 
 
-def ds_creator_path(cls: str):
-    return f"tests.integration.feature_repos.universal.data_sources.{cls}"
-
-
 DYNAMO_CONFIG = {"type": "dynamodb", "region": "us-west-2"}
 REDIS_CONFIG = {"type": "redis", "connection_string": "localhost:6379,db=0"}
 FULL_REPO_CONFIGS: List[IntegrationTestRepoConfig] = [
@@ -48,23 +53,23 @@
     # GCP configurations
     IntegrationTestRepoConfig(
         provider="gcp",
-        offline_store_creator=ds_creator_path("bigquery.BigQueryDataSourceCreator"),
+        offline_store_creator=BigQueryDataSourceCreator,
         online_store="datastore",
     ),
     IntegrationTestRepoConfig(
         provider="gcp",
-        offline_store_creator=ds_creator_path("bigquery.BigQueryDataSourceCreator"),
+        offline_store_creator=BigQueryDataSourceCreator,
         online_store=REDIS_CONFIG,
     ),
     # AWS configurations
     IntegrationTestRepoConfig(
         provider="aws",
-        offline_store_creator=ds_creator_path("redshift.RedshiftDataSourceCreator"),
+        offline_store_creator=RedshiftDataSourceCreator,
         online_store=DYNAMO_CONFIG,
     ),
     IntegrationTestRepoConfig(
         provider="aws",
-        offline_store_creator=ds_creator_path("redshift.RedshiftDataSourceCreator"),
+        offline_store_creator=RedshiftDataSourceCreator,
         online_store=REDIS_CONFIG,
     ),
 ]
@@ -159,13 +164,7 @@
 ) -> Environment:
     project = f"{test_suite_name}_{str(uuid.uuid4()).replace('-', '')[:8]}"
 
-    module_name, config_class_name = test_repo_config.offline_store_creator.rsplit(
-        ".", 1
-    )
-
-    offline_creator: DataSourceCreator = importer.get_class_from_type(
-        module_name, config_class_name, "DataSourceCreator"
-    )(project)
+    offline_creator: DataSourceCreator = test_repo_config.offline_store_creator(project)
 
     offline_store_config = offline_creator.create_offline_store_config()
     online_store = test_repo_config.online_store
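A note on the change above: replacing the dotted-path string with the class object itself moves failures from test runtime to import time, and lets the Type[DataSourceCreator] annotation be checked statically. Roughly, the removed pattern looked like this (a sketch using importlib in place of feast's importer helper):

    import importlib

    def class_from_path(path: str) -> type:
        # Resolve "pkg.module.ClassName" to the class object at runtime;
        # a typo in the string only surfaces when this line executes.
        module_name, class_name = path.rsplit(".", 1)
        return getattr(importlib.import_module(module_name), class_name)

With direct references, the configuration simply holds FileDataSourceCreator, BigQueryDataSourceCreator, or RedshiftDataSourceCreator, and a misspelled name fails immediately on import.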
From a510ca6029073857cc35e34c5865a7f34714bbc9 Mon Sep 17 00:00:00 2001
From: Achal Shah
Date: Wed, 1 Sep 2021 14:27:14 -0700
Subject: [PATCH 49/51] format

Signed-off-by: Achal Shah
---
 sdk/python/tests/conftest.py                  |  8 +--
 .../registration/test_universal_types.py      | 49 ++++++++++++-------
 2 files changed, 36 insertions(+), 21 deletions(-)

diff --git a/sdk/python/tests/conftest.py b/sdk/python/tests/conftest.py
index a25e134343..605787b234 100644
--- a/sdk/python/tests/conftest.py
+++ b/sdk/python/tests/conftest.py
@@ -28,7 +28,9 @@
     construct_universal_datasets,
     construct_universal_entities,
 )
-from tests.integration.feature_repos.universal.data_sources.bigquery import BigQueryDataSourceCreator
+from tests.integration.feature_repos.universal.data_sources.bigquery import (
+    BigQueryDataSourceCreator,
+)
 
 
 def pytest_configure(config):
@@ -126,9 +128,7 @@ def e2e_data_sources(environment: Environment):
     df = create_dataset()
     data_source = environment.data_source_creator.create_data_source(
-        df,
-        environment.feature_store.project,
-        field_mapping={"ts_1": "ts"},
+        df, environment.feature_store.project, field_mapping={"ts_1": "ts"},
     )
 
     yield df, data_source

diff --git a/sdk/python/tests/integration/registration/test_universal_types.py b/sdk/python/tests/integration/registration/test_universal_types.py
index fd18875faf..bfd2cb0292 100644
--- a/sdk/python/tests/integration/registration/test_universal_types.py
+++ b/sdk/python/tests/integration/registration/test_universal_types.py
@@ -2,15 +2,20 @@
 
 import pandas as pd
 import pytest
 
-from tests.data.data_creator import get_feature_values_for_dtype, create_dataset
-from tests.integration.feature_repos.universal.entities import driver
-from tests.integration.feature_repos.universal.feature_views import driver_feature_view
 from feast.infra.offline_stores.offline_store import RetrievalJob
 from feast.type_map import python_type_to_feast_value_type
 from feast.value_type import ValueType
-from tests.integration.feature_repos.repo_configuration import IntegrationTestRepoConfig, construct_test_environment
-from tests.integration.feature_repos.universal.data_sources.bigquery import BigQueryDataSourceCreator
+from tests.data.data_creator import create_dataset, get_feature_values_for_dtype
+from tests.integration.feature_repos.repo_configuration import (
+    IntegrationTestRepoConfig,
+    construct_test_environment,
+)
+from tests.integration.feature_repos.universal.data_sources.bigquery import (
+    BigQueryDataSourceCreator,
+)
+from tests.integration.feature_repos.universal.entities import driver
+from tests.integration.feature_repos.universal.feature_views import driver_feature_view
 
 
 def entity_feature_types_ids(entity_type: ValueType, feature_dtype: str):
@@ -50,7 +55,7 @@ def test_entity_inference_types_match(entity_type, feature_dtype, feature_is_lis
         data_source = environment.data_source_creator.create_data_source(
             df,
             destination_name=environment.feature_store.project,
-            field_mapping={"ts_1": "ts"}
+            field_mapping={"ts_1": "ts"},
         )
         fv = create_feature_view(feature_dtype, feature_is_list, data_source)
         fs = environment.feature_store
@@ -88,13 +93,15 @@
 @pytest.mark.parametrize(
     "feature_is_list", [True, False], ids=lambda v: f"feature_is_list:{str(v)}"
 )
-def test_feature_get_historical_features_types_match(entity_type, feature_dtype, feature_is_list):
+def test_feature_get_historical_features_types_match(
+    entity_type, feature_dtype, feature_is_list
+):
     with construct_test_environment(GCP_CONFIG) as environment:
         df = create_dataset(entity_type, feature_dtype, feature_is_list)
         data_source = environment.data_source_creator.create_data_source(
             df,
             destination_name=environment.feature_store.project,
-            field_mapping={"ts_1": "ts"}
+            field_mapping={"ts_1": "ts"},
         )
         fv = create_feature_view(feature_dtype, feature_is_list, data_source)
         fs = environment.feature_store
@@ -104,16 +111,16 @@
             features = [f"{fv.name}:value"]
 
             df = pd.DataFrame()
-            df["driver_id"] = (
-                ["1", "3"] if entity_type == ValueType.STRING else [1, 3]
-            )
+            df["driver_id"] = ["1", "3"] if entity_type == ValueType.STRING else [1, 3]
             now = datetime.utcnow()
             ts = pd.Timestamp(now).round("ms")
             df["ts"] = [
                 ts - timedelta(hours=4),
                 ts - timedelta(hours=2),
             ]
-            historical_features = fs.get_historical_features(entity_df=df, features=features,)
+            historical_features = fs.get_historical_features(
+                entity_df=df, features=features,
+            )
 
             # TODO(adchia): pandas doesn't play well with nan values in ints. BQ will also coerce to floats if there are NaNs
             historical_features_df = historical_features.to_df()
@@ -121,8 +128,12 @@
             if feature_is_list:
                 assert_feature_list_types(feature_dtype, historical_features_df)
             else:
-                assert_expected_historical_feature_types(feature_dtype, historical_features_df)
-            assert_expected_arrow_types(feature_dtype, feature_is_list, historical_features)
+                assert_expected_historical_feature_types(
+                    feature_dtype, historical_features_df
+                )
+            assert_expected_arrow_types(
+                feature_dtype, feature_is_list, historical_features
+            )
         finally:
             environment.data_source_creator.teardown()
@@ -139,13 +150,15 @@
 @pytest.mark.parametrize(
     "feature_is_list", [False], ids=lambda v: f"feature_is_list:{str(v)}"
 )
-def test_feature_get_online_features_types_match(entity_type, feature_dtype, feature_is_list):
+def test_feature_get_online_features_types_match(
+    entity_type, feature_dtype, feature_is_list
+):
     with construct_test_environment(GCP_CONFIG) as environment:
         df = create_dataset(entity_type, feature_dtype, feature_is_list)
         data_source = environment.data_source_creator.create_data_source(
             df,
             destination_name=environment.feature_store.project,
-            field_mapping={"ts_1": "ts"}
+            field_mapping={"ts_1": "ts"},
         )
         fv = create_feature_view(feature_dtype, feature_is_list, data_source)
         fs = environment.feature_store
@@ -170,7 +183,9 @@
             }
             assert (
                 type(online_features["value"][0]).__name__
-                == feature_list_dtype_to_expected_online_response_value_type[feature_dtype]
+                == feature_list_dtype_to_expected_online_response_value_type[
+                    feature_dtype
+                ]
             )
         finally:
             environment.data_source_creator.teardown()

From 6c105020455e652e07f86220610cc96d1c54d60a Mon Sep 17 00:00:00 2001
From: Achal Shah
Date: Wed, 1 Sep 2021 14:28:11 -0700
Subject: [PATCH 50/51] Remove unused imports

Signed-off-by: Achal Shah
---
 sdk/python/tests/conftest.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/sdk/python/tests/conftest.py b/sdk/python/tests/conftest.py
index 605787b234..76812825b3 100644
--- a/sdk/python/tests/conftest.py
+++ b/sdk/python/tests/conftest.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import multiprocessing
-import uuid
 from datetime import datetime, timedelta
 from sys import platform
 
@@ -28,9 +27,6 @@
     construct_universal_datasets,
     construct_universal_entities,
 )
-from tests.integration.feature_repos.universal.data_sources.bigquery import (
-    BigQueryDataSourceCreator,
-)
 
 
 def pytest_configure(config):

From ff1fb8175b3b9d6b27b35e0426bcd1073336cf4f Mon Sep 17 00:00:00 2001
From: Achal Shah
Date: Wed, 1 Sep 2021 14:38:29 -0700
Subject: [PATCH 51/51] Add ids for pytest params

Signed-off-by: Achal Shah
---
 sdk/python/tests/conftest.py                          | 10 +++++++++-
 .../integration/registration/test_universal_types.py  |  1 -
 2 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/sdk/python/tests/conftest.py b/sdk/python/tests/conftest.py
index 76812825b3..55bdeb3a7d 100644
--- a/sdk/python/tests/conftest.py
+++ b/sdk/python/tests/conftest.py
@@ -99,7 +99,9 @@
     return pd.DataFrame.from_dict(data)
 
 
-@pytest.fixture(params=FULL_REPO_CONFIGS, scope="session")
+@pytest.fixture(
+    params=FULL_REPO_CONFIGS, scope="session", ids=[str(c) for c in FULL_REPO_CONFIGS]
+)
 def environment(request):
     with construct_test_environment(request.param) as e:
         yield e
@@ -130,3 +132,9 @@
     yield df, data_source
 
     environment.data_source_creator.teardown()
+
+
+@pytest.fixture(params=FULL_REPO_CONFIGS, scope="session")
+def type_test_environment(request):
+    with construct_test_environment(request.param) as e:
+        yield e

diff --git a/sdk/python/tests/integration/registration/test_universal_types.py b/sdk/python/tests/integration/registration/test_universal_types.py
index bfd2cb0292..5390b72cc0 100644
--- a/sdk/python/tests/integration/registration/test_universal_types.py
+++ b/sdk/python/tests/integration/registration/test_universal_types.py
@@ -33,7 +33,6 @@ def entity_feature_types_ids(entity_type: ValueType, feature_dtype: str):
     offline_store_creator=BigQueryDataSourceCreator,
     online_store="datastore",
 )
-feature_is_list = [True, False]
 
 # TODO: change parametrization to allow for other providers aside from gcp
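On the ids added in this last patch: without explicit ids, pytest labels the params of a session-scoped fixture environment0, environment1, and so on, which makes it hard to tell which provider/online-store combination failed. Because IntegrationTestRepoConfig is declared with repr=True, str(c) yields a readable label for each configuration. A self-contained illustration of the mechanics (toy Config class, not the real FULL_REPO_CONFIGS):

    import pytest

    class Config:
        def __init__(self, provider: str, online_store: str) -> None:
            self.provider = provider
            self.online_store = online_store

        def __repr__(self) -> str:
            return f"{self.provider}:{self.online_store}"

    CONFIGS = [Config("local", "sqlite"), Config("gcp", "datastore")]

    # ids turns test ids into test_provider[local:sqlite] and
    # test_provider[gcp:datastore] instead of test_provider[config0] etc.
    @pytest.fixture(params=CONFIGS, ids=[str(c) for c in CONFIGS])
    def config(request):
        return request.param

    def test_provider(config):
        assert config.provider in {"local", "gcp"}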