Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: Fix incorrect on demand feature view diffing and improve Java tests #3074

Merged
merged 8 commits into from
Aug 11, 2022
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 39 additions & 3 deletions .github/workflows/java_master_only.yml
Original file line number Diff line number Diff line change
Expand Up @@ -95,10 +95,46 @@ jobs:
java-version: '11'
java-package: jdk
architecture: x64
- uses: actions/setup-python@v2
- name: Setup Python (to call feast apply)
uses: actions/setup-python@v2
id: setup-python
with:
python-version: '3.8'
architecture: 'x64'
python-version: 3.8
architecture: x64
- name: Setup Go
id: setup-go
uses: actions/setup-go@v2
with:
go-version: 1.18.0
- name: Upgrade pip version
run: |
pip install --upgrade "pip>=21.3.1,<22.1"
- name: Get pip cache dir
id: pip-cache
run: |
echo "::set-output name=dir::$(pip cache dir)"
- name: pip cache
uses: actions/cache@v2
with:
path: |
${{ steps.pip-cache.outputs.dir }}
/opt/hostedtoolcache/Python
/Users/runner/hostedtoolcache/Python
key: ${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-pip-${{ hashFiles(format('**/py{0}-ci-requirements.txt', env.PYTHON)) }}
restore-keys: |
${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-pip-
- name: Install pip-tools
run: pip install pip-tools
- name: Install apache-arrow on ubuntu
run: |
sudo apt update
sudo apt install -y -V ca-certificates lsb-release wget
wget https://apache.jfrog.io/artifactory/arrow/$(lsb_release --id --short | tr 'A-Z' 'a-z')/apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb
sudo apt install -y -V ./apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb
sudo apt update
sudo apt install -y -V libarrow-dev
- name: Install Python dependencies
run: make install-python-ci-dependencies
- uses: actions/cache@v2
with:
path: ~/.m2/repository
Expand Down
40 changes: 40 additions & 0 deletions .github/workflows/java_pr.yml
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,46 @@ jobs:
aws-region: us-west-2
- name: Use AWS CLI
run: aws sts get-caller-identity
- name: Setup Python (to call feast apply)
uses: actions/setup-python@v2
id: setup-python
with:
python-version: 3.8
architecture: x64
- name: Setup Go
id: setup-go
uses: actions/setup-go@v2
with:
go-version: 1.18.0
- name: Upgrade pip version
run: |
pip install --upgrade "pip>=21.3.1,<22.1"
- name: Get pip cache dir
id: pip-cache
run: |
echo "::set-output name=dir::$(pip cache dir)"
- name: pip cache
uses: actions/cache@v2
with:
path: |
${{ steps.pip-cache.outputs.dir }}
/opt/hostedtoolcache/Python
/Users/runner/hostedtoolcache/Python
key: ${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-pip-${{ hashFiles(format('**/py{0}-ci-requirements.txt', env.PYTHON)) }}
restore-keys: |
${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-pip-
- name: Install pip-tools
run: pip install pip-tools
- name: Install apache-arrow on ubuntu
run: |
sudo apt update
sudo apt install -y -V ca-certificates lsb-release wget
wget https://apache.jfrog.io/artifactory/arrow/$(lsb_release --id --short | tr 'A-Z' 'a-z')/apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb
sudo apt install -y -V ./apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb
sudo apt update
sudo apt install -y -V libarrow-dev
- name: Install Python dependencies
run: make install-python-ci-dependencies
- name: Run integration tests
run: make test-java-integration
- name: Save report
Expand Down
2 changes: 2 additions & 0 deletions java/serving/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -136,4 +136,6 @@ Unit &amp; Integration Tests can be used to verify functionality:
mvn test -pl serving --also-make
# run integration tests
mvn verify -pl serving --also-make
# run integration tests with debugger
mvn -Dmaven.failsafe.debug verify -pl serving --also-make
```
22 changes: 22 additions & 0 deletions java/serving/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,28 @@
</configuration>
</plugin>

<!-- Call feast apply before running integration tests -->
<plugin>
<groupId>org.codehaus.mojo</groupId>
<artifactId>exec-maven-plugin</artifactId>
<version>1.6.0</version>
<executions>
<execution>
<configuration>
<executable>python</executable>
<workingDirectory>src/test/resources/docker-compose/feast10/</workingDirectory>
<arguments>
<argument>setup_it.py</argument>
</arguments>
</configuration>
<id>feast_test_apply</id>
<phase>process-test-resources</phase>
<goals>
<goal>exec</goal>
</goals>
</execution>
</executions>
</plugin>
</plugins>
</build>

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -75,17 +75,17 @@ def transformed_conv_rate(features_df: pd.DataFrame) -> pd.DataFrame:

entity = Entity(name="entity")

benchmark_feature_views = [
FeatureView(
benchmark_feature_views = []
for i in range(25):
fv = FeatureView(
name=f"feature_view_{i}",
entities=[entity],
ttl=timedelta(seconds=86400),
schema=[Field(name=f"feature_{10 * i + j}", dtype=Int64) for j in range(10)],
online=True,
source=generated_data_source,
)
for i in range(25)
]
benchmark_feature_views.append(fv)

benchmark_feature_service = FeatureService(
name=f"benchmark_feature_service", features=benchmark_feature_views,
Expand Down
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
from pathlib import Path
from feast.repo_config import load_repo_config
from datetime import datetime, timedelta

import numpy as np
import pandas as pd

from definitions import (
benchmark_feature_service,
benchmark_feature_views,
driver,
driver_hourly_stats_view,
entity,
transformed_conv_rate,
)

from feast import FeatureStore


def setup_data():
start = datetime.now() - timedelta(days=10)

df = pd.DataFrame()
df["driver_id"] = np.arange(1000, 1010)
df["created"] = datetime.now()
df["conv_rate"] = np.arange(0, 1, 0.1)
df["acc_rate"] = np.arange(0.5, 1, 0.05)
df["avg_daily_trips"] = np.arange(0, 1000, 100)

# some of rows are beyond 7 days to test OUTSIDE_MAX_AGE status
df["event_timestamp"] = start + pd.Series(np.arange(0, 10)).map(
lambda days: timedelta(days=days)
)

# Store data in parquet files. Parquet is convenient for local development mode. For
# production, you can use your favorite DWH, such as BigQuery. See Feast documentation
# for more info.
df.to_parquet("driver_stats.parquet")

# For Benchmarks
# Please read more in Feast RFC-031
# (link https://docs.google.com/document/d/12UuvTQnTTCJhdRgy6h10zSbInNGSyEJkIxpOcgOen1I/edit)
# about this benchmark setup
def generate_data(
num_rows: int, num_features: int, destination: str
) -> pd.DataFrame:
features = [f"feature_{i}" for i in range(num_features)]
columns = ["entity", "event_timestamp"] + features
df = pd.DataFrame(0, index=np.arange(num_rows), columns=columns)
df["event_timestamp"] = datetime.utcnow()
for column in features:
df[column] = np.random.randint(1, num_rows, num_rows)

df["entity"] = "key-" + pd.Series(np.arange(1, num_rows + 1)).astype(
pd.StringDtype()
)

df.to_parquet(destination)

generate_data(10**3, 250, "benchmark_data.parquet")


def main():
print("Running setup_it.py")

setup_data()
existing_repo_config = load_repo_config(Path("."))

# Update to default online store since otherwise, relies on Dockerized Redis service
fs = FeatureStore(config=existing_repo_config.copy(update={"online_store": {}}))
fs.apply(
[
driver_hourly_stats_view,
transformed_conv_rate,
driver,
entity,
benchmark_feature_service,
*benchmark_feature_views,
]
)

print("setup_it finished")


if __name__ == "__main__":
main()
4 changes: 2 additions & 2 deletions sdk/python/feast/diff/registry_diff.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,8 +144,8 @@ def diff_registry_objects(
continue
elif getattr(current_spec, _field.name) != getattr(new_spec, _field.name):
if _field.name == "user_defined_function":
current_spec = cast(OnDemandFeatureViewSpec, current_proto)
new_spec = cast(OnDemandFeatureViewSpec, new_proto)
current_spec = cast(OnDemandFeatureViewSpec, current_spec)
new_spec = cast(OnDemandFeatureViewSpec, new_spec)
current_udf = current_spec.user_defined_function
new_udf = new_spec.user_defined_function
for _udf_field in current_udf.DESCRIPTOR.fields:
Expand Down
50 changes: 50 additions & 0 deletions sdk/python/tests/unit/diff/test_registry_diff.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,14 @@
import pandas as pd

from feast import Field
from feast.diff.registry_diff import (
diff_registry_objects,
tag_objects_for_keep_delete_update_add,
)
from feast.entity import Entity
from feast.feature_view import FeatureView
from feast.on_demand_feature_view import on_demand_feature_view
from feast.types import String
from tests.utils.data_source_test_creator import prep_file_source


Expand Down Expand Up @@ -89,3 +94,48 @@ def test_diff_registry_objects_feature_views(simple_dataset_1):
assert feast_object_diffs.feast_object_property_diffs[0].val_declared == {
"when": "after"
}


def test_diff_odfv(simple_dataset_1):
with prep_file_source(df=simple_dataset_1, timestamp_field="ts_1") as file_source:
entity = Entity(name="id", join_keys=["id"])
fv = FeatureView(
name="fv2",
entities=[entity],
source=file_source,
tags={"when": "before"},
)

@on_demand_feature_view(
sources=[fv],
schema=[Field(name="first_char", dtype=String)],
)
def pre_changed(inputs: pd.DataFrame) -> pd.DataFrame:
df = pd.DataFrame()
df["first_char"] = inputs["string_col"].str[:1].astype("string")
return df

@on_demand_feature_view(
sources=[fv],
schema=[Field(name="first_char", dtype=String)],
)
def post_changed(inputs: pd.DataFrame) -> pd.DataFrame:
df = pd.DataFrame()
df["first_char"] = inputs["string_col"].str[:1].astype("string") + "hi"
return df

feast_object_diffs = diff_registry_objects(
pre_changed, pre_changed, "on demand feature view"
)
assert len(feast_object_diffs.feast_object_property_diffs) == 0

feast_object_diffs = diff_registry_objects(
pre_changed, post_changed, "on demand feature view"
)

# Note that user_defined_function.body is excluded because it always changes (dill is non-deterministic), even
# if no code is changed
assert len(feast_object_diffs.feast_object_property_diffs) == 3
assert feast_object_diffs.feast_object_property_diffs[0].property_name == "name"
assert feast_object_diffs.feast_object_property_diffs[1].property_name == "user_defined_function.name"
assert feast_object_diffs.feast_object_property_diffs[2].property_name == "user_defined_function.body_text"