Algattik/654 fraud scenario (feast-dev#22)

Closes KE-654. Adds a third data science scenario: ingestion of data for fraud detection.
Yanson · Jun 11, 2020 · 76deb09 · 76deb09
1 parent 24c0486
commit 76deb09
Show file tree

Hide file tree

Showing 4 changed files with 46 additions and 2 deletions.
diff --git a/infra/scripts/test-docker-compose-databricks.sh b/infra/scripts/test-docker-compose-databricks.sh
@@ -17,7 +17,7 @@ clean_up () {
     docker-compose $COMPOSE_ARGS down
 
     # Remove configuration file
-    rm .env
+    rm -f .env
 
     exit $ARG
 }

diff --git a/infra/scripts/test-docker-compose.sh b/infra/scripts/test-docker-compose.sh
@@ -15,7 +15,7 @@ clean_up () {
     docker-compose -f docker-compose.yml -f docker-compose.online.yml down
 
     # Remove configuration file
-    rm .env
+    rm -f .env
 
     exit $ARG
 }

diff --git a/tests/ds_scenarios/ds_fraud_feature_data.py b/tests/ds_scenarios/ds_fraud_feature_data.py
@@ -0,0 +1,40 @@
+import datetime 
+import numpy as np
+import pandas as pd
+from feast import Feature, FeatureSet, Entity, ValueType
+from pytz import utc
+
+"""
+
+Fraud features: per-customer counts for different windows of time (15M throughout day):
+- FR1-7 (columns window_count1 .. window_count7): int
+
+"""
+# Feast feature set for the fraud-detection scenario. Each row is keyed by an
+# INT64 customer_id entity and carries seven INT64 counters, window_count1
+# through window_count7.
+FRAUD_COUNTS_FEATURE_SET = FeatureSet(
+    'fraud_count_features',
+    entities=[
+        Entity('customer_id', ValueType.INT64),
+    ],
+    features=[
+        Feature('window_count{}'.format(index), ValueType.INT64)
+        for index in range(1, 8)
+    ],
+)
+
+def create_fraud_counts_df(initial_customer_id=1, n=1000, dt=None):
+    """Return n rows of random fraud window counts for consecutive customer ids.
+
+    dt is the timestamp shared by every row (default: the current time in UTC).
+    window_count<i> holds int64 values drawn uniformly from 1..bound_i inclusive.
+    """
+    if dt is None:
+        dt = datetime.datetime.now(datetime.timezone.utc)
+    customer_ids = range(initial_customer_id, initial_customer_id + n)
+    data = {'datetime': dt, 'customer_id': list(customer_ids)}
+    # randint supersedes the deprecated random_integers; its upper bound is exclusive.
+    for i, high in enumerate((10, 20, 50, 100, 1000, 2000, 5000), start=1):
+        data['window_count{}'.format(i)] = np.random.randint(1, high + 1, size=n, dtype=np.int64)
+    return pd.DataFrame(data)
diff --git a/tests/ds_scenarios/test-ingest.py b/tests/ds_scenarios/test-ingest.py
@@ -7,6 +7,9 @@
     PRODUCT_IMAGE_FEATURE_SET, create_product_image_features_df,
     PRODUCT_TEXT_ATTRIBUTE_FEATURE_SET, create_product_text_attributes_df,
 )
+from ds_fraud_feature_data import (
+    FRAUD_COUNTS_FEATURE_SET, create_fraud_counts_df,
+)
 
 PROJECT_NAME = 'ds_' + uuid.uuid4().hex.upper()[0:6]
 
@@ -48,6 +51,7 @@ def client(core_url, serving_url, allow_dirty):
 @pytest.mark.parametrize("data_frame_generator,feature_set", [
     (create_product_image_features_df, PRODUCT_IMAGE_FEATURE_SET),
     (create_product_text_attributes_df, PRODUCT_TEXT_ATTRIBUTE_FEATURE_SET),
+    (create_fraud_counts_df, FRAUD_COUNTS_FEATURE_SET),
 ])
 def test_ingestion(client, data_frame_generator, feature_set):
     client.apply(feature_set)