Update sample notebooks (fraud detection and recommendation examples) to use latest feathr api #921

Merged
Changes from all commits
Commits
23 commits
889f291
Update notebooks to use latest codes with extra notebook dependencies
loomlike Dec 7, 2022
62d7509
wip. Remove azure cli package from extra dependencies
loomlike Dec 7, 2022
83fc0ca
Update fraud detection demo notebook and add test
loomlike Dec 13, 2022
5e6a6bc
WIP debugging
loomlike Dec 13, 2022
c19b79f
Merge branch 'main' into jumin/update_get_result_df_examples
loomlike Dec 13, 2022
711090f
Update notebooks
loomlike Dec 13, 2022
2c0eac1
modify notebook test to go-around materialization issue
loomlike Dec 13, 2022
9531554
Change notebook parameter name to align with client argument
loomlike Dec 14, 2022
0c3d7ab
Update recommendation notebook
loomlike Dec 15, 2022
62e42c0
Update synapse example. Add azure-cli dependency to notebook dependen…
loomlike Dec 17, 2022
b13733b
Update data url constants to point the source github repo's raw files
loomlike Dec 19, 2022
bb0372d
add dataset url constants to init.py
loomlike Dec 19, 2022
94507ad
Update feature embedding notebook to use the original dataset from az…
loomlike Dec 19, 2022
a318cb4
Add recommendation sample notebook test
loomlike Dec 23, 2022
783c658
Merge branch 'main' into jumin/update_get_result_df_examples
loomlike Dec 27, 2022
c473374
Fix numpy.bool deprecation error
loomlike Dec 27, 2022
2044323
Change databricks cluster node size from Dv2 to DSv2
loomlike Dec 28, 2022
7b91b3e
Use Dv4 for databricks notebook test due to the limit of Dv2 quota at…
loomlike Dec 28, 2022
bad967d
Fix to use the supported vm size
loomlike Dec 28, 2022
73f60de
pin numpy to resolve conflict with pyspark
loomlike Dec 29, 2022
c1f791a
Add document intelligence sample notebook
loomlike Jan 3, 2023
4795914
Update fraud detection sample
loomlike Jan 5, 2023
9752e93
Merge branch 'main' into jumin/update_get_result_df_examples
loomlike Jan 5, 2023
725 changes: 367 additions & 358 deletions docs/samples/azure_synapse/product_recommendation_demo.ipynb

Large diffs are not rendered by default.

@@ -70,8 +70,8 @@
},
"outputs": [],
"source": [
"# Install feathr from the latest codes in the repo. You may use `pip install feathr` as well.\n",
"!pip install \"git+https://github.com/feathr-ai/feathr#subdirectory=feathr_project\""
"# Install feathr from the latest codes in the repo. You may use `pip install feathr[notebook]` as well.\n",
"!pip install \"git+https://github.com/feathr-ai/feathr.git#subdirectory=feathr_project&egg=feathr[notebook]\""
]
},
{
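The updated install cell pulls in the notebook extras (feathr[notebook]) instead of the bare package, so the extra dependencies the samples need are installed as well. As a rough sketch of the same two options in plain Python — the PyPI extra name is taken from the cell above, and the upstream GitHub URL is an assumption:

# Sketch: install feathr with the notebook extras, from the repo's latest code or from PyPI.
import subprocess
import sys

REPO_SPEC = "git+https://github.com/feathr-ai/feathr.git#subdirectory=feathr_project&egg=feathr[notebook]"  # assumed upstream URL
RELEASE_SPEC = "feathr[notebook]"

def install_feathr(from_repo: bool = False) -> None:
    """Install feathr[notebook] from the repo's latest code or the latest release."""
    spec = REPO_SPEC if from_repo else RELEASE_SPEC
    subprocess.check_call([sys.executable, "-m", "pip", "install", spec])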
1,249 changes: 1,249 additions & 0 deletions docs/samples/document_intelligence_with_azure_cognitive_search_skills.ipynb

Large diffs are not rendered by default.

1,634 changes: 832 additions & 802 deletions docs/samples/feature_embedding.ipynb

Large diffs are not rendered by default.

2,302 changes: 1,279 additions & 1,023 deletions docs/samples/fraud_detection_demo.ipynb

Large diffs are not rendered by default.

141 changes: 87 additions & 54 deletions docs/samples/nyc_taxi_demo.ipynb
@@ -63,14 +63,27 @@
"source": [
"## 1. Install Feathr and Necessary Dependancies\n",
"\n",
"Install feathr and necessary packages by running `pip install feathr[notebook]` if you haven't installed them already."
"Install feathr and necessary packages by running one of following commends if you haven't installed them already:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# To install feathr from the latest codes in the repo:\n",
"# !pip install \"git+https://github.com/feathr-ai/feathr.git#subdirectory=feathr_project&egg=feathr[notebook]\" \n",
"\n",
"# To install the latest release:\n",
"# !pip install feathr[notebook] "
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"%load_ext autoreload\n",
"%autoreload 2"
@@ -92,7 +105,6 @@
"from datetime import timedelta\n",
"import os\n",
"from pathlib import Path\n",
"from tempfile import TemporaryDirectory\n",
"\n",
"from pyspark.ml import Pipeline\n",
"from pyspark.ml.evaluation import RegressionEvaluator\n",
@@ -148,21 +160,26 @@
"outputs": [],
"source": [
"RESOURCE_PREFIX = None # TODO fill the value used to deploy the resources via ARM template\n",
"PROJECT_NAME = \"feathr_getting_started\"\n",
"PROJECT_NAME = \"nyc_taxi\"\n",
"\n",
"# Currently support: 'azure_synapse', 'databricks', and 'local' \n",
"SPARK_CLUSTER = \"local\"\n",
"\n",
"# TODO fill values to use databricks cluster:\n",
"DATABRICKS_CLUSTER_ID = None # Set Databricks cluster id to use an existing cluster\n",
"DATABRICKS_URL = None # Set Databricks workspace url to use databricks\n",
"DATABRICKS_CLUSTER_ID = None # Set Databricks cluster id to use an existing cluster\n",
"if is_databricks():\n",
" # If this notebook is running on Databricks, its context can be used to retrieve token and instance URL\n",
" ctx = dbutils.notebook.entry_point.getDbutils().notebook().getContext()\n",
" DATABRICKS_WORKSPACE_TOKEN_VALUE = ctx.apiToken().get()\n",
" SPARK_CONFIG__DATABRICKS__WORKSPACE_INSTANCE_URL = f\"https://{ctx.tags().get('browserHostName').get()}\"\n",
"else:\n",
" DATABRICKS_WORKSPACE_TOKEN_VALUE = None # Set Databricks workspace token to use databricks\n",
" SPARK_CONFIG__DATABRICKS__WORKSPACE_INSTANCE_URL = None # Set Databricks workspace url to use databricks\n",
"\n",
"# TODO fill values to use Azure Synapse cluster:\n",
"AZURE_SYNAPSE_SPARK_POOL = None # Set Azure Synapse Spark pool name\n",
"AZURE_SYNAPSE_URL = None # Set Azure Synapse workspace url to use Azure Synapse\n",
"\n",
"# Data store root path. Could be a local file system path, dbfs or Azure storage path like abfs or wasbs\n",
"DATA_STORE_PATH = TemporaryDirectory().name\n",
"AZURE_SYNAPSE_URL = None # Set Azure Synapse workspace url to use Azure Synapse\n",
"ADLS_KEY = None # Set Azure Data Lake Storage key to use Azure Synapse\n",
"\n",
"# An existing Feathr config file path. If None, we'll generate a new config based on the constants in this cell.\n",
"FEATHR_CONFIG_PATH = None\n",
@@ -207,20 +224,9 @@
"outputs": [],
"source": [
"if SPARK_CLUSTER == \"azure_synapse\" and not os.environ.get(\"ADLS_KEY\"):\n",
" os.environ[\"ADLS_KEY\"] = add_your_key_here\n",
" os.environ[\"ADLS_KEY\"] = ADLS_KEY\n",
"elif SPARK_CLUSTER == \"databricks\" and not os.environ.get(\"DATABRICKS_WORKSPACE_TOKEN_VALUE\"):\n",
" os.environ[\"DATABRICKS_WORKSPACE_TOKEN_VALUE\"] = add_your_token_here"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Force to use dbfs if the notebook is running on Databricks\n",
"if is_databricks() and not DATA_STORE_PATH.startswith(\"dbfs:\"):\n",
" DATA_STORE_PATH = f\"dbfs:/{DATA_STORE_PATH.lstrip('/')}\""
" os.environ[\"DATABRICKS_WORKSPACE_TOKEN_VALUE\"] = DATABRICKS_WORKSPACE_TOKEN_VALUE"
]
},
{
@@ -229,8 +235,23 @@
"metadata": {},
"outputs": [],
"source": [
"# Get an authentication credential to access Azure resources and register features\n",
"if USE_CLI_AUTH:\n",
" !az login --use-device-code"
" # Use AZ CLI interactive browser authentication\n",
" !az login --use-device-code\n",
" from azure.identity import AzureCliCredential\n",
" credential = AzureCliCredential(additionally_allowed_tenants=['*'],)\n",
"elif \"AZURE_TENANT_ID\" in os.environ and \"AZURE_CLIENT_ID\" in os.environ and \"AZURE_CLIENT_SECRET\" in os.environ:\n",
" # Use Environment variable secret\n",
" from azure.identity import EnvironmentCredential\n",
" credential = EnvironmentCredential()\n",
"else:\n",
" # Try to use the default credential\n",
" from azure.identity import DefaultAzureCredential\n",
" credential = DefaultAzureCredential(\n",
" exclude_interactive_browser_credential=False,\n",
" additionally_allowed_tenants=['*'],\n",
" )"
]
},
{
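The new authentication cell builds a single credential object up front and reuses it for the feature registry and Key Vault, instead of logging in separately later. A condensed sketch of the same fallback order, using only the azure-identity classes that appear in the cell:

# Sketch: pick an Azure credential the same way the updated cell does.
import os

def get_credential(use_cli_auth: bool = False):
    if use_cli_auth:
        # Assumes `az login --use-device-code` has already been run in the notebook.
        from azure.identity import AzureCliCredential
        return AzureCliCredential(additionally_allowed_tenants=["*"])
    if all(k in os.environ for k in ("AZURE_TENANT_ID", "AZURE_CLIENT_ID", "AZURE_CLIENT_SECRET")):
        # A service principal provided through environment variables.
        from azure.identity import EnvironmentCredential
        return EnvironmentCredential()
    # Fall back to the default credential chain (managed identity, VS Code, browser, ...).
    from azure.identity import DefaultAzureCredential
    return DefaultAzureCredential(
        exclude_interactive_browser_credential=False,
        additionally_allowed_tenants=["*"],
    )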
"source": [
"# Redis password\n",
"if 'REDIS_PASSWORD' not in os.environ:\n",
" # Try to get all the required credentials from Azure Key Vault\n",
" from azure.identity import AzureCliCredential, DefaultAzureCredential \n",
" from azure.keyvault.secrets import SecretClient\n",
"\n",
" vault_url = f\"https://{RESOURCE_PREFIX}kv.vault.azure.net\"\n",
" if USE_CLI_AUTH:\n",
" credential = AzureCliCredential(additionally_allowed_tenants=['*'],)\n",
" else:\n",
" credential = DefaultAzureCredential(\n",
" exclude_interactive_browser_credential=False,\n",
" additionally_allowed_tenants=['*'],\n",
" )\n",
" secret_client = SecretClient(vault_url=vault_url, credential=credential)\n",
" retrieved_secret = secret_client.get_secret('FEATHR-ONLINE-STORE-CONN').value\n",
" os.environ['REDIS_PASSWORD'] = retrieved_secret.split(\",\")[1].split(\"password=\", 1)[1]\n"
" os.environ['REDIS_PASSWORD'] = retrieved_secret.split(\",\")[1].split(\"password=\", 1)[1]"
]
},
{
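With the shared credential available, the Redis cell shrinks to just the Key Vault lookup. A sketch of that lookup, assuming the secret FEATHR-ONLINE-STORE-CONN holds a connection string of the form host:port,password=...,ssl=True, which is what the parsing in the cell implies:

# Sketch: fetch the Feathr online store (Redis) password from Azure Key Vault.
from azure.keyvault.secrets import SecretClient

def get_redis_password(resource_prefix: str, credential) -> str:
    vault_url = f"https://{resource_prefix}kv.vault.azure.net"
    secret_client = SecretClient(vault_url=vault_url, credential=credential)
    conn_str = secret_client.get_secret("FEATHR-ONLINE-STORE-CONN").value
    # "<host>:<port>,password=<password>,ssl=True" -> take the value after "password=".
    return conn_str.split(",")[1].split("password=", 1)[1]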
@@ -296,7 +307,7 @@
" spark_config__spark_cluster=SPARK_CLUSTER,\n",
" spark_config__azure_synapse__dev_url=AZURE_SYNAPSE_URL,\n",
" spark_config__azure_synapse__pool_name=AZURE_SYNAPSE_SPARK_POOL,\n",
" spark_config__databricks__workspace_instance_url=DATABRICKS_URL,\n",
" spark_config__databricks__workspace_instance_url=SPARK_CONFIG__DATABRICKS__WORKSPACE_INSTANCE_URL,\n",
" databricks_cluster_id=DATABRICKS_CLUSTER_ID,\n",
" )\n",
"\n",
@@ -338,7 +349,7 @@
},
"outputs": [],
"source": [
"client = FeathrClient(config_path=config_path)"
"client = FeathrClient(config_path=config_path, credential=credential)"
]
},
{
@@ -377,6 +388,19 @@
"# Else, you must already have a spark session object available in databricks or synapse notebooks."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Use dbfs if the notebook is running on Databricks\n",
"if is_databricks():\n",
" WORKING_DIR = f\"/dbfs/{PROJECT_NAME}\"\n",
"else:\n",
" WORKING_DIR = PROJECT_NAME"
]
},
{
"cell_type": "code",
"execution_count": null,
@@ -390,10 +414,9 @@
},
"outputs": [],
"source": [
"DATA_FILE_PATH = str(Path(DATA_STORE_PATH, \"nyc_taxi.csv\"))\n",
"\n",
"# Download the data file\n",
"df_raw = nyc_taxi.get_spark_df(spark=spark, local_cache_path=DATA_FILE_PATH)\n",
"data_file_path = f\"{WORKING_DIR}/nyc_taxi_data.csv\"\n",
"df_raw = nyc_taxi.get_spark_df(spark=spark, local_cache_path=data_file_path)\n",
"df_raw.limit(5).toPandas()"
]
},
@@ -510,14 +533,16 @@
"metadata": {},
"outputs": [],
"source": [
"# Define data source path\n",
"if client.spark_runtime == \"local\" or (client.spark_runtime == \"databricks\" and is_databricks()):\n",
"# Upload files to cloud if needed\n",
"if client.spark_runtime == \"local\":\n",
" # In local mode, we can use the same data path as the source.\n",
" # If the notebook is running on databricks, DATA_FILE_PATH should be already a dbfs path.\n",
" data_source_path = DATA_FILE_PATH\n",
" data_source_path = data_file_path\n",
"elif client.spark_runtime == \"databricks\" and is_databricks():\n",
" # If the notebook is running on databricks, we can use the same data path as the source.\n",
" data_source_path = data_file_path.replace(\"/dbfs\", \"dbfs:\")\n",
"else:\n",
" # Otherwise, upload the local file to the cloud storage (either dbfs or adls).\n",
" data_source_path = client.feathr_spark_launcher.upload_or_get_cloud_path(DATA_FILE_PATH) "
" data_source_path = client.feathr_spark_launcher.upload_or_get_cloud_path(data_file_path) "
]
},
{
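The reworked cell replaces the earlier DATA_STORE_PATH/dbfs juggling with a single decision on where the Spark job should read the data from. A sketch of that decision, assuming the upload_or_get_cloud_path method on client.feathr_spark_launcher shown in the diff:

# Sketch: choose the data source path for the Spark job, uploading only when necessary.
def resolve_data_source_path(client, data_file_path: str, on_databricks: bool) -> str:
    if client.spark_runtime == "local":
        # Local Spark can read the locally cached file directly.
        return data_file_path
    if client.spark_runtime == "databricks" and on_databricks:
        # On Databricks the /dbfs fuse mount maps to a dbfs: URI.
        return data_file_path.replace("/dbfs", "dbfs:", 1)
    # Otherwise upload the local file to cloud storage (dbfs or adls) first.
    return client.feathr_spark_launcher.upload_or_get_cloud_path(data_file_path)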
@@ -535,8 +560,8 @@
" name=\"nycTaxiBatchSource\",\n",
" path=data_source_path,\n",
" event_timestamp_column=TIMESTAMP_COL,\n",
" preprocessing=preprocessing,\n",
" timestamp_format=TIMESTAMP_FORMAT,\n",
" preprocessing=preprocessing,\n",
")"
]
},
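Only the argument order of the source definition changed, but it is a convenient place to spell the call out in full. A sketch assuming the HdfsSource keyword arguments used in the cell; the timestamp column and format values are placeholders, since the real TIMESTAMP_COL and TIMESTAMP_FORMAT constants are defined in a part of the notebook that is collapsed here:

# Sketch: define the NYC taxi batch source with an optional preprocessing hook.
from feathr import HdfsSource

def make_batch_source(data_source_path: str, preprocessing=None) -> HdfsSource:
    return HdfsSource(
        name="nycTaxiBatchSource",
        path=data_source_path,
        event_timestamp_column="lpep_dropoff_datetime",  # placeholder for TIMESTAMP_COL
        timestamp_format="yyyy-MM-dd HH:mm:ss",          # placeholder for TIMESTAMP_FORMAT
        preprocessing=preprocessing,
    )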
@@ -713,8 +738,7 @@
"metadata": {},
"outputs": [],
"source": [
"DATA_FORMAT = \"parquet\"\n",
"offline_features_path = str(Path(DATA_STORE_PATH, \"feathr_output\", f\"features.{DATA_FORMAT}\"))"
"DATA_FORMAT = \"parquet\""
]
},
{
@@ -748,7 +772,7 @@
" execution_configurations=SparkExecutionConfiguration({\n",
" \"spark.feathr.outputFormat\": DATA_FORMAT,\n",
" }),\n",
" output_path=offline_features_path,\n",
" output_path=data_source_path.rpartition(\"/\")[0] + f\"/features.{DATA_FORMAT}\",\n",
")\n",
"\n",
"client.wait_job_to_finish(timeout_sec=5000)"
" spark=spark,\n",
" client=client,\n",
" data_format=DATA_FORMAT,\n",
" res_url=offline_features_path,\n",
")\n",
"df.select(feature_names).limit(5).toPandas()"
]
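This is the API change the PR is named after: get_offline_features now writes next to the data source, and get_result_df reads the output back without an explicit res_url. A rough sketch of the round trip, assuming the keyword arguments visible in the diff and the get_result_df import path used by the Feathr samples:

# Sketch: run the offline feature join and load the result into a DataFrame.
from feathr.utils.job_utils import get_result_df

def run_offline_features(client, spark, observation_settings, feature_query,
                         data_source_path: str, data_format: str = "parquet"):
    output_path = data_source_path.rpartition("/")[0] + f"/features.{data_format}"
    client.get_offline_features(
        observation_settings=observation_settings,
        feature_query=feature_query,
        output_path=output_path,
    )
    client.wait_job_to_finish(timeout_sec=5000)
    # With the updated API, no res_url is needed; the output location is resolved from the client.
    return get_result_df(spark=spark, client=client, data_format=data_format)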
@@ -937,9 +960,8 @@
"if REGISTER_FEATURES:\n",
" try:\n",
" client.register_features()\n",
" except KeyError:\n",
" # TODO temporarily go around the \"Already exists\" error\n",
" pass \n",
" except Exception as e:\n",
" print(e) \n",
" print(client.list_registered_features(project_name=PROJECT_NAME))\n",
" # You can get the actual features too by calling client.get_features_from_registry(PROJECT_NAME)"
]
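Registration now prints whatever the registry raises instead of silently swallowing a KeyError, which makes "already exists" conflicts visible. A small sketch of that pattern:

# Sketch: register features and report, rather than hide, any registry error.
def register_and_list(client, project_name: str):
    try:
        client.register_features()
    except Exception as exc:  # e.g. the features were already registered
        print(f"Feature registration failed or was skipped: {exc}")
    return client.list_registered_features(project_name=project_name)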
@@ -1072,6 +1094,17 @@
" spark.stop()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Cleaning up the output files. CAUTION: this maybe dangerous if you \"reused\" the project name.\n",
"import shutil\n",
"shutil.rmtree(WORKING_DIR, ignore_errors=False)"
]
},
{
"cell_type": "markdown",
"metadata": {},
@@ -1107,7 +1140,7 @@
},
"celltoolbar": "Tags",
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"display_name": "feathr",
"language": "python",
"name": "python3"
},
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.4"
"version": "3.10.8 (main, Nov 24 2022, 14:13:03) [GCC 11.2.0]"
},
"vscode": {
"interpreter": {