Update sample notebooks (fraud detection and recommendation examples) to use latest feathr api #921

Merged
Changes from all commits
Commits
23 commits
889f291
Update notebooks to use latest codes with extra notebook dependencies
loomlike Dec 7, 2022
62d7509
wip. Remove azure cli package from extra dependencies
loomlike Dec 7, 2022
83fc0ca
Update fraud detection demo notebook and add test
loomlike Dec 13, 2022
5e6a6bc
WIP debugging
loomlike Dec 13, 2022
c19b79f
Merge branch 'main' into jumin/update_get_result_df_examples
loomlike Dec 13, 2022
711090f
Update notebooks
loomlike Dec 13, 2022
2c0eac1
modify notebook test to go-around materialization issue
loomlike Dec 13, 2022
9531554
Change notebook parameter name to align with client argument
loomlike Dec 14, 2022
0c3d7ab
Update recommendation notebook
loomlike Dec 15, 2022
62e42c0
Update synapse example. Add azure-cli dependency to notebook dependen…
loomlike Dec 17, 2022
b13733b
Update data url constants to point the source github repo's raw files
loomlike Dec 19, 2022
bb0372d
add dataset url constants to init.py
loomlike Dec 19, 2022
94507ad
Update feature embedding notebook to use the original dataset from az…
loomlike Dec 19, 2022
a318cb4
Add recommendation sample notebook test
loomlike Dec 23, 2022
783c658
Merge branch 'main' into jumin/update_get_result_df_examples
loomlike Dec 27, 2022
c473374
Fix numpy.bool deprecation error
loomlike Dec 27, 2022
2044323
Change databricks cluster node size from Dv2 to DSv2
loomlike Dec 28, 2022
7b91b3e
Use Dv4 for databricks notebook test due to the limit of Dv2 quota at…
loomlike Dec 28, 2022
bad967d
Fix to use the supported vm size
loomlike Dec 28, 2022
73f60de
pin numpy to resolve conflict with pyspark
loomlike Dec 29, 2022
c1f791a
Add document intelligence sample notebook
loomlike Jan 3, 2023
4795914
Update fraud detection sample
loomlike Jan 5, 2023
9752e93
Merge branch 'main' into jumin/update_get_result_df_examples
loomlike Jan 5, 2023
725 changes: 367 additions & 358 deletions docs/samples/azure_synapse/product_recommendation_demo.ipynb

Large diffs are not rendered by default.

@@ -70,8 +70,8 @@
},
"outputs": [],
"source": [
"# Install feathr from the latest codes in the repo. You may use `pip install feathr` as well.\n",
"!pip install \"git+https://github.com/feathr-ai/feathr#subdirectory=feathr_project\""
"# Install feathr from the latest codes in the repo. You may use `pip install feathr[notebook]` as well.\n",
"!pip install \"git+https://github.com/feathr-ai/feathr.git#subdirectory=feathr_project&egg=feathr[notebook]\""
]
},
{
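The updated install cell pulls in the notebook extras (feathr[notebook]) instead of the bare package, so the extra dependencies the samples need are installed as well. As a rough sketch of the same two options in plain Python — the PyPI extra name is taken from the cell above, and the upstream GitHub URL is an assumption:

# Sketch: install feathr with the notebook extras, from the repo's latest code or from PyPI.
import subprocess
import sys

REPO_SPEC = "git+https://github.com/feathr-ai/feathr.git#subdirectory=feathr_project&egg=feathr[notebook]"  # assumed upstream URL
RELEASE_SPEC = "feathr[notebook]"

def install_feathr(from_repo: bool = False) -> None:
    """Install feathr[notebook] from the repo's latest code or the latest release."""
    spec = REPO_SPEC if from_repo else RELEASE_SPEC
    subprocess.check_call([sys.executable, "-m", "pip", "install", spec])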
1,249 changes: 1,249 additions & 0 deletions docs/samples/document_intelligence_with_azure_cognitive_search_skills.ipynb

Large diffs are not rendered by default.

1,634 changes: 832 additions & 802 deletions docs/samples/feature_embedding.ipynb

Large diffs are not rendered by default.

2,302 changes: 1,279 additions & 1,023 deletions docs/samples/fraud_detection_demo.ipynb

Large diffs are not rendered by default.

141 changes: 87 additions & 54 deletions docs/samples/nyc_taxi_demo.ipynb
@@ -63,14 +63,27 @@
"source": [
"## 1. Install Feathr and Necessary Dependancies\n",
"\n",
"Install feathr and necessary packages by running `pip install feathr[notebook]` if you haven't installed them already."
"Install feathr and necessary packages by running one of following commends if you haven't installed them already:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# To install feathr from the latest codes in the repo:\n",
"# !pip install \"git+https://github.com/feathr-ai/feathr.git#subdirectory=feathr_project&egg=feathr[notebook]\" \n",
"\n",
"# To install the latest release:\n",
"# !pip install feathr[notebook] "
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"%load_ext autoreload\n",
"%autoreload 2"
@@ -92,7 +105,6 @@
"from datetime import timedelta\n",
"import os\n",
"from pathlib import Path\n",
"from tempfile import TemporaryDirectory\n",
"\n",
"from pyspark.ml import Pipeline\n",
"from pyspark.ml.evaluation import RegressionEvaluator\n",
@@ -148,21 +160,26 @@
"outputs": [],
"source": [
"RESOURCE_PREFIX = None # TODO fill the value used to deploy the resources via ARM template\n",
"PROJECT_NAME = \"feathr_getting_started\"\n",
"PROJECT_NAME = \"nyc_taxi\"\n",
"\n",
"# Currently support: 'azure_synapse', 'databricks', and 'local' \n",
"SPARK_CLUSTER = \"local\"\n",
"\n",
"# TODO fill values to use databricks cluster:\n",
"DATABRICKS_CLUSTER_ID = None # Set Databricks cluster id to use an existing cluster\n",
"DATABRICKS_URL = None # Set Databricks workspace url to use databricks\n",
"DATABRICKS_CLUSTER_ID = None # Set Databricks cluster id to use an existing cluster\n",
"if is_databricks():\n",
" # If this notebook is running on Databricks, its context can be used to retrieve token and instance URL\n",
" ctx = dbutils.notebook.entry_point.getDbutils().notebook().getContext()\n",
" DATABRICKS_WORKSPACE_TOKEN_VALUE = ctx.apiToken().get()\n",
" SPARK_CONFIG__DATABRICKS__WORKSPACE_INSTANCE_URL = f\"https://{ctx.tags().get('browserHostName').get()}\"\n",
"else:\n",
" DATABRICKS_WORKSPACE_TOKEN_VALUE = None # Set Databricks workspace token to use databricks\n",
" SPARK_CONFIG__DATABRICKS__WORKSPACE_INSTANCE_URL = None # Set Databricks workspace url to use databricks\n",
"\n",
"# TODO fill values to use Azure Synapse cluster:\n",
"AZURE_SYNAPSE_SPARK_POOL = None # Set Azure Synapse Spark pool name\n",
"AZURE_SYNAPSE_URL = None # Set Azure Synapse workspace url to use Azure Synapse\n",
"\n",
"# Data store root path. Could be a local file system path, dbfs or Azure storage path like abfs or wasbs\n",
"DATA_STORE_PATH = TemporaryDirectory().name\n",
"AZURE_SYNAPSE_URL = None # Set Azure Synapse workspace url to use Azure Synapse\n",
"ADLS_KEY = None # Set Azure Data Lake Storage key to use Azure Synapse\n",
"\n",
"# An existing Feathr config file path. If None, we'll generate a new config based on the constants in this cell.\n",
"FEATHR_CONFIG_PATH = None\n",
@@ -207,20 +224,9 @@
"outputs": [],
"source": [
"if SPARK_CLUSTER == \"azure_synapse\" and not os.environ.get(\"ADLS_KEY\"):\n",
" os.environ[\"ADLS_KEY\"] = add_your_key_here\n",
" os.environ[\"ADLS_KEY\"] = ADLS_KEY\n",
"elif SPARK_CLUSTER == \"databricks\" and not os.environ.get(\"DATABRICKS_WORKSPACE_TOKEN_VALUE\"):\n",
" os.environ[\"DATABRICKS_WORKSPACE_TOKEN_VALUE\"] = add_your_token_here"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Force to use dbfs if the notebook is running on Databricks\n",
"if is_databricks() and not DATA_STORE_PATH.startswith(\"dbfs:\"):\n",
" DATA_STORE_PATH = f\"dbfs:/{DATA_STORE_PATH.lstrip('/')}\""
" os.environ[\"DATABRICKS_WORKSPACE_TOKEN_VALUE\"] = DATABRICKS_WORKSPACE_TOKEN_VALUE"
]
},
{
@@ -229,8 +235,23 @@
"metadata": {},
"outputs": [],
"source": [
"# Get an authentication credential to access Azure resources and register features\n",
"if USE_CLI_AUTH:\n",
" !az login --use-device-code"
" # Use AZ CLI interactive browser authentication\n",
" !az login --use-device-code\n",
" from azure.identity import AzureCliCredential\n",
" credential = AzureCliCredential(additionally_allowed_tenants=['*'],)\n",
"elif \"AZURE_TENANT_ID\" in os.environ and \"AZURE_CLIENT_ID\" in os.environ and \"AZURE_CLIENT_SECRET\" in os.environ:\n",
" # Use Environment variable secret\n",
" from azure.identity import EnvironmentCredential\n",
" credential = EnvironmentCredential()\n",
"else:\n",
" # Try to use the default credential\n",
" from azure.identity import DefaultAzureCredential\n",
" credential = DefaultAzureCredential(\n",
" exclude_interactive_browser_credential=False,\n",
" additionally_allowed_tenants=['*'],\n",
" )"
]
},
{
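The new authentication cell builds a single credential object up front and reuses it for the feature registry and Key Vault, instead of logging in separately later. A condensed sketch of the same fallback order, using only the azure-identity classes that appear in the cell:

# Sketch: pick an Azure credential the same way the updated cell does.
import os

def get_credential(use_cli_auth: bool = False):
    if use_cli_auth:
        # Assumes `az login --use-device-code` has already been run in the notebook.
        from azure.identity import AzureCliCredential
        return AzureCliCredential(additionally_allowed_tenants=["*"])
    if all(k in os.environ for k in ("AZURE_TENANT_ID", "AZURE_CLIENT_ID", "AZURE_CLIENT_SECRET")):
        # A service principal provided through environment variables.
        from azure.identity import EnvironmentCredential
        return EnvironmentCredential()
    # Fall back to the default credential chain (managed identity, VS Code, browser, ...).
    from azure.identity import DefaultAzureCredential
    return DefaultAzureCredential(
        exclude_interactive_browser_credential=False,
        additionally_allowed_tenants=["*"],
    )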
"source": [
"# Redis password\n",
"if 'REDIS_PASSWORD' not in os.environ:\n",
" # Try to get all the required credentials from Azure Key Vault\n",
" from azure.identity import AzureCliCredential, DefaultAzureCredential \n",
" from azure.keyvault.secrets import SecretClient\n",
"\n",
" vault_url = f\"https://{RESOURCE_PREFIX}kv.vault.azure.net\"\n",
" if USE_CLI_AUTH:\n",
" credential = AzureCliCredential(additionally_allowed_tenants=['*'],)\n",
" else:\n",
" credential = DefaultAzureCredential(\n",
" exclude_interactive_browser_credential=False,\n",
" additionally_allowed_tenants=['*'],\n",
" )\n",
" secret_client = SecretClient(vault_url=vault_url, credential=credential)\n",
" retrieved_secret = secret_client.get_secret('FEATHR-ONLINE-STORE-CONN').value\n",
" os.environ['REDIS_PASSWORD'] = retrieved_secret.split(\",\")[1].split(\"password=\", 1)[1]\n"
" os.environ['REDIS_PASSWORD'] = retrieved_secret.split(\",\")[1].split(\"password=\", 1)[1]"
]
},
{
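With the shared credential available, the Redis cell shrinks to just the Key Vault lookup. A sketch of that lookup, assuming the secret FEATHR-ONLINE-STORE-CONN holds a connection string of the form host:port,password=...,ssl=True, which is what the parsing in the cell implies:

# Sketch: fetch the Feathr online store (Redis) password from Azure Key Vault.
from azure.keyvault.secrets import SecretClient

def get_redis_password(resource_prefix: str, credential) -> str:
    vault_url = f"https://{resource_prefix}kv.vault.azure.net"
    secret_client = SecretClient(vault_url=vault_url, credential=credential)
    conn_str = secret_client.get_secret("FEATHR-ONLINE-STORE-CONN").value
    # "<host>:<port>,password=<password>,ssl=True" -> take the value after "password=".
    return conn_str.split(",")[1].split("password=", 1)[1]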
@@ -296,7 +307,7 @@
" spark_config__spark_cluster=SPARK_CLUSTER,\n",
" spark_config__azure_synapse__dev_url=AZURE_SYNAPSE_URL,\n",
" spark_config__azure_synapse__pool_name=AZURE_SYNAPSE_SPARK_POOL,\n",
" spark_config__databricks__workspace_instance_url=DATABRICKS_URL,\n",
" spark_config__databricks__workspace_instance_url=SPARK_CONFIG__DATABRICKS__WORKSPACE_INSTANCE_URL,\n",
" databricks_cluster_id=DATABRICKS_CLUSTER_ID,\n",
" )\n",
"\n",
@@ -338,7 +349,7 @@
},
"outputs": [],
"source": [
"client = FeathrClient(config_path=config_path)"
"client = FeathrClient(config_path=config_path, credential=credential)"
]
},
{
@@ -377,6 +388,19 @@
"# Else, you must already have a spark session object available in databricks or synapse notebooks."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Use dbfs if the notebook is running on Databricks\n",
"if is_databricks():\n",
" WORKING_DIR = f\"/dbfs/{PROJECT_NAME}\"\n",
"else:\n",
" WORKING_DIR = PROJECT_NAME"
]
},
{
"cell_type": "code",
"execution_count": null,
@@ -390,10 +414,9 @@
},
"outputs": [],
"source": [
"DATA_FILE_PATH = str(Path(DATA_STORE_PATH, \"nyc_taxi.csv\"))\n",
"\n",
"# Download the data file\n",
"df_raw = nyc_taxi.get_spark_df(spark=spark, local_cache_path=DATA_FILE_PATH)\n",
"data_file_path = f\"{WORKING_DIR}/nyc_taxi_data.csv\"\n",
"df_raw = nyc_taxi.get_spark_df(spark=spark, local_cache_path=data_file_path)\n",
"df_raw.limit(5).toPandas()"
]
},
@@ -510,14 +533,16 @@
"metadata": {},
"outputs": [],
"source": [
"# Define data source path\n",
"if client.spark_runtime == \"local\" or (client.spark_runtime == \"databricks\" and is_databricks()):\n",
"# Upload files to cloud if needed\n",
"if client.spark_runtime == \"local\":\n",
" # In local mode, we can use the same data path as the source.\n",
" # If the notebook is running on databricks, DATA_FILE_PATH should be already a dbfs path.\n",
" data_source_path = DATA_FILE_PATH\n",
" data_source_path = data_file_path\n",
"elif client.spark_runtime == \"databricks\" and is_databricks():\n",
" # If the notebook is running on databricks, we can use the same data path as the source.\n",
" data_source_path = data_file_path.replace(\"/dbfs\", \"dbfs:\")\n",
"else:\n",
" # Otherwise, upload the local file to the cloud storage (either dbfs or adls).\n",
" data_source_path = client.feathr_spark_launcher.upload_or_get_cloud_path(DATA_FILE_PATH) "
" data_source_path = client.feathr_spark_launcher.upload_or_get_cloud_path(data_file_path) "
]
},
{
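The reworked cell replaces the earlier DATA_STORE_PATH/dbfs juggling with a single decision on where the Spark job should read the data from. A sketch of that decision, assuming the upload_or_get_cloud_path method on client.feathr_spark_launcher shown in the diff:

# Sketch: choose the data source path for the Spark job, uploading only when necessary.
def resolve_data_source_path(client, data_file_path: str, on_databricks: bool) -> str:
    if client.spark_runtime == "local":
        # Local Spark can read the locally cached file directly.
        return data_file_path
    if client.spark_runtime == "databricks" and on_databricks:
        # On Databricks the /dbfs fuse mount maps to a dbfs: URI.
        return data_file_path.replace("/dbfs", "dbfs:", 1)
    # Otherwise upload the local file to cloud storage (dbfs or adls) first.
    return client.feathr_spark_launcher.upload_or_get_cloud_path(data_file_path)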
@@ -535,8 +560,8 @@
" name=\"nycTaxiBatchSource\",\n",
" path=data_source_path,\n",
" event_timestamp_column=TIMESTAMP_COL,\n",
" preprocessing=preprocessing,\n",
" timestamp_format=TIMESTAMP_FORMAT,\n",
" preprocessing=preprocessing,\n",
")"
]
},
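Only the argument order of the source definition changed, but it is a convenient place to spell the call out in full. A sketch assuming the HdfsSource keyword arguments used in the cell; the timestamp column and format values are placeholders, since the real TIMESTAMP_COL and TIMESTAMP_FORMAT constants are defined in a part of the notebook that is collapsed here:

# Sketch: define the NYC taxi batch source with an optional preprocessing hook.
from feathr import HdfsSource

def make_batch_source(data_source_path: str, preprocessing=None) -> HdfsSource:
    return HdfsSource(
        name="nycTaxiBatchSource",
        path=data_source_path,
        event_timestamp_column="lpep_dropoff_datetime",  # placeholder for TIMESTAMP_COL
        timestamp_format="yyyy-MM-dd HH:mm:ss",          # placeholder for TIMESTAMP_FORMAT
        preprocessing=preprocessing,
    )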
@@ -713,8 +738,7 @@
"metadata": {},
"outputs": [],
"source": [
"DATA_FORMAT = \"parquet\"\n",
"offline_features_path = str(Path(DATA_STORE_PATH, \"feathr_output\", f\"features.{DATA_FORMAT}\"))"
"DATA_FORMAT = \"parquet\""
]
},
{
@@ -748,7 +772,7 @@
" execution_configurations=SparkExecutionConfiguration({\n",
" \"spark.feathr.outputFormat\": DATA_FORMAT,\n",
" }),\n",
" output_path=offline_features_path,\n",
" output_path=data_source_path.rpartition(\"/\")[0] + f\"/features.{DATA_FORMAT}\",\n",
")\n",
"\n",
"client.wait_job_to_finish(timeout_sec=5000)"
" spark=spark,\n",
" client=client,\n",
" data_format=DATA_FORMAT,\n",
" res_url=offline_features_path,\n",
")\n",
"df.select(feature_names).limit(5).toPandas()"
]
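This is the API change the PR is named after: get_offline_features now writes next to the data source, and get_result_df reads the output back without an explicit res_url. A rough sketch of the round trip, assuming the keyword arguments visible in the diff and the get_result_df import path used by the Feathr samples:

# Sketch: run the offline feature join and load the result into a DataFrame.
from feathr.utils.job_utils import get_result_df

def run_offline_features(client, spark, observation_settings, feature_query,
                         data_source_path: str, data_format: str = "parquet"):
    output_path = data_source_path.rpartition("/")[0] + f"/features.{data_format}"
    client.get_offline_features(
        observation_settings=observation_settings,
        feature_query=feature_query,
        output_path=output_path,
    )
    client.wait_job_to_finish(timeout_sec=5000)
    # With the updated API, no res_url is needed; the output location is resolved from the client.
    return get_result_df(spark=spark, client=client, data_format=data_format)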
@@ -937,9 +960,8 @@
"if REGISTER_FEATURES:\n",
" try:\n",
" client.register_features()\n",
" except KeyError:\n",
" # TODO temporarily go around the \"Already exists\" error\n",
" pass \n",
" except Exception as e:\n",
" print(e) \n",
" print(client.list_registered_features(project_name=PROJECT_NAME))\n",
" # You can get the actual features too by calling client.get_features_from_registry(PROJECT_NAME)"
]
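Registration now prints whatever the registry raises instead of silently swallowing a KeyError, which makes "already exists" conflicts visible. A small sketch of that pattern:

# Sketch: register features and report, rather than hide, any registry error.
def register_and_list(client, project_name: str):
    try:
        client.register_features()
    except Exception as exc:  # e.g. the features were already registered
        print(f"Feature registration failed or was skipped: {exc}")
    return client.list_registered_features(project_name=project_name)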
@@ -1072,6 +1094,17 @@
" spark.stop()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Cleaning up the output files. CAUTION: this maybe dangerous if you \"reused\" the project name.\n",
"import shutil\n",
"shutil.rmtree(WORKING_DIR, ignore_errors=False)"
]
},
{
"cell_type": "markdown",
"metadata": {},
@@ -1107,7 +1140,7 @@
},
"celltoolbar": "Tags",
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"display_name": "feathr",
"language": "python",
"name": "python3"
},
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.4"
"version": "3.10.8 (main, Nov 24 2022, 14:13:03) [GCC 11.2.0]"
},
"vscode": {
"interpreter": {