Commit
update notebook samples
serena-ruan committed May 19, 2021
1 parent a931672 commit c497742
Showing 4 changed files with 31 additions and 435 deletions.
132 changes: 27 additions & 105 deletions notebooks/samples/Cognitive Services - Overview.ipynb
@@ -348,9 +348,9 @@
},
{
"source": [
"## Azure Cognitive search - Creating a searchable Art Database with The MET's open-access collection sample\n",
"## Azure Cognitive search sample\n",
"\n",
"In this example, we show how you can enrich data using Cognitive Skills and write to an Azure Search Index using MMLSpark. We use a subset of The MET's open-access collection and enrich it by passing it through 'Describe Image' and a custom 'Image Similarity' skill. The results are then written to a searchable index."
"In this example, we show how you can enrich data using Cognitive Skills and write to an Azure Search Index using MMLSpark."
],
"cell_type": "markdown",
"metadata": {}
@@ -361,115 +361,37 @@
"metadata": {},
"outputs": [],
"source": [
"import os, sys, time, json, requests\n",
"from pyspark.ml import Transformer, Estimator, Pipeline\n",
"from pyspark.ml.feature import SQLTransformer\n",
"from pyspark.sql.functions import lit, udf, col, split"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# import os, sys, time, json, requests\n",
"# from pyspark.ml import Transformer, Estimator, Pipeline\n",
"# from pyspark.ml.feature import SQLTransformer\n",
"# from pyspark.sql.functions import lit, udf, col, split\n",
"from mmlspark.cognitive import *\n",
"\n",
"VISION_API_KEY = os.environ['VISION_API_KEY']\n",
"AZURE_SEARCH_KEY = os.environ['AZURE_SEARCH_KEY']\n",
"search_service = \"mmlspark-azure-search\"\n",
"search_index = \"test\""
]
},
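The cell above assumes `VISION_API_KEY` and `AZURE_SEARCH_KEY` already exist in the environment; a minimal sketch of providing them before the notebook runs (the placeholder values are assumptions for illustration, not real keys):

```python
import os

# Placeholder values -- assumptions for illustration; substitute real keys
# before running the notebook against live services.
os.environ.setdefault("VISION_API_KEY", "<cognitive-services-key>")
os.environ.setdefault("AZURE_SEARCH_KEY", "<azure-search-admin-key>")

VISION_API_KEY = os.environ["VISION_API_KEY"]
AZURE_SEARCH_KEY = os.environ["AZURE_SEARCH_KEY"]
```

With `setdefault`, real keys already exported in the shell are never overwritten by the placeholders.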
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"data = spark.read\\\n",
" .format(\"csv\")\\\n",
" .option(\"header\", True)\\\n",
" .load(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/metartworks_sample.csv\")\\\n",
" .withColumn(\"searchAction\", lit(\"upload\"))\\\n",
" .withColumn(\"Neighbors\", split(col(\"Neighbors\"), \",\").cast(\"array<string>\"))\\\n",
" .withColumn(\"Tags\", split(col(\"Tags\"), \",\").cast(\"array<string>\"))\\\n",
" .limit(25)"
]
},
{
"source": [
"<img src=\"https://mmlspark.blob.core.windows.net/graphics/CognitiveSearchHyperscale/MetArtworkSamples.png\" width=\"800\" style=\"float: center;\"/>"
],
"cell_type": "markdown",
"metadata": {}
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from mmlspark.cognitive import AnalyzeImage\n",
"from mmlspark.stages import SelectColumns\n",
"\n",
    "# Define the pipeline\n",
"describeImage = (AnalyzeImage()\n",
" .setSubscriptionKey(VISION_API_KEY)\n",
" .setLocation(\"eastus\")\n",
" .setImageUrlCol(\"PrimaryImageUrl\")\n",
" .setOutputCol(\"RawImageDescription\")\n",
" .setErrorCol(\"Errors\")\n",
" .setVisualFeatures([\"Categories\", \"Tags\", \"Description\", \"Faces\", \"ImageType\", \"Color\", \"Adult\"])\n",
" .setConcurrency(5))\n",
"\n",
"df2 = describeImage.transform(data)\\\n",
" .select(\"*\", \"RawImageDescription.*\").drop(\"Errors\", \"RawImageDescription\")"
]
},
{
"source": [
"<img src=\"https://mmlspark.blob.core.windows.net/graphics/CognitiveSearchHyperscale/MetArtworksProcessed.png\" width=\"800\" style=\"float: center;\"/>"
],
"cell_type": "markdown",
"metadata": {}
},
{
"source": [
    "Before writing the results to a Search Index, you must define a schema that specifies the name, type, and attributes of each field in your index. See [Create a basic index in Azure Search](https://docs.microsoft.com/en-us/azure/search/search-what-is-an-index) for more information."
],
"cell_type": "markdown",
"metadata": {}
},
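Such a schema can be expressed as plain JSON. The sketch below is an assumption about what a minimal index definition for this data could look like: the field names `ObjectID`, `PrimaryImageUrl`, `Tags`, and `Neighbors` come from the columns used above, while the attribute choices are illustrative, not the notebook's actual schema:

```python
import json

# Hypothetical minimal schema -- a sketch, not the notebook's real index definition.
index_schema = {
    "name": "test",
    "fields": [
        {"name": "ObjectID", "type": "Edm.String", "key": True, "filterable": True},
        {"name": "PrimaryImageUrl", "type": "Edm.String", "searchable": False},
        {"name": "Tags", "type": "Collection(Edm.String)", "searchable": True},
        {"name": "Neighbors", "type": "Collection(Edm.String)", "searchable": False},
    ],
}

# Serialize for submission to the service's create-index endpoint.
schema_json = json.dumps(index_schema)
```

Exactly one field must be marked `"key": True`; it plays the role of `keyCol` in the write below.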
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from mmlspark.cognitive import *\n",
"df2.writeToAzureSearch(\n",
" subscriptionKey=AZURE_SEARCH_KEY,\n",
"search_index = \"test-33467690\"\n",
"\n",
"df = spark.createDataFrame([(\"upload\", \"0\", \"https://mmlspark.blob.core.windows.net/datasets/DSIR/test1.jpg\"), \n",
" (\"upload\", \"1\", \"https://mmlspark.blob.core.windows.net/datasets/DSIR/test2.jpg\")], \n",
" [\"searchAction\", \"id\", \"url\"])\n",
"\n",
"tdf = AnalyzeImage()\\\n",
" .setSubscriptionKey(VISION_API_KEY)\\\n",
" .setLocation(\"eastus\")\\\n",
" .setImageUrlCol(\"url\")\\\n",
" .setOutputCol(\"analyzed\")\\\n",
" .setErrorCol(\"errors\")\\\n",
" .setVisualFeatures([\"Categories\", \"Tags\", \"Description\", \"Faces\", \"ImageType\", \"Color\", \"Adult\"])\\\n",
" .transform(df)\\\n",
" .select(\"*\", \"analyzed.*\")\\\n",
" .drop(\"errors\", \"analyzed\")\n",
"\n",
"tdf.writeToAzureSearch(subscriptionKey=AZURE_SEARCH_KEY,\n",
" actionCol=\"searchAction\",\n",
" serviceName=search_service,\n",
" indexName=search_index,\n",
" keyCol=\"ObjectID\"\n",
")"
]
},
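`writeToAzureSearch` handles batching and upload for you; as a rough sketch of the underlying shape (an assumption about the service's documents endpoint, with the network call left commented out so the snippet has no side effects), the same two rows could be pushed directly:

```python
# Batch payload in the Azure Search "index documents" shape: each record
# carries an "@search.action" plus the index fields.
payload = {
    "value": [
        {"@search.action": "upload", "id": "0",
         "url": "https://mmlspark.blob.core.windows.net/datasets/DSIR/test1.jpg"},
        {"@search.action": "upload", "id": "1",
         "url": "https://mmlspark.blob.core.windows.net/datasets/DSIR/test2.jpg"},
    ]
}

search_service = "mmlspark-azure-search"
search_index = "test-33467690"
AZURE_SEARCH_KEY = "<azure-search-admin-key>"  # placeholder -- an assumption

url = "https://{}.search.windows.net/indexes/{}/docs/index?api-version=2019-05-06".format(
    search_service, search_index)

# import requests  # needed only if the POST below is enabled
# requests.post(url, json=payload, headers={"api-key": AZURE_SEARCH_KEY})
```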
{
"source": [
    "The Search Index can be queried using the [Azure Search REST API](https://docs.microsoft.com/rest/api/searchservice/) by sending GET or POST requests and specifying query parameters that give the criteria for selecting matching documents. For more information on querying, see [Query your Azure Search index using the REST API](https://docs.microsoft.com/en-us/rest/api/searchservice/Search-Documents)."
],
"cell_type": "markdown",
"metadata": {}
},
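A query follows the same request pattern; a minimal sketch (the key is a placeholder assumption, and the POST itself is commented out so the snippet runs without network access):

```python
search_service = "mmlspark-azure-search"
search_index = "test-33467690"
AZURE_SEARCH_KEY = "<azure-search-query-key>"  # placeholder -- an assumption

url = "https://{}.search.windows.net/indexes/{}/docs/search?api-version=2019-05-06".format(
    search_service, search_index)
body = {"search": "test", "top": 5}  # full-text query; "top" caps the result count

# import requests  # needed only if the POST below is enabled
# results = requests.post(url, json=body,
#                         headers={"api-key": AZURE_SEARCH_KEY}).json()
```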
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"url = 'https://{}.search.windows.net/indexes/{}/docs/search?api-version=2019-05-06'.format(search_service, search_index)\n",
"requests.post(url, json={\"search\": \"Glass\"}, headers = {\"api-key\": AZURE_SEARCH_KEY}).json()"
" keyCol=\"id\")"
]
},
{

This file was deleted.

10 changes: 4 additions & 6 deletions notebooks/samples/Vowpal Wabbit - Overview.ipynb
@@ -466,7 +466,7 @@
")\n",
"vw_train_data = vw_featurizer.transform(train_data)['target', 'features']\n",
"vw_test_data = vw_featurizer.transform(test_data)['target', 'features']\n",
"display(vw_train_data.limit(10).toPandas())"
"display(vw_train_data)"
]
},
{
@@ -493,9 +493,7 @@
")\n",
"\n",
    "# To reduce the number of partitions (which will affect performance), use `vw_train_data.repartition(1)`\n",
"vw_train_data_2 = vw_train_data.repartition(1)\n",
"print(vw_train_data_2.count())\n",
"vw_model = vwr.fit(vw_train_data_2.repartition(1))\n",
"vw_model = vwr.fit(vw_train_data.repartition(1))\n",
"vw_predictions = vw_model.transform(vw_test_data)\n",
"\n",
"display(vw_predictions.limit(20).toPandas())"
@@ -673,7 +671,7 @@
"metadata": {},
"outputs": [],
"source": [
"data = spark.read.format(\"json\").option(\"inferSchema\", True).load(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/vwcb_input.dsjson\")"
"data = spark.read.format(\"json\").load(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/vwcb_input.dsjson\")"
]
},
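The `.dsjson` input read above holds one JSON event per line; a sketch of inspecting a single line with the standard library (the record and its field names follow the common Decision Service layout and should be treated as an assumption, not a spec):

```python
import json

# A hypothetical one-line DSJSON contextual-bandit event (illustrative values).
line = ('{"_label_cost": -1.0, "_label_probability": 0.8, "_labelindex": 0, '
        '"a": [1, 2], "c": {"shared": {"f1": 0.5}}}')

event = json.loads(line)
cost = event["_label_cost"]          # cost of the action that was taken
prob = event["_label_probability"]   # logging policy's probability for it
```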
{
@@ -744,7 +742,7 @@
},
{
"source": [
"Buiild VowpalWabbit Contextual Bandit model and compute performance statistics."
"Build VowpalWabbit Contextual Bandit model and compute performance statistics."
],
"cell_type": "markdown",
"metadata": {}

0 comments on commit c497742
