From e15e7108f5498d8561942637937a3d1379caa312 Mon Sep 17 00:00:00 2001 From: serena-ruan Date: Thu, 6 May 2021 15:05:42 +0800 Subject: [PATCH 01/19] add LightGBM classification sample --- ...diction with LightGBM Classification.ipynb | 223 ++++++++++++++++++ 1 file changed, 223 insertions(+) create mode 100644 notebooks/samples/LightGBM - Bankruptcy Prediction with LightGBM Classification.ipynb diff --git a/notebooks/samples/LightGBM - Bankruptcy Prediction with LightGBM Classification.ipynb b/notebooks/samples/LightGBM - Bankruptcy Prediction with LightGBM Classification.ipynb new file mode 100644 index 0000000000..1d382ed2ed --- /dev/null +++ b/notebooks/samples/LightGBM - Bankruptcy Prediction with LightGBM Classification.ipynb @@ -0,0 +1,223 @@ +{ + "metadata": { + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.5" + }, + "orig_nbformat": 2, + "kernelspec": { + "name": "python385jvsc74a57bd072be13fef265c65d19cf428fd1b09dd31615eed186d1dccdebb6e555960506ee", + "display_name": "Python 3.8.5 64-bit (conda)" + } + }, + "nbformat": 4, + "nbformat_minor": 2, + "cells": [ + { + "source": [ + "## Bankruptcy Prediction with LightGBM Classification" + ], + "cell_type": "markdown", + "metadata": {} + }, + { + "source": [ + "#### Read dataset" + ], + "cell_type": "markdown", + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dataset = spark.read.format(\"csv\")\\\n", + " .option(\"header\", True)\\\n", + " .load(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/company_bankruptcy_prediction_data.csv\")\n", + "# print dataset size\n", + "print(\"records read: \" + str(dataset.count()))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# convert features to double type\n", + "from pyspark.sql.functions import col\n", + "from pyspark.sql.types import DoubleType\n", + "for colName in dataset.columns:\n", + " dataset = dataset.withColumn(colName, col(colName).cast(DoubleType()))\n", + "print(\"Schema: \")\n", + "dataset.printSchema()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dataset.show(n=3, truncate=False, vertical=True)" + ] + }, + { + "source": [ + "#### Split the dataset into train and test" + ], + "cell_type": "markdown", + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "train, test = dataset.randomSplit([0.85, 0.15], seed=1)" + ] + }, + { + "source": [ + "#### Add featurizer to convert features to vector" + ], + "cell_type": "markdown", + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from pyspark.ml.feature import VectorAssembler\n", + "feature_cols = dataset.columns[1:]\n", + "featurizer = VectorAssembler(\n", + " inputCols=feature_cols,\n", + " outputCol='features'\n", + ")\n", + "train_data = featurizer.transform(train)['Bankrupt?', 'features']\n", + "test_data = featurizer.transform(test)['Bankrupt?', 'features']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "train_data.show(10)" + ] + }, + { + "source": [ + "#### Check if the data is unbalanced" + ], + 
"cell_type": "markdown", + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "train_data.groupBy(\"Bankrupt?\").count().show()" + ] + }, + { + "source": [ + "#### Model Training" + ], + "cell_type": "markdown", + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from mmlspark.lightgbm import LightGBMClassifier\n", + "model = LightGBMClassifier(objective=\"binary\", featuresCol=\"features\", labelCol=\"Bankrupt?\", isUnbalance=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model = model.fit(train_data)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from mmlspark.lightgbm import LightGBMClassificationModel\n", + "model.saveNativeModel(\"/lgbmcmodel\")\n", + "model = LightGBMClassificationModel.loadNativeModelFromFile(\"/lgbmcmodel\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(model.getFeatureImportances())" + ] + }, + { + "source": [ + "#### Model Prediction" + ], + "cell_type": "markdown", + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "predictions = model.transform(test_data)\n", + "predictions.limit(10).toPandas()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ComputeModelStatistics(evaluationMetric=\"classification\", labelCol=\"Bankrupt?\", scoredLabelsCol=\"prediction\").transform(predictions)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from mmlspark.train import ComputeModelStatistics\n", + "metrics = ComputeModelStatistics(evaluationMetric=\"classification\", labelCol='Bankrupt?', scoredLabelsCol='prediction').transform(predictions)\n", + "display(metrics)" + ] + } + ] +} \ No newline at end of file From 5d670a3f09593052d2cf39f0e44404b449754044 Mon Sep 17 00:00:00 2001 From: serena-ruan Date: Thu, 6 May 2021 16:16:50 +0800 Subject: [PATCH 02/19] fix error --- ...kruptcy Prediction with LightGBM Classification.ipynb | 9 --------- 1 file changed, 9 deletions(-) diff --git a/notebooks/samples/LightGBM - Bankruptcy Prediction with LightGBM Classification.ipynb b/notebooks/samples/LightGBM - Bankruptcy Prediction with LightGBM Classification.ipynb index 1d382ed2ed..b85267a185 100644 --- a/notebooks/samples/LightGBM - Bankruptcy Prediction with LightGBM Classification.ipynb +++ b/notebooks/samples/LightGBM - Bankruptcy Prediction with LightGBM Classification.ipynb @@ -199,15 +199,6 @@ "predictions.limit(10).toPandas()" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "ComputeModelStatistics(evaluationMetric=\"classification\", labelCol=\"Bankrupt?\", scoredLabelsCol=\"prediction\").transform(predictions)" - ] - }, { "cell_type": "code", "execution_count": null, From aa28f118847810f5301cf47ffbe25dfd22c0a85f Mon Sep 17 00:00:00 2001 From: serena-ruan Date: Thu, 6 May 2021 17:40:52 +0800 Subject: [PATCH 03/19] add VW classification sample --- ...pal Wabbit - Heart Disease Detection.ipynb | 161 ++++++++++++++++++ 1 file changed, 161 insertions(+) create mode 100644 notebooks/samples/Vowpal Wabbit - Heart Disease Detection.ipynb diff --git 
a/notebooks/samples/Vowpal Wabbit - Heart Disease Detection.ipynb b/notebooks/samples/Vowpal Wabbit - Heart Disease Detection.ipynb new file mode 100644 index 0000000000..73cf965ad0 --- /dev/null +++ b/notebooks/samples/Vowpal Wabbit - Heart Disease Detection.ipynb @@ -0,0 +1,161 @@ +{ + "metadata": { + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": 3 + }, + "orig_nbformat": 2 + }, + "nbformat": 4, + "nbformat_minor": 2, + "cells": [ + { + "source": [ + "## Heart Disease Detection with VowalWabbit Classifier" + ], + "cell_type": "markdown", + "metadata": {} + }, + { + "source": [ + "#### Read dataset" + ], + "cell_type": "markdown", + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dataset = spark.read.format(\"csv\")\\\n", + " .option(\"header\", True)\\\n", + " .load(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/heart_disease_prediction_data.csv\")\n", + "# print dataset size\n", + "print(\"records read: \" + str(dataset.count()))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# convert features to double type\n", + "from pyspark.sql.functions import col\n", + "from pyspark.sql.types import DoubleType\n", + "for colName in dataset.columns:\n", + " dataset = dataset.withColumn(colName, col(colName).cast(DoubleType()))\n", + "print(\"Schema: \")\n", + "dataset.printSchema()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dataset.show(10, truncate=False)" + ] + }, + { + "source": [ + "#### Split the dataset into train and test" + ], + "cell_type": "markdown", + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "train, test = dataset.randomSplit([0.85, 0.15], seed=1)" + ] + }, + { + "source": [ + "#### Use VowalWabbitFeaturizer to convert data features into vector" + ], + "cell_type": "markdown", + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from mmlspark.vw import VowpalWabbitFeaturizer\n", + "featurizer = VowpalWabbitFeaturizer(inputCols=dataset.columns[:-1], outputCol=\"features\")\n", + "train_data = featurizer.transform(train)[\"target\", \"features\"]\n", + "test_data = featurizer.transform(test)[\"target\", \"features\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "train_data.groupBy(\"target\").count().show()" + ] + }, + { + "source": [ + "#### Model Training" + ], + "cell_type": "markdown", + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from mmlspark.vw import VowpalWabbitClassifier\n", + "model = VowpalWabbitClassifier(numPasses=20, labelCol=\"target\", featuresCol=\"features\").fit(train_data)" + ] + }, + { + "source": [ + "#### Model Prediction" + ], + "cell_type": "markdown", + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "predictions = model.transform(test_data)\n", + "predictions.limit(10).toPandas()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + 
"outputs": [], + "source": [ + "from mmlspark.train import ComputeModelStatistics\n", + "metrics = ComputeModelStatistics(evaluationMetric='classification', labelCol='target', scoredLabelsCol='prediction').transform(predictions)\n", + "display(metrics)" + ] + } + ] +} \ No newline at end of file From 923628fa789fee1ff2df0907ff6a176f3188aceb Mon Sep 17 00:00:00 2001 From: serena-ruan Date: Fri, 7 May 2021 12:50:49 +0800 Subject: [PATCH 04/19] add cognitive services for big data sample --- .../Cognitive Services for Big Data.ipynb | 313 ++++++++++++++++++ 1 file changed, 313 insertions(+) create mode 100644 notebooks/samples/Cognitive Services for Big Data.ipynb diff --git a/notebooks/samples/Cognitive Services for Big Data.ipynb b/notebooks/samples/Cognitive Services for Big Data.ipynb new file mode 100644 index 0000000000..babef975b8 --- /dev/null +++ b/notebooks/samples/Cognitive Services for Big Data.ipynb @@ -0,0 +1,313 @@ +{ + "metadata": { + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": 3 + }, + "orig_nbformat": 2 + }, + "nbformat": 4, + "nbformat_minor": 2, + "cells": [ + { + "source": [ + "## Prerequisites\n", + "\n", + "1. Follow the steps in [Getting started](getting-started.md) to set up your Azure Databricks and Cognitive Services environment. This tutorial shows you how to install MMLSpark and how to create your Spark cluster in Databricks.\n", + "1. After you create a new notebook in Azure Databricks, copy the **Shared code** below and paste into a new cell in your notebook.\n", + "1. Choose a service sample, below, and copy paste it into a second new cell in your notebook.\n", + "1. Replace any of the service subscription key placeholders with your own key.\n", + "1. Choose the run button (triangle icon) in the upper right corner of the cell, then select **Run Cell**.\n", + "1. View results in a table below the cell." + ], + "cell_type": "markdown", + "metadata": {} + }, + { + "source": [ + "## Shared code\n", + "\n", + "To get started, we'll need to add this code to the project:" + ], + "cell_type": "markdown", + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from mmlspark.cognitive import *\n", + "import os\n", + "\n", + "# A general Cognitive Services key for Text Analytics and Computer Vision (or use separate keys that belong to each service)\n", + "service_key = os.environ[\"TEXT_API_KEY\"]\n", + "# A Bing Search v7 subscription key\n", + "bing_search_key = os.environ[\"BING_IMAGE_SEARCH_KEY\"]\n", + "# An Anomaly Dectector subscription key\n", + "anomaly_key = os.environ[\"ANOMALY_DETECTION_KEY\"]\n", + "\n", + "# Validate the key\n", + "assert service_key != \"ADD_YOUR_SUBSCRIPION_KEY\"" + ] + }, + { + "source": [ + "## Text Analytics sample\n", + "\n", + "The [Text Analytics](../text-analytics/index.yml) service provides several algorithms for extracting intelligent insights from text. For example, we can find the sentiment of given input text. The service will return a score between 0.0 and 1.0 where low scores indicate negative sentiment and high score indicates positive sentiment. This sample uses three simple sentences and returns the sentiment for each." 
+ ],
+ "cell_type": "markdown",
+ "metadata": {}
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from pyspark.sql.functions import col\n",
+ "\n",
+ "# Create a dataframe that's tied to its column names\n",
+ "df = spark.createDataFrame([\n",
+ " (\"I am so happy today, its sunny!\", \"en-US\"),\n",
+ " (\"I am frustrated by this rush hour traffic\", \"en-US\"),\n",
+ " (\"The cognitive services on spark aint bad\", \"en-US\"),\n",
+ "], [\"text\", \"language\"])\n",
+ "\n",
+ "# Run the Text Analytics service with options\n",
+ "sentiment = (TextSentiment()\n",
+ " .setTextCol(\"text\")\n",
+ " .setLocation(\"eastus\")\n",
+ " .setSubscriptionKey(service_key)\n",
+ " .setOutputCol(\"sentiment\")\n",
+ " .setErrorCol(\"error\")\n",
+ " .setLanguageCol(\"language\"))\n",
+ "\n",
+ "# Show the results of your text query in a table format\n",
+ "display(sentiment.transform(df).select(\"text\", col(\"sentiment\")[0].getItem(\"sentiment\").alias(\"sentiment\")))"
+ ]
+ },
+ {
+ "source": [
+ "## Computer Vision sample\n",
+ "\n",
+ "[Computer Vision](../computer-vision/index.yml) analyzes images to identify structure such as faces, objects, and natural-language descriptions. In this sample, we tag a list of images. Tags are one-word descriptions of things in the image like recognizable objects, people, scenery, and actions."
+ ],
+ "cell_type": "markdown",
+ "metadata": {}
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Create a dataframe with the image URLs\n",
+ "df = spark.createDataFrame([\n",
+ " (\"https://raw.githubusercontent.com/Azure-Samples/cognitive-services-sample-data-files/master/ComputerVision/Images/objects.jpg\", ),\n",
+ " (\"https://raw.githubusercontent.com/Azure-Samples/cognitive-services-sample-data-files/master/ComputerVision/Images/dog.jpg\", ),\n",
+ " (\"https://raw.githubusercontent.com/Azure-Samples/cognitive-services-sample-data-files/master/ComputerVision/Images/house.jpg\", )\n",
+ " ], [\"image\", ])\n",
+ "\n",
+ "# Run the Computer Vision service. Analyze Image extracts information from/about the images.\n",
+ "analysis = (AnalyzeImage()\n",
+ " .setLocation(\"eastus\")\n",
+ " .setSubscriptionKey(service_key)\n",
+ " .setVisualFeatures([\"Categories\",\"Color\",\"Description\",\"Faces\",\"Objects\",\"Tags\"])\n",
+ " .setOutputCol(\"analysis_results\")\n",
+ " .setImageUrlCol(\"image\")\n",
+ " .setErrorCol(\"error\"))\n",
+ "\n",
+ "# Show the results of what you wanted to pull out of the images.\n",
+ "display(analysis.transform(df).select(\"image\", \"analysis_results.description.tags\"))"
+ ]
+ },
+ {
+ "source": [
+ "## Bing Image Search sample\n",
+ "\n",
+ "[Bing Image Search](../bing-image-search/overview.md) searches the web to retrieve images related to a user's natural language query. In this sample, we use a text query that looks for images with quotes. It returns a list of image URLs that contain photos related to our query."
+ ], + "cell_type": "markdown", + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from pyspark.ml import PipelineModel\n", + "\n", + "# Number of images Bing will return per query\n", + "imgsPerBatch = 10\n", + "# A list of offsets, used to page into the search results\n", + "offsets = [(i*imgsPerBatch,) for i in range(100)]\n", + "# Since web content is our data, we create a dataframe with options on that data: offsets\n", + "bingParameters = spark.createDataFrame(offsets, [\"offset\"])\n", + "\n", + "# Run the Bing Image Search service with our text query\n", + "bingSearch = (BingImageSearch()\n", + " .setSubscriptionKey(bing_search_key)\n", + " .setOffsetCol(\"offset\")\n", + " .setQuery(\"Martin Luther King Jr. quotes\")\n", + " .setCount(imgsPerBatch)\n", + " .setOutputCol(\"images\"))\n", + "\n", + "# Transformer that extracts and flattens the richly structured output of Bing Image Search into a simple URL column\n", + "getUrls = BingImageSearch.getUrlTransformer(\"images\", \"url\")\n", + "\n", + "# This displays the full results returned, uncomment to use\n", + "# display(bingSearch.transform(bingParameters))\n", + "\n", + "# Since we have two services, they are put into a pipeline\n", + "pipeline = PipelineModel(stages=[bingSearch, getUrls])\n", + "\n", + "# Show the results of your search: image URLs\n", + "display(pipeline.transform(bingParameters))" + ] + }, + { + "source": [ + "## Speech-to-Text sample\n", + "The [Speech-to-text](../speech-service/index-speech-to-text.yml) service converts streams or files of spoken audio to text. In this sample, we transcribe two audio files. The first file is easy to understand, and the second is more challenging." + ], + "cell_type": "markdown", + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Create a dataframe with our audio URLs, tied to the column called \"url\"\n", + "df = spark.createDataFrame([(\"https://mmlspark.blob.core.windows.net/datasets/Speech/audio2.wav\",)\n", + " ], [\"url\"])\n", + "\n", + "# Run the Speech-to-text service to translate the audio into text\n", + "speech_to_text = (SpeechToTextSDK()\n", + " .setSubscriptionKey(service_key)\n", + " .setLocation(\"eastus\")\n", + " .setOutputCol(\"text\")\n", + " .setAudioDataCol(\"url\")\n", + " .setLanguage(\"en-US\")\n", + " .setProfanity(\"Masked\"))\n", + "\n", + "# Show the results of the translation\n", + "display(speech_to_text.transform(df).select(\"url\", \"text.DisplayText\"))" + ] + }, + { + "source": [ + "## Anomaly Detector sample\n", + "\n", + "[Anomaly Detector](../anomaly-detector/index.yml) is great for detecting irregularities in your time series data. In this sample, we use the service to find anomalies in the entire time series." 
+ ],
+ "cell_type": "markdown",
+ "metadata": {}
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from pyspark.sql.functions import lit\n",
+ "\n",
+ "# Create a dataframe with the point data that Anomaly Detector requires\n",
+ "df = spark.createDataFrame([\n",
+ " (\"1972-01-01T00:00:00Z\", 826.0),\n",
+ " (\"1972-02-01T00:00:00Z\", 799.0),\n",
+ " (\"1972-03-01T00:00:00Z\", 890.0),\n",
+ " (\"1972-04-01T00:00:00Z\", 900.0),\n",
+ " (\"1972-05-01T00:00:00Z\", 766.0),\n",
+ " (\"1972-06-01T00:00:00Z\", 805.0),\n",
+ " (\"1972-07-01T00:00:00Z\", 821.0),\n",
+ " (\"1972-08-01T00:00:00Z\", 20000.0),\n",
+ " (\"1972-09-01T00:00:00Z\", 883.0),\n",
+ " (\"1972-10-01T00:00:00Z\", 898.0),\n",
+ " (\"1972-11-01T00:00:00Z\", 957.0),\n",
+ " (\"1972-12-01T00:00:00Z\", 924.0),\n",
+ " (\"1973-01-01T00:00:00Z\", 881.0),\n",
+ " (\"1973-02-01T00:00:00Z\", 837.0),\n",
+ " (\"1973-03-01T00:00:00Z\", 9000.0)\n",
+ "], [\"timestamp\", \"value\"]).withColumn(\"group\", lit(\"series1\"))\n",
+ "\n",
+ "# Run the Anomaly Detector service to look for irregular data\n",
+ "anomaly_detector = (SimpleDetectAnomalies()\n",
+ " .setSubscriptionKey(anomaly_key)\n",
+ " .setLocation(\"eastus\")\n",
+ " .setTimestampCol(\"timestamp\")\n",
+ " .setValueCol(\"value\")\n",
+ " .setOutputCol(\"anomalies\")\n",
+ " .setGroupbyCol(\"group\")\n",
+ " .setGranularity(\"monthly\"))\n",
+ "\n",
+ "# Show the full results of the analysis with the anomalies marked as \"True\"\n",
+ "display(anomaly_detector.transform(df).select(\"timestamp\", \"value\", \"anomalies.isAnomaly\"))"
+ ]
+ },
+ {
+ "source": [
+ "## Arbitrary web APIs\n",
+ "\n",
+ "With HTTP on Spark, any web service can be used in your big data pipeline. In this example, we use the [World Bank API](http://api.worldbank.org/v2/country/) to get information about various countries around the world."
+ ],
+ "cell_type": "markdown",
+ "metadata": {}
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from requests import Request\n",
+ "from mmlspark.io.http import HTTPTransformer, http_udf\n",
+ "from pyspark.sql.functions import udf, col\n",
+ "\n",
+ "# Use any requests from the python requests library\n",
+ "def world_bank_request(country):\n",
+ " return Request(\"GET\", \"http://api.worldbank.org/v2/country/{}?format=json\".format(country))\n",
+ "\n",
+ "# Create a dataframe that specifies which countries we want data on\n",
+ "df = (spark.createDataFrame([(\"br\",),(\"usa\",)], [\"country\"])\n",
+ " .withColumn(\"request\", http_udf(world_bank_request)(col(\"country\"))))\n",
+ "\n",
+ "# Much faster for big data because of the concurrency :)\n",
+ "client = (HTTPTransformer()\n",
+ " .setConcurrency(3)\n",
+ " .setInputCol(\"request\")\n",
+ " .setOutputCol(\"response\"))\n",
+ "\n",
+ "# Get the body of the response\n",
+ "def get_response_body(resp):\n",
+ " return resp.entity.content.decode()\n",
+ "\n",
+ "# Show the details of the country data returned\n",
+ "display(client.transform(df).select(\"country\", udf(get_response_body)(col(\"response\")).alias(\"response\")))"
+ ]
+ },
+ {
+ "source": [
+ "## See also\n",
+ "\n",
+ "* [Recipe: Anomaly Detection](./recipes/anomaly-detection.md)\n",
+ "* [Recipe: Art Explorer](./recipes/art-explorer.md)"
+ ],
+ "cell_type": "markdown",
+ "metadata": {}
+ }
+ ]
+}
\ No newline at end of file

From 44856d19b219307bb7a3c52f68e081cb872181d0 Mon Sep 17 00:00:00 2001
From: serena-ruan
Date: Fri, 7 May 2021 12:52:27 +0800
Subject: [PATCH 05/19] fix tiny error

---
 notebooks/samples/Cognitive Services for Big Data.ipynb | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/notebooks/samples/Cognitive Services for Big Data.ipynb b/notebooks/samples/Cognitive Services for Big Data.ipynb
index babef975b8..0765281b75 100644
--- a/notebooks/samples/Cognitive Services for Big Data.ipynb
+++ b/notebooks/samples/Cognitive Services for Big Data.ipynb
@@ -54,7 +54,7 @@
 "# A Bing Search v7 subscription key\n",
 "bing_search_key = os.environ[\"BING_IMAGE_SEARCH_KEY\"]\n",
 "# An Anomaly Detector subscription key\n",
- "anomaly_key = os.environ[\"ANOMALY_DETECTION_KEY\"]\n",
+ "anomaly_key = os.environ[\"ANOMALY_API_KEY\"]\n",
 "\n",
 "# Validate the key\n",
 "assert service_key != \"ADD_YOUR_SUBSCRIPTION_KEY\""

From 27323382f5acce258f0c7ef812999420b5e11b2b Mon Sep 17 00:00:00 2001
From: Serena
Date: Fri, 7 May 2021 16:10:00 +0800
Subject: [PATCH 06/19] update LightGBM samples

---
 ...ification.ipynb => LightGBM Samples.ipynb} | 159 +++++++++++++++++-
 1 file changed, 158 insertions(+), 1 deletion(-)
 rename notebooks/samples/{LightGBM - Bankruptcy Prediction with LightGBM Classification.ipynb => LightGBM Samples.ipynb} (50%)

diff --git a/notebooks/samples/LightGBM - Bankruptcy Prediction with LightGBM Classification.ipynb b/notebooks/samples/LightGBM Samples.ipynb
similarity index 50%
rename from notebooks/samples/LightGBM - Bankruptcy Prediction with LightGBM Classification.ipynb
rename to notebooks/samples/LightGBM Samples.ipynb
index 1d382ed2ed..2ceb66bf7d 100644
--- a/notebooks/samples/LightGBM - Bankruptcy Prediction with LightGBM Classification.ipynb
+++ b/notebooks/samples/LightGBM Samples.ipynb
@@ -23,7 +23,49 @@
 "cells": [
 {
 "source": [
- "## Bankruptcy Prediction with LightGBM Classification"
+ "# LightGBM"
 ],
 "cell_type": "markdown",
 "metadata": {}
 },
+ {
+ "source": [
"[LightGBM](https://github.com/Microsoft/LightGBM) is an open-source,\n", + "distributed, high-performance gradient boosting (GBDT, GBRT, GBM, or\n", + "MART) framework. This framework specializes in creating high-quality and\n", + "GPU enabled decision tree algorithms for ranking, classification, and\n", + "many other machine learning tasks. LightGBM is part of Microsoft's\n", + "[DMTK](http://github.com/microsoft/dmtk) project.\n", + "\n", + "### Advantages of LightGBM\n", + "\n", + "- **Composability**: LightGBM models can be incorporated into existing\n", + " SparkML Pipelines, and used for batch, streaming, and serving\n", + " workloads.\n", + "- **Performance**: LightGBM on Spark is 10-30% faster than SparkML on\n", + " the Higgs dataset, and achieves a 15% increase in AUC. [Parallel\n", + " experiments](https://github.com/Microsoft/LightGBM/blob/master/docs/Experiments.rst#parallel-experiment)\n", + " have verified that LightGBM can achieve a linear speed-up by using\n", + " multiple machines for training in specific settings.\n", + "- **Functionality**: LightGBM offers a wide array of [tunable\n", + " parameters](https://github.com/Microsoft/LightGBM/blob/master/docs/Parameters.rst),\n", + " that one can use to customize their decision tree system. LightGBM on\n", + " Spark also supports new types of problems such as quantile regression.\n", + "- **Cross platform** LightGBM on Spark is available on Spark, PySpark, and SparklyR\n", + "\n", + "LightGBM Usage:\n", + "\n", + "1. LightGBMClassifier\n", + "1. LightGBMRegressor" + ], + "cell_type": "markdown", + "metadata": {} + }, + { + "source": [ + "## Bankruptcy Prediction with LightGBM Classifier\n", + "\n", + "" ], "cell_type": "markdown", "metadata": {} @@ -209,6 +251,121 @@ "metrics = ComputeModelStatistics(evaluationMetric=\"classification\", labelCol='Bankrupt?', scoredLabelsCol='prediction').transform(predictions)\n", "display(metrics)" ] + }, + { + "source": [ + "## Quantile Regression for Drug Discovery with LightGBMRegressor\n", + "\n", + "" + ], + "cell_type": "markdown", + "metadata": {} + }, + { + "source": [ + "#### Read dataset" + ], + "cell_type": "markdown", + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "triazines = spark.read.format(\"libsvm\")\\\n", + " .load(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/triazines.scale.svmlight\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# print some basic info\n", + "print(\"records read: \" + str(triazines.count()))\n", + "print(\"Schema: \")\n", + "triazines.printSchema()\n", + "triazines.limit(10).toPandas()" + ] + }, + { + "source": [ + "#### Split dataset into train and test" + ], + "cell_type": "markdown", + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "train, test = triazines.randomSplit([0.85, 0.15], seed=1)" + ] + }, + { + "source": [ + "#### Model Training" + ], + "cell_type": "markdown", + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from mmlspark.lightgbm import LightGBMRegressor\n", + "model = LightGBMRegressor(objective='quantile',\n", + " alpha=0.2,\n", + " learningRate=0.3,\n", + " numLeaves=31).fit(train)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + 
"print(model.getFeatureImportances())" + ] + }, + { + "source": [ + "#### Model Prediction" + ], + "cell_type": "markdown", + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "scoredData = model.transform(test)\n", + "scoredData.limit(10).toPandas()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from mmlspark.train import ComputeModelStatistics\n", + "metrics = ComputeModelStatistics(evaluationMetric='regression',\n", + " labelCol='label',\n", + " scoresCol='prediction') \\\n", + " .transform(scoredData)\n", + "metrics.toPandas()" + ] } ] } \ No newline at end of file From 281e6b298f9f64d5dcdd77fe075dc6f9a00eb2e6 Mon Sep 17 00:00:00 2001 From: Serena Date: Fri, 7 May 2021 17:43:23 +0800 Subject: [PATCH 07/19] update samples --- .../Cognitive Services for Big Data.ipynb | 49 ++ notebooks/samples/LightGBM Samples.ipynb | 14 +- ...pal Wabbit - Heart Disease Detection.ipynb | 161 ----- notebooks/samples/Vowpal Wabbit Samples.ipynb | 654 ++++++++++++++++++ 4 files changed, 712 insertions(+), 166 deletions(-) delete mode 100644 notebooks/samples/Vowpal Wabbit - Heart Disease Detection.ipynb create mode 100644 notebooks/samples/Vowpal Wabbit Samples.ipynb diff --git a/notebooks/samples/Cognitive Services for Big Data.ipynb b/notebooks/samples/Cognitive Services for Big Data.ipynb index 0765281b75..7064601780 100644 --- a/notebooks/samples/Cognitive Services for Big Data.ipynb +++ b/notebooks/samples/Cognitive Services for Big Data.ipynb @@ -17,6 +17,55 @@ "nbformat": 4, "nbformat_minor": 2, "cells": [ + { + "source": [ + "\n", + "\n", + "# Cognitive Services\n", + "\n", + "[Azure Cognitive Services](https://azure.microsoft.com/en-us/services/cognitive-services/) are a suite of APIs, SDKs, and services available to help developers build intelligent applications without having direct AI or data science skills or knowledge by enabling developers to easily add cognitive features into their applications. The goal of Azure Cognitive Services is to help developers create applications that can see, hear, speak, understand, and even begin to reason. 
The catalog of services within Azure Cognitive Services can be categorized into five main pillars - Vision, Speech, Language, Web Search, and Decision.\n", + "\n", + "## Usage\n", + "\n", + "### Vision\n", + "[**Computer Vision**](https://azure.microsoft.com/en-us/services/cognitive-services/computer-vision/)\n", + "- Describe: provides description of an image in human readable language ([Scala](https://mmlspark.blob.core.windows.net/docs/1.0.0-rc3/scala/com/microsoft/ml/spark/cognitive/DescribeImage.html), [Python](https://mmlspark.blob.core.windows.net/docs/1.0.0-rc3/pyspark/mmlspark.cognitive.html#module-mmlspark.cognitive.DescribeImage))\n", + "- Analyze (color, image type, face, adult/racy content): analyzes visual features of an image ([Scala](https://mmlspark.blob.core.windows.net/docs/1.0.0-rc3/scala/com/microsoft/ml/spark/cognitive/AnalyzeImage.html), [Python](https://mmlspark.blob.core.windows.net/docs/1.0.0-rc3/pyspark/mmlspark.cognitive.html#module-mmlspark.cognitive.AnalyzeImage))\n", + "- OCR: reads text from an image ([Scala](https://mmlspark.blob.core.windows.net/docs/1.0.0-rc3/scala/com/microsoft/ml/spark/cognitive/OCR.html), [Python](https://mmlspark.blob.core.windows.net/docs/1.0.0-rc3/pyspark/mmlspark.cognitive.html#module-mmlspark.cognitive.OCR))\n", + "- Recognize Text: reads text from an image ([Scala](https://mmlspark.blob.core.windows.net/docs/1.0.0-rc3/scala/com/microsoft/ml/spark/cognitive/RecognizeText.html), [Python](https://mmlspark.blob.core.windows.net/docs/1.0.0-rc3/pyspark/mmlspark.cognitive.html#module-mmlspark.cognitive.RecognizeText))\n", + "- Thumbnail: generates a thumbnail of user-specified size from the image ([Scala](https://mmlspark.blob.core.windows.net/docs/1.0.0-rc3/scala/com/microsoft/ml/spark/cognitive/GenerateThumbnails.html), [Python](https://mmlspark.blob.core.windows.net/docs/1.0.0-rc3/pyspark/mmlspark.cognitive.html#module-mmlspark.cognitive.GenerateThumbnails))\n", + "- Recognize domain-specific content: recognizes domain-specific content (celebrity, landmark) ([Scala](https://mmlspark.blob.core.windows.net/docs/1.0.0-rc3/scala/com/microsoft/ml/spark/cognitive/RecognizeDomainSpecificContent.html), [Python](https://mmlspark.blob.core.windows.net/docs/1.0.0-rc3/pyspark/mmlspark.cognitive.html#module-mmlspark.cognitive.RecognizeDomainSpecificContent))\n", + "- Tag: identifies list of words that are relevant to the in0put image ([Scala](https://mmlspark.blob.core.windows.net/docs/1.0.0-rc3/scala/com/microsoft/ml/spark/cognitive/TagImage.html), [Python](https://mmlspark.blob.core.windows.net/docs/1.0.0-rc3/pyspark/mmlspark.cognitive.html#module-mmlspark.cognitive.TagImage))\n", + "\n", + "[**Face**](https://azure.microsoft.com/en-us/services/cognitive-services/face/)\n", + "- Detect: detects human faces in an image ([Scala](https://mmlspark.blob.core.windows.net/docs/1.0.0-rc3/scala/com/microsoft/ml/spark/cognitive/DetectFace.html), [Python](https://mmlspark.blob.core.windows.net/docs/1.0.0-rc3/pyspark/mmlspark.cognitive.html#module-mmlspark.cognitive.DetectFace))\n", + "- Verify: verifies whether two faces belong to a same person, or a face belongs to a person ([Scala](https://mmlspark.blob.core.windows.net/docs/1.0.0-rc3/scala/com/microsoft/ml/spark/cognitive/VerifyFaces.html), [Python](https://mmlspark.blob.core.windows.net/docs/1.0.0-rc3/pyspark/mmlspark.cognitive.html#module-mmlspark.cognitive.VerifyFaces))\n", + "- Identify: finds the closest matches of the specific query person face from a person group 
([Scala](https://mmlspark.blob.core.windows.net/docs/1.0.0-rc3/scala/com/microsoft/ml/spark/cognitive/IdentifyFaces.html), [Python](https://mmlspark.blob.core.windows.net/docs/1.0.0-rc3/pyspark/mmlspark.cognitive.html#module-mmlspark.cognitive.IdentifyFaces))\n", + "- Find similar: finds similar faces to the query face in a face list ([Scala](https://mmlspark.blob.core.windows.net/docs/1.0.0-rc3/scala/com/microsoft/ml/spark/cognitive/FindSimilarFace.html), [Python](https://mmlspark.blob.core.windows.net/docs/1.0.0-rc3/pyspark/mmlspark.cognitive.html#module-mmlspark.cognitive.FindSimilarFace))\n", + "- Group: divides a group of faces into disjoint groups based on similarity ([Scala](https://mmlspark.blob.core.windows.net/docs/1.0.0-rc3/scala/com/microsoft/ml/spark/cognitive/GroupFaces.html), [Python](https://mmlspark.blob.core.windows.net/docs/1.0.0-rc3/pyspark/mmlspark.cognitive.html#module-mmlspark.cognitive.GroupFaces))\n", + "\n", + "### Speech\n", + "[**Speech Services**](https://azure.microsoft.com/en-us/services/cognitive-services/speech-services/)\n", + "- Speech-to-text: transcribes audio streams ([Scala](https://mmlspark.blob.core.windows.net/docs/1.0.0-rc3/scala/com/microsoft/ml/spark/cognitive/SpeechToText.html), [Python](https://mmlspark.blob.core.windows.net/docs/1.0.0-rc3/pyspark/mmlspark.cognitive.html#module-mmlspark.cognitive.SpeechToText))\n", + "\n", + "### Language\n", + "[**Text Analytics**](https://azure.microsoft.com/en-us/services/cognitive-services/text-analytics/)\n", + "- Language detection: detects language of the input text ([Scala](https://mmlspark.blob.core.windows.net/docs/1.0.0-rc3/scala/com/microsoft/ml/spark/cognitive/LanguageDetector.html), [Python](https://mmlspark.blob.core.windows.net/docs/1.0.0-rc3/pyspark/mmlspark.cognitive.html#module-mmlspark.cognitive.LanguageDetector))\n", + "- Key phrase extraction: identifies the key talking points in the input text ([Scala](https://mmlspark.blob.core.windows.net/docs/1.0.0-rc3/scala/com/microsoft/ml/spark/cognitive/KeyPhraseExtractor.html), [Python](https://mmlspark.blob.core.windows.net/docs/1.0.0-rc3/pyspark/mmlspark.cognitive.html#module-mmlspark.cognitive.KeyPhraseExtractor))\n", + "- Named entity recognition: identifies known entities and general named entities in the input text ([Scala](https://mmlspark.blob.core.windows.net/docs/1.0.0-rc3/scala/com/microsoft/ml/spark/cognitive/NER.html), [Python](https://mmlspark.blob.core.windows.net/docs/1.0.0-rc3/pyspark/mmlspark.cognitive.html#module-mmlspark.cognitive.NER))\n", + "- Sentiment analysis: returns a score betwee 0 and 1 indicating the sentiment in the input text ([Scala](https://mmlspark.blob.core.windows.net/docs/1.0.0-rc3/scala/com/microsoft/ml/spark/cognitive/TextSentiment.html), [Python](https://mmlspark.blob.core.windows.net/docs/1.0.0-rc3/pyspark/mmlspark.cognitive.html#module-mmlspark.cognitive.TextSentiment))\n", + "\n", + "### Decision\n", + "[**Anomaly Detector**](https://azure.microsoft.com/en-us/services/cognitive-services/anomaly-detector/)\n", + "- Anomaly status of latest point: generates a model using preceding points and determines whether the latest point is anomalous ([Scala](https://mmlspark.blob.core.windows.net/docs/1.0.0-rc3/scala/com/microsoft/ml/spark/cognitive/DetectLastAnomaly.html), [Python](https://mmlspark.blob.core.windows.net/docs/1.0.0-rc3/pyspark/mmlspark.cognitive.html#module-mmlspark.cognitive.DetectLastAnomaly))\n", + "- Find anomalies: generates a model using an entire series and finds anomalies in the series 
([Scala](https://mmlspark.blob.core.windows.net/docs/1.0.0-rc3/scala/com/microsoft/ml/spark/cognitive/DetectAnomalies.html), [Python](https://mmlspark.blob.core.windows.net/docs/1.0.0-rc3/pyspark/mmlspark.cognitive.html#module-mmlspark.cognitive.DetectAnomalies))\n", + "\n", + "### Web Search\n", + "- [Bing Image search](https://azure.microsoft.com/en-us/services/cognitive-services/bing-image-search-api/) ([Scala](https://mmlspark.blob.core.windows.net/docs/1.0.0-rc3/scala/com/microsoft/ml/spark/cognitive/BingImageSearch.html), [Python](https://mmlspark.blob.core.windows.net/docs/1.0.0-rc3/pyspark/mmlspark.cognitive.html#module-mmlspark.cognitive.BingImageSearch))\n" + ], + "cell_type": "markdown", + "metadata": {} + }, { "source": [ "## Prerequisites\n", diff --git a/notebooks/samples/LightGBM Samples.ipynb b/notebooks/samples/LightGBM Samples.ipynb index 2ceb66bf7d..3deab76803 100644 --- a/notebooks/samples/LightGBM Samples.ipynb +++ b/notebooks/samples/LightGBM Samples.ipynb @@ -53,10 +53,10 @@ " Spark also supports new types of problems such as quantile regression.\n", "- **Cross platform** LightGBM on Spark is available on Spark, PySpark, and SparklyR\n", "\n", - "LightGBM Usage:\n", + "### LightGBM Usage:\n", "\n", - "1. LightGBMClassifier\n", - "1. LightGBMRegressor" + "- LightGBMClassifier\n", + "- LightGBMRegressor" ], "cell_type": "markdown", "metadata": {} @@ -65,7 +65,9 @@ "source": [ "## Bankruptcy Prediction with LightGBM Classifier\n", "\n", - "" + "\n", + "\n", + "In this example, we use LightGBM to build a classification model in order to predict bankruptcy." ], "cell_type": "markdown", "metadata": {} @@ -256,7 +258,9 @@ "source": [ "## Quantile Regression for Drug Discovery with LightGBMRegressor\n", "\n", - "" + "\n", + "\n", + "In this example, we show how to use LightGBM to build a simple regression model." 
], "cell_type": "markdown", "metadata": {} diff --git a/notebooks/samples/Vowpal Wabbit - Heart Disease Detection.ipynb b/notebooks/samples/Vowpal Wabbit - Heart Disease Detection.ipynb deleted file mode 100644 index 73cf965ad0..0000000000 --- a/notebooks/samples/Vowpal Wabbit - Heart Disease Detection.ipynb +++ /dev/null @@ -1,161 +0,0 @@ -{ - "metadata": { - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": 3 - }, - "orig_nbformat": 2 - }, - "nbformat": 4, - "nbformat_minor": 2, - "cells": [ - { - "source": [ - "## Heart Disease Detection with VowalWabbit Classifier" - ], - "cell_type": "markdown", - "metadata": {} - }, - { - "source": [ - "#### Read dataset" - ], - "cell_type": "markdown", - "metadata": {} - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dataset = spark.read.format(\"csv\")\\\n", - " .option(\"header\", True)\\\n", - " .load(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/heart_disease_prediction_data.csv\")\n", - "# print dataset size\n", - "print(\"records read: \" + str(dataset.count()))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# convert features to double type\n", - "from pyspark.sql.functions import col\n", - "from pyspark.sql.types import DoubleType\n", - "for colName in dataset.columns:\n", - " dataset = dataset.withColumn(colName, col(colName).cast(DoubleType()))\n", - "print(\"Schema: \")\n", - "dataset.printSchema()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dataset.show(10, truncate=False)" - ] - }, - { - "source": [ - "#### Split the dataset into train and test" - ], - "cell_type": "markdown", - "metadata": {} - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "train, test = dataset.randomSplit([0.85, 0.15], seed=1)" - ] - }, - { - "source": [ - "#### Use VowalWabbitFeaturizer to convert data features into vector" - ], - "cell_type": "markdown", - "metadata": {} - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from mmlspark.vw import VowpalWabbitFeaturizer\n", - "featurizer = VowpalWabbitFeaturizer(inputCols=dataset.columns[:-1], outputCol=\"features\")\n", - "train_data = featurizer.transform(train)[\"target\", \"features\"]\n", - "test_data = featurizer.transform(test)[\"target\", \"features\"]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "train_data.groupBy(\"target\").count().show()" - ] - }, - { - "source": [ - "#### Model Training" - ], - "cell_type": "markdown", - "metadata": {} - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from mmlspark.vw import VowpalWabbitClassifier\n", - "model = VowpalWabbitClassifier(numPasses=20, labelCol=\"target\", featuresCol=\"features\").fit(train_data)" - ] - }, - { - "source": [ - "#### Model Prediction" - ], - "cell_type": "markdown", - "metadata": {} - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "predictions = model.transform(test_data)\n", - "predictions.limit(10).toPandas()" - ] - }, - { - "cell_type": 
"code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from mmlspark.train import ComputeModelStatistics\n", - "metrics = ComputeModelStatistics(evaluationMetric='classification', labelCol='target', scoredLabelsCol='prediction').transform(predictions)\n", - "display(metrics)" - ] - } - ] -} \ No newline at end of file diff --git a/notebooks/samples/Vowpal Wabbit Samples.ipynb b/notebooks/samples/Vowpal Wabbit Samples.ipynb new file mode 100644 index 0000000000..6fa66bb61b --- /dev/null +++ b/notebooks/samples/Vowpal Wabbit Samples.ipynb @@ -0,0 +1,654 @@ +{ + "metadata": { + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": 3 + }, + "orig_nbformat": 2 + }, + "nbformat": 4, + "nbformat_minor": 2, + "cells": [ + { + "source": [ + "\n", + "\n", + "# VowalWabbit \n", + "\n", + "[VowpalWabbit](https://github.com/VowpalWabbit/vowpal_wabbit) (VW) is a machine learning system which\n", + "pushes the frontier of machine learning with techniques such as online, hashing, allreduce,\n", + "reductions, learning2search, active, and interactive learning. \n", + "VowpalWabbit is a popular choice in ad-tech due to it's speed and cost efficacy. \n", + "Furthermore it includes many advances in the area of reinforcement learning (e.g. contextual bandits). \n", + "\n", + "### Advantages of VowpalWabbit\n", + "\n", + "- **Composability**: VowpalWabbit models can be incorporated into existing\n", + " SparkML Pipelines, and used for batch, streaming, and serving workloads.\n", + "- **Small footprint**: VowpalWabbit memory consumption is rather small and can be controlled through '-b 18' or setNumBits method. \n", + " This determines the size of the model (e.g. 2^18 * some_constant).\n", + "- **Feature Interactions**: Feature interactions (e.g. quadratic, cubic,... terms) are created on-the-fly within the most inner\n", + " learning loop in VW.\n", + " Interactions can be specified by using the -q parameter and passing the first character of the namespaces that should be _interacted_. \n", + " The VW namespace concept is mapped to Spark using columns. The column name is used as namespace name, thus one sparse or dense Spark ML vector corresponds to the features of a single namespace. \n", + " To allow passing of multiple namespaces the VW estimator (classifier or regression) expose an additional property called _additionalFeatures_. Users can pass an array of column names.\n", + "- **Simple deployment**: all native dependencies are packaged into a single jars (including boost and zlib).\n", + "- **VowpalWabbit command line arguments**: users can pass VW command line arguments to control the learning process.\n", + "- **VowpalWabbit binary models** Users can supply an inital VowpalWabbit model to start the training which can be produced outside of \n", + " VW on Spark by invoking _setInitialModel_ and pass the model as a byte array. 
+ {
+ "source": [
+ "## Heart Disease Detection with VowpalWabbit Classifier\n",
+ "\n",
+ ""
+ ],
+ "cell_type": "markdown",
+ "metadata": {}
+ },
+ {
+ "source": [
+ "#### Read dataset"
+ ],
+ "cell_type": "markdown",
+ "metadata": {}
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dataset = spark.read.format(\"csv\")\\\n",
+ " .option(\"header\", True)\\\n",
+ " .load(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/heart_disease_prediction_data.csv\")\n",
+ "# print dataset size\n",
+ "print(\"records read: \" + str(dataset.count()))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# convert features to double type\n",
+ "from pyspark.sql.functions import col\n",
+ "from pyspark.sql.types import DoubleType\n",
+ "for colName in dataset.columns:\n",
+ " dataset = dataset.withColumn(colName, col(colName).cast(DoubleType()))\n",
+ "print(\"Schema: \")\n",
+ "dataset.printSchema()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dataset.show(10, truncate=False)"
+ ]
+ },
+ {
+ "source": [
+ "#### Split the dataset into train and test"
+ ],
+ "cell_type": "markdown",
+ "metadata": {}
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "train, test = dataset.randomSplit([0.85, 0.15], seed=1)"
+ ]
+ },
+ {
+ "source": [
+ "#### Use VowpalWabbitFeaturizer to convert data features into a vector"
+ ],
+ "cell_type": "markdown",
+ "metadata": {}
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from mmlspark.vw import VowpalWabbitFeaturizer\n",
+ "featurizer = VowpalWabbitFeaturizer(inputCols=dataset.columns[:-1], outputCol=\"features\")\n",
+ "train_data = featurizer.transform(train)[\"target\", \"features\"]\n",
+ "test_data = featurizer.transform(test)[\"target\", \"features\"]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "train_data.groupBy(\"target\").count().show()"
+ ]
+ },
+ {
+ "source": [
+ "#### Model Training"
+ ],
+ "cell_type": "markdown",
+ "metadata": {}
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from mmlspark.vw import VowpalWabbitClassifier\n",
+ "model = VowpalWabbitClassifier(numPasses=20, labelCol=\"target\", featuresCol=\"features\").fit(train_data)"
+ ]
+ },
+ {
+ "source": [
+ "#### Model Prediction"
+ ],
+ "cell_type": "markdown",
+ "metadata": {}
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "predictions = model.transform(test_data)\n",
+ "predictions.limit(10).toPandas()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from mmlspark.train import ComputeModelStatistics\n",
+ "metrics = ComputeModelStatistics(evaluationMetric='classification', labelCol='target', scoredLabelsCol='prediction').transform(predictions)\n",
+ "display(metrics)"
+ ]
+ },
+ {
+ "source": [
+ "## Adult Census with VowpalWabbitClassifier\n",
+ "\n",
+ "In this example, we predict incomes from the Adult Census dataset using the Vowpal Wabbit (VW) Classifier in MMLSpark."
+ ],
+ "cell_type": "markdown",
+ "metadata": {}
+ },
+ {
+ "source": [
+ "#### Read the dataset and split it into train & test"
+ ],
+ "cell_type": "markdown",
+ "metadata": {}
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "data = spark.read.parquet(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/AdultCensusIncome.parquet\")\n",
+ "data = data.select([\"education\", \"marital-status\", \"hours-per-week\", \"income\"])\n",
+ "train, test = data.randomSplit([0.75, 0.25], seed=123)\n",
+ "train.limit(10).toPandas()"
+ ]
+ },
+ {
+ "source": [
+ "#### Model Training\n",
+ "\n",
+ "We define a pipeline that includes feature engineering and training of a VW classifier. We use a featurizer provided by VW that hashes the feature names. Note that the classifier expects binary labels of 0 or 1, so the income category is mapped to this space before feeding the training data into the pipeline."
+ ],
+ "cell_type": "markdown",
+ "metadata": {}
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from pyspark.sql.functions import when, col\n",
+ "from pyspark.ml import Pipeline\n",
+ "from mmlspark.vw import VowpalWabbitFeaturizer, VowpalWabbitClassifier\n",
+ "\n",
+ "# Define classification label\n",
+ "train = train.withColumn(\"label\", when(col(\"income\").contains(\"<\"), 0.0).otherwise(1.0)).repartition(1).cache()\n",
+ "print(train.count())\n",
+ "\n",
+ "# Specify featurizer\n",
+ "vw_featurizer = VowpalWabbitFeaturizer(inputCols=[\"education\", \"marital-status\", \"hours-per-week\"],\n",
+ " outputCol=\"features\")\n",
+ "\n",
+ "# Define VW classification model\n",
+ "args = \"--loss_function=logistic --quiet --holdout_off\"\n",
+ "vw_model = VowpalWabbitClassifier(featuresCol=\"features\",\n",
+ " labelCol=\"label\",\n",
+ " args=args,\n",
+ " numPasses=10)\n",
+ "\n",
+ "# Create a pipeline\n",
+ "vw_pipeline = Pipeline(stages=[vw_featurizer, vw_model])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "vw_trained = vw_pipeline.fit(train)"
+ ]
+ },
+ {
+ "source": [
+ "#### Model Prediction\n",
+ "\n",
+ "After the model is trained, we apply it to predict the income of each sample in the test set."
+ ],
+ "cell_type": "markdown",
+ "metadata": {}
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Making predictions\n",
+ "test = test.withColumn(\"label\", when(col(\"income\").contains(\"<\"), 0.0).otherwise(1.0))\n",
+ "prediction = vw_trained.transform(test)\n",
+ "prediction.limit(10).toPandas()"
+ ]
+ },
+ {
+ "source": [
+ "Finally, we evaluate the model performance using the ComputeModelStatistics function, which by default computes the confusion matrix, accuracy, precision, recall, and AUC for classification models."
+ ],
+ "cell_type": "markdown",
+ "metadata": {}
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from mmlspark.train import ComputeModelStatistics\n",
+ "metrics = ComputeModelStatistics(evaluationMetric=\"classification\", \n",
+ " labelCol=\"label\", \n",
+ " scoredLabelsCol=\"prediction\").transform(prediction)\n",
+ "metrics.toPandas()"
+ ]
+ },
+ {
+ "source": [
+ "## Quantile Regression for Drug Discovery with VowpalWabbitRegressor\n",
+ "\n",
+ ""
+ ],
+ "cell_type": "markdown",
+ "metadata": {}
+ },
+ {
+ "source": [
+ "#### Read dataset"
+ ],
+ "cell_type": "markdown",
+ "metadata": {}
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "triazines = spark.read.format(\"libsvm\")\\\n",
+ " .load(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/triazines.scale.svmlight\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# print some basic info\n",
+ "print(\"records read: \" + str(triazines.count()))\n",
+ "print(\"Schema: \")\n",
+ "triazines.printSchema()\n",
+ "triazines.limit(10).toPandas()"
+ ]
+ },
+ {
+ "source": [
+ "#### Split dataset into train and test"
+ ],
+ "cell_type": "markdown",
+ "metadata": {}
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "train, test = triazines.randomSplit([0.85, 0.15], seed=1)"
+ ]
+ },
+ {
+ "source": [
+ "#### Model Training"
+ ],
+ "cell_type": "markdown",
+ "metadata": {}
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from mmlspark.vw import VowpalWabbitRegressor\n",
+ "model = (VowpalWabbitRegressor(numPasses=20, args=\"--holdout_off --loss_function quantile -q :: -l 0.1\")\n",
+ " .fit(train))"
+ ]
+ },
+ {
+ "source": [
+ "#### Model Prediction"
+ ],
+ "cell_type": "markdown",
+ "metadata": {}
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "scoredData = model.transform(test)\n",
+ "scoredData.limit(10).toPandas()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from mmlspark.train import ComputeModelStatistics\n",
+ "metrics = ComputeModelStatistics(evaluationMetric='regression',\n",
+ " labelCol='label',\n",
+ " scoresCol='prediction') \\\n",
+ " .transform(scoredData)\n",
+ "metrics.toPandas()"
+ ]
+ },
+ {
+ "source": [
+ "## Boston house price prediction with VowpalWabbitRegressor\n",
+ "\n",
+ "In this example, we show how to build a regression model with VW using the Boston house price dataset.\n",
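+ "\n",
+ "Both this section and the previous one fit a quantile loss. As a brief refresher: for a quantile level $\\tau$, the pinball loss of a prediction $\\hat{y}$ against a target $y$ is $\\tau\\,(y - \\hat{y})$ when $y \\ge \\hat{y}$ and $(1 - \\tau)\\,(\\hat{y} - y)$ otherwise, so minimizing it estimates the conditional $\\tau$-quantile rather than the conditional mean (e.g. $\\tau = 0.2$ targets the 20th percentile, as in the LightGBM quantile sample's `alpha=0.2`)."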
+ ], + "cell_type": "markdown", + "metadata": {} + }, + { + "source": [ + "#### Read dataset\n", + "\n", + "We use [*Boston house price* dataset](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_boston.html) \n", + ". \n", + "The data was collected in 1978 from Boston area and consists of 506 entries with 14 features including the value of homes. \n", + "We use `sklearn.datasets` module to download it easily, then split the set into training and testing by 75/25." + ], + "cell_type": "markdown", + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import math\n", + "from matplotlib.colors import ListedColormap, Normalize\n", + "from matplotlib.cm import get_cmap\n", + "import matplotlib.pyplot as plt\n", + "from mmlspark.train import ComputeModelStatistics\n", + "from mmlspark.vw import VowpalWabbitRegressor, VowpalWabbitFeaturizer\n", + "import numpy as np\n", + "import pandas as pd\n", + "from sklearn.datasets import load_boston" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "boston = load_boston()\n", + "\n", + "feature_cols = ['f' + str(i) for i in range(boston.data.shape[1])]\n", + "header = ['target'] + feature_cols\n", + "df = spark.createDataFrame(\n", + " pd.DataFrame(data=np.column_stack((boston.target, boston.data)), columns=header)\n", + ").repartition(1)\n", + "print(\"Dataframe has {} rows\".format(df.count()))\n", + "display(df.limit(10).toPandas())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "train_data, test_data = df.randomSplit([0.75, 0.25], seed=42)\n", + "train_data.cache()\n", + "test_data.cache()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "display(train_data.summary().toPandas())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "train_data.show(10)" + ] + }, + { + "source": [ + "Exploratory analysis: plot feature distributions over different target values." 
+ ], + "cell_type": "markdown", + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "features = train_data.columns[1:]\n", + "values = train_data.drop('target').toPandas()\n", + "ncols = 5\n", + "nrows = math.ceil(len(features) / ncols)\n", + "\n", + "yy = [r['target'] for r in train_data.select('target').collect()]\n", + "\n", + "f, axes = plt.subplots(nrows, ncols, sharey=True, figsize=(30,10))\n", + "f.tight_layout()\n", + "\n", + "for irow in range(nrows):\n", + " axes[irow][0].set_ylabel('target')\n", + " for icol in range(ncols):\n", + " try:\n", + " feat = features[irow*ncols + icol]\n", + " xx = values[feat]\n", + "\n", + " axes[irow][icol].scatter(xx, yy, s=10, alpha=0.25)\n", + " axes[irow][icol].set_xlabel(feat)\n", + " axes[irow][icol].get_yaxis().set_ticks([])\n", + " except IndexError:\n", + " f.delaxes(axes[irow][icol])" + ] + }, + { + "source": [ + "#### VW-style feature hashing" + ], + "cell_type": "markdown", + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "vw_featurizer = VowpalWabbitFeaturizer(\n", + " inputCols=feature_cols,\n", + " outputCol='features',\n", + ")\n", + "vw_train_data = vw_featurizer.transform(train_data)['target', 'features']\n", + "vw_test_data = vw_featurizer.transform(test_data)['target', 'features']\n", + "display(vw_train_data.limit(10).toPandas())" + ] + }, + { + "source": [ + "#### Model training & Prediction\n", + "\n", + "See [VW wiki](https://github.com/vowpalWabbit/vowpal_wabbit/wiki/Command-Line-Arguments) for command line arguments." + ], + "cell_type": "markdown", + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "args = \"--holdout_off --loss_function quantile -l 7 -q :: --power_t 0.7\"\n", + "vwr = VowpalWabbitRegressor(\n", + " labelCol='target',\n", + " featuresCol='features',\n", + " args=args,\n", + " numPasses=200,\n", + ")\n", + "\n", + "# To reduce number of partitions (which will effect performance), use `vw_train_data.repartition(1)`\n", + "vw_train_data_2 = vw_train_data.repartition(1).cache()\n", + "print(vw_train_data_2.count())\n", + "vw_model = vwr.fit(vw_train_data_2.repartition(1))\n", + "vw_predictions = vw_model.transform(vw_test_data)\n", + "\n", + "display(vw_predictions.limit(20).toPandas())" + ] + }, + { + "source": [ + "#### Compute Statistics & Visualization" + ], + "cell_type": "markdown", + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "metrics = ComputeModelStatistics(\n", + " evaluationMetric='regression',\n", + " labelCol='target',\n", + " scoresCol='prediction'\n", + ").transform(vw_predictions)\n", + "\n", + "vw_result = metrics.toPandas()\n", + "vw_result.insert(0, 'model', ['Vowpal Wabbit'])\n", + "display(vw_result)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "cmap = get_cmap('YlOrRd')\n", + "target = np.array(test_data.select('target').collect()).flatten()\n", + "model_preds = [(\"Vowpal Wabbit\", vw_predictions)]\n", + "\n", + "f, axe = plt.subplots(figsize=(6, 6))\n", + "f.tight_layout()\n", + "\n", + "preds = np.array(vw_predictions.select('prediction').collect()).flatten()\n", + "err = np.absolute(preds - target)\n", + "norm = Normalize()\n", + "clrs = cmap(np.asarray(norm(err)))[:, :-1]\n", + "plt.scatter(preds, target, 
s=60, c=clrs, edgecolors='#888888', alpha=0.75)\n", + "plt.plot((0, 60), (0, 60), linestyle='--', color='#888888')\n", + "axe.set_xlabel('Predicted values')\n", + "axe.set_ylabel('Actual values')\n", + "axe.set_title(\"Vowpal Wabbit\")" + ] + } + ] +} \ No newline at end of file From 74b4ceefa6d9d4651bb401a22c5209f30f6811db Mon Sep 17 00:00:00 2001 From: Serena Date: Mon, 10 May 2021 13:25:57 +0800 Subject: [PATCH 08/19] fix link --- notebooks/samples/Cognitive Services for Big Data.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/notebooks/samples/Cognitive Services for Big Data.ipynb b/notebooks/samples/Cognitive Services for Big Data.ipynb index 7064601780..e783aac9da 100644 --- a/notebooks/samples/Cognitive Services for Big Data.ipynb +++ b/notebooks/samples/Cognitive Services for Big Data.ipynb @@ -70,7 +70,7 @@ "source": [ "## Prerequisites\n", "\n", - "1. Follow the steps in [Getting started](getting-started.md) to set up your Azure Databricks and Cognitive Services environment. This tutorial shows you how to install MMLSpark and how to create your Spark cluster in Databricks.\n", + "1. Follow the steps in [Getting started](https://docs.microsoft.com/en-us/azure/cognitive-services/big-data/getting-started) to set up your Azure Databricks and Cognitive Services environment. This tutorial shows you how to install MMLSpark and how to create your Spark cluster in Databricks.\n", "1. After you create a new notebook in Azure Databricks, copy the **Shared code** below and paste into a new cell in your notebook.\n", "1. Choose a service sample, below, and copy paste it into a second new cell in your notebook.\n", "1. Replace any of the service subscription key placeholders with your own key.\n", From 26703a33283f7a28efc76777fe3f4fe58a40015a Mon Sep 17 00:00:00 2001 From: Serena Date: Tue, 11 May 2021 10:33:27 +0800 Subject: [PATCH 09/19] update comments --- notebooks/samples/Cognitive Services for Big Data.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/notebooks/samples/Cognitive Services for Big Data.ipynb b/notebooks/samples/Cognitive Services for Big Data.ipynb index e783aac9da..39884fb30b 100644 --- a/notebooks/samples/Cognitive Services for Big Data.ipynb +++ b/notebooks/samples/Cognitive Services for Big Data.ipynb @@ -229,7 +229,7 @@ { "source": [ "## Speech-to-Text sample\n", - "The [Speech-to-text](../speech-service/index-speech-to-text.yml) service converts streams or files of spoken audio to text. In this sample, we transcribe two audio files. The first file is easy to understand, and the second is more challenging." + "The [Speech-to-text](../speech-service/index-speech-to-text.yml) service converts streams or files of spoken audio to text. In this sample, we transcribe one audio file." 
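For context, the transcription cell in that sample follows the same transformer pattern as the other services; a rough sketch (setter names as used by the MMLSpark cognitive API; the region and input DataFrame are placeholders, not from this patch):

    from mmlspark.cognitive import SpeechToTextSDK

    speech_to_text = (SpeechToTextSDK()
        .setSubscriptionKey(service_key)   # key from the shared-code cell
        .setLocation("eastus")             # placeholder region
        .setAudioDataCol("url")            # column holding the audio file URL
        .setOutputCol("text")
        .setLanguage("en-US"))
    # display(speech_to_text.transform(audio_df))  # audio_df: hypothetical input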
], "cell_type": "markdown", "metadata": {} From 1b03e9af6f1f84bd0a44fc2260e9005ea2cbfda6 Mon Sep 17 00:00:00 2001 From: Serena Date: Tue, 11 May 2021 14:55:41 +0800 Subject: [PATCH 10/19] update BingImageSearch calling to fix test errors --- notebooks/samples/Cognitive Services for Big Data.ipynb | 1 + .../CognitiveServices - Celebrity Quote Analysis.ipynb | 3 ++- .../ModelInterpretation - Snow Leopard Detection.ipynb | 4 +++- 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/notebooks/samples/Cognitive Services for Big Data.ipynb b/notebooks/samples/Cognitive Services for Big Data.ipynb index 39884fb30b..c3f3c3c840 100644 --- a/notebooks/samples/Cognitive Services for Big Data.ipynb +++ b/notebooks/samples/Cognitive Services for Big Data.ipynb @@ -208,6 +208,7 @@ "# Run the Bing Image Search service with our text query\n", "bingSearch = (BingImageSearch()\n", " .setSubscriptionKey(bing_search_key)\n", + " .setUrl(\"https://api.bing.microsoft.com/v7.0/images/search\")\n", " .setOffsetCol(\"offset\")\n", " .setQuery(\"Martin Luther King Jr. quotes\")\n", " .setCount(imgsPerBatch)\n", diff --git a/notebooks/samples/CognitiveServices - Celebrity Quote Analysis.ipynb b/notebooks/samples/CognitiveServices - Celebrity Quote Analysis.ipynb index 81de9c1ed8..8028a86795 100644 --- a/notebooks/samples/CognitiveServices - Celebrity Quote Analysis.ipynb +++ b/notebooks/samples/CognitiveServices - Celebrity Quote Analysis.ipynb @@ -59,6 +59,7 @@ "\n", "bingSearch = BingImageSearch()\\\n", " .setSubscriptionKey(BING_IMAGE_SEARCH_KEY)\\\n", + " .setUrl(\"https://api.bing.microsoft.com/v7.0/images/search\")\\\n", " .setOffsetCol(\"offset\")\\\n", " .setQuery(\"celebrity quotes\")\\\n", " .setCount(imgsPerBatch)\\\n", @@ -211,4 +212,4 @@ }, "nbformat": 4, "nbformat_minor": 1 -} +} \ No newline at end of file diff --git a/notebooks/samples/ModelInterpretation - Snow Leopard Detection.ipynb b/notebooks/samples/ModelInterpretation - Snow Leopard Detection.ipynb index dda071f3e4..965d750804 100644 --- a/notebooks/samples/ModelInterpretation - Snow Leopard Detection.ipynb +++ b/notebooks/samples/ModelInterpretation - Snow Leopard Detection.ipynb @@ -48,6 +48,7 @@ " .mlTransform(\n", " BingImageSearch() # Apply Bing Image Search\n", " .setSubscriptionKey(BING_IMAGE_SEARCH_KEY) # Set the API Key\n", + " .setUrl(\"https://api.bing.microsoft.com/v7.0/images/search\") # Specify the url used to search\n", " .setOffsetCol(\"offsets\") # Specify a column containing the offsets\n", " .setQueryCol(\"queries\") # Specify a column containing the query words\n", " .setCount(10) # Specify the number of images to return per offset\n", @@ -164,6 +165,7 @@ "randomLinks = randomWords \\\n", " .mlTransform(BingImageSearch()\n", " .setSubscriptionKey(BING_IMAGE_SEARCH_KEY)\n", + " .setUrl(\"https://api.bing.microsoft.com/v7.0/images/search\")\n", " .setCount(10)\n", " .setQueryCol(\"words\")\n", " .setOutputCol(\"images\")) \\\n", @@ -338,4 +340,4 @@ }, "nbformat": 4, "nbformat_minor": 2 -} +} \ No newline at end of file From 3c37ff76dad2b32aa981acaf5e9631202b75e56a Mon Sep 17 00:00:00 2001 From: Serena Date: Wed, 12 May 2021 15:27:43 +0800 Subject: [PATCH 11/19] update samples --- ...pynb => Cognitive Services Overview.ipynb} | 135 ++++++++++- ...eServices - Celebrity Quote Analysis.ipynb | 1 - ... 
Samples.ipynb => LightGBM Overview.ipynb} | 223 ++++++++++++++++-- ...erpretation - Snow Leopard Detection.ipynb | 2 - ...les.ipynb => Vowpal Wabbit Overview.ipynb} | 70 +++--- 5 files changed, 364 insertions(+), 67 deletions(-) rename notebooks/samples/{Cognitive Services for Big Data.ipynb => Cognitive Services Overview.ipynb} (81%) rename notebooks/samples/{LightGBM Samples.ipynb => LightGBM Overview.ipynb} (58%) rename notebooks/samples/{Vowpal Wabbit Samples.ipynb => Vowpal Wabbit Overview.ipynb} (92%) diff --git a/notebooks/samples/Cognitive Services for Big Data.ipynb b/notebooks/samples/Cognitive Services Overview.ipynb similarity index 81% rename from notebooks/samples/Cognitive Services for Big Data.ipynb rename to notebooks/samples/Cognitive Services Overview.ipynb index c3f3c3c840..5e76897714 100644 --- a/notebooks/samples/Cognitive Services for Big Data.ipynb +++ b/notebooks/samples/Cognitive Services Overview.ipynb @@ -61,7 +61,8 @@ "- Find anomalies: generates a model using an entire series and finds anomalies in the series ([Scala](https://mmlspark.blob.core.windows.net/docs/1.0.0-rc3/scala/com/microsoft/ml/spark/cognitive/DetectAnomalies.html), [Python](https://mmlspark.blob.core.windows.net/docs/1.0.0-rc3/pyspark/mmlspark.cognitive.html#module-mmlspark.cognitive.DetectAnomalies))\n", "\n", "### Web Search\n", - "- [Bing Image search](https://azure.microsoft.com/en-us/services/cognitive-services/bing-image-search-api/) ([Scala](https://mmlspark.blob.core.windows.net/docs/1.0.0-rc3/scala/com/microsoft/ml/spark/cognitive/BingImageSearch.html), [Python](https://mmlspark.blob.core.windows.net/docs/1.0.0-rc3/pyspark/mmlspark.cognitive.html#module-mmlspark.cognitive.BingImageSearch))\n" + "- [Bing Image search](https://azure.microsoft.com/en-us/services/cognitive-services/bing-image-search-api/) ([Scala](https://mmlspark.blob.core.windows.net/docs/1.0.0-rc3/scala/com/microsoft/ml/spark/cognitive/BingImageSearch.html), [Python](https://mmlspark.blob.core.windows.net/docs/1.0.0-rc3/pyspark/mmlspark.cognitive.html#module-mmlspark.cognitive.BingImageSearch))\n", + "- [Azure Cognitive search](https://docs.microsoft.com/en-us/azure/search/search-what-is-azure-search)\n" ], "cell_type": "markdown", "metadata": {} @@ -103,10 +104,7 @@ "# A Bing Search v7 subscription key\n", "bing_search_key = os.environ[\"BING_IMAGE_SEARCH_KEY\"]\n", "# An Anomaly Dectector subscription key\n", - "anomaly_key = os.environ[\"ANOMALY_API_KEY\"]\n", - "\n", - "# Validate the key\n", - "assert service_key != \"ADD_YOUR_SUBSCRIPION_KEY\"" + "anomaly_key = os.environ[\"ANOMALY_API_KEY\"]" ] }, { @@ -208,7 +206,6 @@ "# Run the Bing Image Search service with our text query\n", "bingSearch = (BingImageSearch()\n", " .setSubscriptionKey(bing_search_key)\n", - " .setUrl(\"https://api.bing.microsoft.com/v7.0/images/search\")\n", " .setOffsetCol(\"offset\")\n", " .setQuery(\"Martin Luther King Jr. quotes\")\n", " .setCount(imgsPerBatch)\n", @@ -349,6 +346,132 @@ "display(client.transform(df).select(\"country\", udf(get_response_body)(col(\"response\")).alias(\"response\")))" ] }, + { + "source": [ + "## Azure Cognitive search - Creating a searchable Art Database with The MET's open-access collection sample\n", + "\n", + "In this example, we show how you can enrich data using Cognitive Skills and write to an Azure Search Index using MMLSpark. We use a subset of The MET's open-access collection and enrich it by passing it through 'Describe Image' and a custom 'Image Similarity' skill. 
The results are then written to a searchable index." + ], + "cell_type": "markdown", + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os, sys, time, json, requests\n", + "from pyspark.ml import Transformer, Estimator, Pipeline\n", + "from pyspark.ml.feature import SQLTransformer\n", + "from pyspark.sql.functions import lit, udf, col, split" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "VISION_API_KEY = os.environ['VISION_API_KEY']\n", + "AZURE_SEARCH_KEY = os.environ['AZURE_SEARCH_KEY']\n", + "search_service = \"mmlspark-azure-search\"\n", + "search_index = \"test\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "data = spark.read\\\n", + " .format(\"csv\")\\\n", + " .option(\"header\", True)\\\n", + " .load(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/metartworks_sample.csv\")\\\n", + " .withColumn(\"searchAction\", lit(\"upload\"))\\\n", + " .withColumn(\"Neighbors\", split(col(\"Neighbors\"), \",\").cast(\"array\"))\\\n", + " .withColumn(\"Tags\", split(col(\"Tags\"), \",\").cast(\"array\"))\\\n", + " .limit(25)" + ] + }, + { + "source": [ + "" + ], + "cell_type": "markdown", + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from mmlspark.cognitive import AnalyzeImage\n", + "from mmlspark.stages import SelectColumns\n", + "\n", + "#define pipeline\n", + "describeImage = (AnalyzeImage()\n", + " .setSubscriptionKey(VISION_API_KEY)\n", + " .setLocation(\"eastus\")\n", + " .setImageUrlCol(\"PrimaryImageUrl\")\n", + " .setOutputCol(\"RawImageDescription\")\n", + " .setErrorCol(\"Errors\")\n", + " .setVisualFeatures([\"Categories\", \"Tags\", \"Description\", \"Faces\", \"ImageType\", \"Color\", \"Adult\"])\n", + " .setConcurrency(5))\n", + "\n", + "df2 = describeImage.transform(data)\\\n", + " .select(\"*\", \"RawImageDescription.*\").drop(\"Errors\", \"RawImageDescription\")" + ] + }, + { + "source": [ + "" + ], + "cell_type": "markdown", + "metadata": {} + }, + { + "source": [ + "Before writing the results to a Search Index, you must define a schema which must specify the name, type, and attributes of each field in your index. Refer [Create a basic index in Azure Search](https://docs.microsoft.com/en-us/azure/search/search-what-is-an-index) for more information." + ], + "cell_type": "markdown", + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from mmlspark.cognitive import *\n", + "df2.writeToAzureSearch(\n", + " subscriptionKey=AZURE_SEARCH_KEY,\n", + " actionCol=\"searchAction\",\n", + " serviceName=search_service,\n", + " indexName=search_index,\n", + " keyCol=\"ObjectID\"\n", + ")" + ] + }, + { + "source": [ + "The Search Index can be queried using the [Azure Search REST API](https://docs.microsoft.com/rest/api/searchservice/) by sending GET or POST requests and specifying query parameters that give the criteria for selecting matching documents. 
For more information on querying, see [Query your Azure Search index using the REST API](https://docs.microsoft.com/en-us/rest/api/searchservice/Search-Documents)"
+ ],
+ "cell_type": "markdown",
+ "metadata": {}
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "url = 'https://{}.search.windows.net/indexes/{}/docs/search?api-version=2019-05-06'.format(search_service, search_index)\n",
+ "requests.post(url, json={\"search\": \"Glass\"}, headers = {\"api-key\": AZURE_SEARCH_KEY}).json()"
+ ]
+ },
 {
 "source": [
 "## See also\n",
diff --git a/notebooks/samples/CognitiveServices - Celebrity Quote Analysis.ipynb b/notebooks/samples/CognitiveServices - Celebrity Quote Analysis.ipynb
index 8028a86795..c9f40d6807 100644
--- a/notebooks/samples/CognitiveServices - Celebrity Quote Analysis.ipynb
+++ b/notebooks/samples/CognitiveServices - Celebrity Quote Analysis.ipynb
@@ -59,7 +59,6 @@
 "\n",
 "bingSearch = BingImageSearch()\\\n",
 " .setSubscriptionKey(BING_IMAGE_SEARCH_KEY)\\\n",
- " .setUrl(\"https://api.bing.microsoft.com/v7.0/images/search\")\\\n",
 " .setOffsetCol(\"offset\")\\\n",
 " .setQuery(\"celebrity quotes\")\\\n",
 " .setCount(imgsPerBatch)\\\n",
diff --git a/notebooks/samples/LightGBM Samples.ipynb b/notebooks/samples/LightGBM Overview.ipynb
similarity index 58%
rename from notebooks/samples/LightGBM Samples.ipynb
rename to notebooks/samples/LightGBM Overview.ipynb
index 3deab76803..d978981eb8 100644
--- a/notebooks/samples/LightGBM Samples.ipynb
+++ b/notebooks/samples/LightGBM Overview.ipynb
@@ -55,8 +55,9 @@
 "\n",
 "### LightGBM Usage:\n",
 "\n",
- "- LightGBMClassifier\n",
- "- LightGBMRegressor"
+ "- LightGBMClassifier: used for building classification models. For example, to predict whether a company will go bankrupt or not, we could build a binary classification model with LightGBMClassifier.\n",
+ "- LightGBMRegressor: used for building regression models. For example, to predict house prices, we could build a regression model with LightGBMRegressor.\n",
+ "- LightGBMRanker: used for building ranking models. For example, to predict the relevance of website search results, we could build a ranking model with LightGBMRanker."
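All three estimators follow the standard Spark ML fit/transform pattern; a minimal sketch (column names are illustrative, not from the notebook):

    from mmlspark.lightgbm import LightGBMRegressor

    # e.g. a quantile regressor for the 20th percentile
    reg = LightGBMRegressor(objective="quantile", alpha=0.2,
                            labelCol="label", featuresCol="features")
    # model = reg.fit(train_df)        # train_df assumed to carry label/features
    # preds = model.transform(test_df)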
], "cell_type": "markdown", "metadata": {} @@ -85,26 +86,14 @@ "metadata": {}, "outputs": [], "source": [ - "dataset = spark.read.format(\"csv\")\\\n", + "df = spark.read.format(\"csv\")\\\n", " .option(\"header\", True)\\\n", + " .option(\"inferSchema\", True)\\\n", " .load(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/company_bankruptcy_prediction_data.csv\")\n", "# print dataset size\n", - "print(\"records read: \" + str(dataset.count()))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# convert features to double type\n", - "from pyspark.sql.functions import col\n", - "from pyspark.sql.types import DoubleType\n", - "for colName in dataset.columns:\n", - " dataset = dataset.withColumn(colName, col(colName).cast(DoubleType()))\n", + "print(\"records read: \" + str(df.count()))\n", "print(\"Schema: \")\n", - "dataset.printSchema()" + "df.printSchema()" ] }, { @@ -113,7 +102,7 @@ "metadata": {}, "outputs": [], "source": [ - "dataset.show(n=3, truncate=False, vertical=True)" + "display(df)" ] }, { @@ -129,7 +118,7 @@ "metadata": {}, "outputs": [], "source": [ - "train, test = dataset.randomSplit([0.85, 0.15], seed=1)" + "train, test = df.randomSplit([0.85, 0.15], seed=1)" ] }, { @@ -146,7 +135,7 @@ "outputs": [], "source": [ "from pyspark.ml.feature import VectorAssembler\n", - "feature_cols = dataset.columns[1:]\n", + "feature_cols = df.columns[1:]\n", "featurizer = VectorAssembler(\n", " inputCols=feature_cols,\n", " outputCol='features'\n", @@ -217,13 +206,40 @@ "model = LightGBMClassificationModel.loadNativeModelFromFile(\"/lgbmcmodel\")" ] }, + { + "source": [ + "#### Feature Importances Visualization" + ], + "cell_type": "markdown", + "metadata": {} + }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "print(model.getFeatureImportances())" + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "\n", + "feature_importances = model.getFeatureImportances()\n", + "fi = pd.Series(feature_importances,index = feature_cols)\n", + "fi = fi.sort_values(ascending = True)\n", + "f_index = fi.index\n", + "f_values = fi.values\n", + " \n", + "# print feature importances \n", + "print ('f_index:',f_index)\n", + "print ('f_values:',f_values)\n", + "\n", + "# plo\n", + "x_index = list(range(len(F2)))\n", + "x_index = [x/len(F2) for x in x_index]\n", + "plt.rcParams['figure.figsize'] = (20,20)\n", + "plt.barh(x_index,f_values,height = 0.028 ,align=\"center\",color = 'tan',tick_label=f_index)\n", + "plt.xlabel('importances')\n", + "plt.ylabel('features')\n", + "plt.show()" ] }, { @@ -292,7 +308,7 @@ "print(\"records read: \" + str(triazines.count()))\n", "print(\"Schema: \")\n", "triazines.printSchema()\n", - "triazines.limit(10).toPandas()" + "display(triazines.limit(10))" ] }, { @@ -354,7 +370,7 @@ "outputs": [], "source": [ "scoredData = model.transform(test)\n", - "scoredData.limit(10).toPandas()" + "display(scoredData)" ] }, { @@ -368,7 +384,164 @@ " labelCol='label',\n", " scoresCol='prediction') \\\n", " .transform(scoredData)\n", - "metrics.toPandas()" + "display(metrics)" + ] + }, + { + "source": [ + "## LightGBM Ranker" + ], + "cell_type": "markdown", + "metadata": {} + }, + { + "source": [ + "#### Read dataset" + ], + "cell_type": "markdown", + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from pyspark.sql.functions import monotonically_increasing_id\n", + "from pyspark.sql.functions 
import explode" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df1 = spark.read.format('libsvm') \\\n", + " .load(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/lightGBMRanker_rank_train.libsvm\") \\\n", + " .withColumn('iid', monotonically_increasing_id())\n", + "display(df1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "from pyspark.sql.types import ArrayType, LongType\n", + "def create_rows(value, index):\n", + " arr = np.zeros([value])\n", + " arr.fill(index)\n", + " return arr.astype('int').tolist()\n", + "create_rows_udf = udf(create_rows, ArrayType(LongType()))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "query_col = 'query'\n", + "label_col = 'labels'\n", + "# read in CSV file\n", + "df2 = spark.read.format('csv').option('inferSchema', True) \\\n", + " .load(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/lightGBMRanker_rank_train_query.csv\") \\\n", + " .withColumn('index', monotonically_increasing_id()) \\\n", + " .withColumn(query_col, explode(create_rows_udf('_c0', 'index'))) \\\n", + " .withColumn('iid', monotonically_increasing_id()) \\\n", + " .drop('_c0', 'index') \\\n", + " .join(df1, 'iid').drop('iid') \\\n", + " .withColumnRenamed('label', label_col)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# print some basic info\n", + "print(\"records read: \" + str(df1.count()))\n", + "print(\"Schema: \")\n", + "df2.printSchema()\n", + "display(df2.limit(10))" + ] + }, + { + "source": [ + "#### Model Training" + ], + "cell_type": "markdown", + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from mmlspark.lightgbm import LightGBMRanker\n", + "\n", + "features_col = 'features'\n", + "lgbm_ranker = LightGBMRanker(labelCol=label_col,\n", + " featuresCol=features_col,\n", + " groupCol=query_col,\n", + " predictionCol='preds',\n", + " leafPredictionCol='leafPreds',\n", + " featuresShapCol='importances',\n", + " repartitionByGroupingColumn=True,\n", + " numLeaves=32,\n", + " numIterations=200,\n", + " evalAt=[1,3,5],\n", + " metric='ndcg')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "lgbm_ranker_model = lgbm_ranker.fit(df2)" + ] + }, + { + "source": [ + "#### Model Prediction" + ], + "cell_type": "markdown", + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dt1 = spark.read.format('libsvm') \\\n", + " .load(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/lightGBMRanker_rank_test.libsvm\") \\\n", + " .withColumn('iid', monotonically_increasing_id())\n", + "dt2 = spark.read.format('csv').option('inferSchema', True) \\\n", + " .load(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/lightGBMRanker_rank_test_query.csv\") \\\n", + " .withColumn('index', monotonically_increasing_id()) \\\n", + " .withColumn(query_col, explode(create_rows_udf('_c0', 'index'))) \\\n", + " .withColumn('iid', monotonically_increasing_id()) \\\n", + " .drop('_c0', 'index') \\\n", + " .join(dt1, 'iid').drop('iid') \\\n", + " .withColumnRenamed('label', label_col)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + 
"predictions = lgbm_ranker_model.transform(dt2)\n", + "predictions.limit(10).toPandas()" ] } ] diff --git a/notebooks/samples/ModelInterpretation - Snow Leopard Detection.ipynb b/notebooks/samples/ModelInterpretation - Snow Leopard Detection.ipynb index 965d750804..097cb3dee1 100644 --- a/notebooks/samples/ModelInterpretation - Snow Leopard Detection.ipynb +++ b/notebooks/samples/ModelInterpretation - Snow Leopard Detection.ipynb @@ -48,7 +48,6 @@ " .mlTransform(\n", " BingImageSearch() # Apply Bing Image Search\n", " .setSubscriptionKey(BING_IMAGE_SEARCH_KEY) # Set the API Key\n", - " .setUrl(\"https://api.bing.microsoft.com/v7.0/images/search\") # Specify the url used to search\n", " .setOffsetCol(\"offsets\") # Specify a column containing the offsets\n", " .setQueryCol(\"queries\") # Specify a column containing the query words\n", " .setCount(10) # Specify the number of images to return per offset\n", @@ -165,7 +164,6 @@ "randomLinks = randomWords \\\n", " .mlTransform(BingImageSearch()\n", " .setSubscriptionKey(BING_IMAGE_SEARCH_KEY)\n", - " .setUrl(\"https://api.bing.microsoft.com/v7.0/images/search\")\n", " .setCount(10)\n", " .setQueryCol(\"words\")\n", " .setOutputCol(\"images\")) \\\n", diff --git a/notebooks/samples/Vowpal Wabbit Samples.ipynb b/notebooks/samples/Vowpal Wabbit Overview.ipynb similarity index 92% rename from notebooks/samples/Vowpal Wabbit Samples.ipynb rename to notebooks/samples/Vowpal Wabbit Overview.ipynb index 6fa66bb61b..a4c624073c 100644 --- a/notebooks/samples/Vowpal Wabbit Samples.ipynb +++ b/notebooks/samples/Vowpal Wabbit Overview.ipynb @@ -56,10 +56,12 @@ "- **Limited Parsing** Features implemented in the native VW parser (e.g. ngrams, skips, ...) are not yet implemented in\n", " VowpalWabbitFeaturizer.\n", "\n", - "### VowalWabbit Usage:\n", + "### VowpalWabbit Usage:\n", "\n", - "- VowalWabbitClassifier\n", - "- VowalWabbitRegressor" + "- VowpalWabbitClassifier: used to build classification models.\n", + "- VowpalWabbitRegressor: used to build regression models.\n", + "- VowpalWabbitFeaturizer: used for feature hashing and extraction. For details please visit [here](https://github.com/VowpalWabbit/vowpal_wabbit/wiki/Feature-Hashing-and-Extraction).\n", + "- VowpalWabbitContextualBandit: used to solve contextual bandits problems. For algorithm details please visit [here](https://github.com/VowpalWabbit/vowpal_wabbit/wiki/Contextual-Bandit-algorithms)." 
], "cell_type": "markdown", "metadata": {} @@ -86,26 +88,14 @@ "metadata": {}, "outputs": [], "source": [ - "dataset = spark.read.format(\"csv\")\\\n", + "df = spark.read.format(\"csv\")\\\n", " .option(\"header\", True)\\\n", + " .option(\"inferSchema\", True)\\\n", " .load(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/heart_disease_prediction_data.csv\")\n", - "# print dataset size\n", - "print(\"records read: \" + str(dataset.count()))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# convert features to double type\n", - "from pyspark.sql.functions import col\n", - "from pyspark.sql.types import DoubleType\n", - "for colName in dataset.columns:\n", - " dataset = dataset.withColumn(colName, col(colName).cast(DoubleType()))\n", + "# print dataset basic info\n", + "print(\"records read: \" + str(df.count()))\n", "print(\"Schema: \")\n", - "dataset.printSchema()" + "df.printSchema()" ] }, { @@ -114,7 +104,7 @@ "metadata": {}, "outputs": [], "source": [ - "dataset.show(10, truncate=False)" + "display(df)" ] }, { @@ -130,7 +120,7 @@ "metadata": {}, "outputs": [], "source": [ - "train, test = dataset.randomSplit([0.85, 0.15], seed=1)" + "train, test = df.randomSplit([0.85, 0.15], seed=1)" ] }, { @@ -147,7 +137,7 @@ "outputs": [], "source": [ "from mmlspark.vw import VowpalWabbitFeaturizer\n", - "featurizer = VowpalWabbitFeaturizer(inputCols=dataset.columns[:-1], outputCol=\"features\")\n", + "featurizer = VowpalWabbitFeaturizer(inputCols=df.columns[:-1], outputCol=\"features\")\n", "train_data = featurizer.transform(train)[\"target\", \"features\"]\n", "test_data = featurizer.transform(test)[\"target\", \"features\"]" ] @@ -192,7 +182,7 @@ "outputs": [], "source": [ "predictions = model.transform(test_data)\n", - "predictions.limit(10).toPandas()" + "display(predictions)" ] }, { @@ -231,7 +221,7 @@ "data = spark.read.parquet(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/AdultCensusIncome.parquet\")\n", "data = data.select([\"education\", \"marital-status\", \"hours-per-week\", \"income\"])\n", "train, test = data.randomSplit([0.75, 0.25], seed=123)\n", - "train.limit(10).toPandas()" + "display(train.limit(10))" ] }, { @@ -259,8 +249,22 @@ "\n", "# Specify featurizer\n", "vw_featurizer = VowpalWabbitFeaturizer(inputCols=[\"education\", \"marital-status\", \"hours-per-week\"],\n", - " outputCol=\"features\")\n", - "\n", + " outputCol=\"features\")" + ] + }, + { + "source": [ + "Note: \"args\" parameter lets you pass in any params not exposed through our API. Full command line argument docs can be found [here](https://github.com/VowpalWabbit/vowpal_wabbit/wiki/Command-Line-Arguments)." 
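For instance, a few commonly used VW flags that can be passed this way (illustrative only; see the linked wiki for the authoritative list):

    # --loss_function=logistic : logistic loss for binary classification
    # --link=logistic          : map raw scores through the logistic link
    # -b 22                    : use 2^22 bits for the feature hash
    # --holdout_off            : disable VW's automatic holdout with multiple passes
    example_args = "--loss_function=logistic --link=logistic -b 22 --holdout_off"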
+ ], + "cell_type": "markdown", + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ "# Define VW classification model\n", "args = \"--loss_function=logistic --quiet --holdout_off\"\n", "vw_model = VowpalWabbitClassifier(featuresCol=\"features\",\n", @@ -299,7 +303,7 @@ "# Making predictions\n", "test = test.withColumn(\"label\", when(col(\"income\").contains(\"<\"), 0.0).otherwise(1.0))\n", "prediction = vw_trained.transform(test)\n", - "prediction.limit(10).toPandas()" + "display(prediction.limit(10))" ] }, { @@ -319,7 +323,7 @@ "metrics = ComputeModelStatistics(evaluationMetric=\"classification\", \n", " labelCol=\"label\", \n", " scoredLabelsCol=\"prediction\").transform(prediction)\n", - "metrics.toPandas()" + "display(metrics)" ] }, { @@ -359,7 +363,7 @@ "print(\"records read: \" + str(triazines.count()))\n", "print(\"Schema: \")\n", "triazines.printSchema()\n", - "triazines.limit(10).toPandas()" + "display(triazines.limit(10))" ] }, { @@ -410,7 +414,7 @@ "outputs": [], "source": [ "scoredData = model.transform(test)\n", - "scoredData.limit(10).toPandas()" + "display(scoredData.limit(10))" ] }, { @@ -424,7 +428,7 @@ " labelCol='label',\n", " scoresCol='prediction') \\\n", " .transform(scoredData)\n", - "metrics.toPandas()" + "display(metrics)" ] }, { @@ -479,7 +483,7 @@ " pd.DataFrame(data=np.column_stack((boston.target, boston.data)), columns=header)\n", ").repartition(1)\n", "print(\"Dataframe has {} rows\".format(df.count()))\n", - "display(df.limit(10).toPandas())" + "display(df.limit(10))" ] }, { From cae75d9a96bb0c6ff20a059e6b72f1dd95c9cb2b Mon Sep 17 00:00:00 2001 From: Serena Date: Wed, 12 May 2021 15:52:22 +0800 Subject: [PATCH 12/19] update VW sample --- .../samples/Vowpal Wabbit Overview.ipynb | 215 +++++++++--------- 1 file changed, 108 insertions(+), 107 deletions(-) diff --git a/notebooks/samples/Vowpal Wabbit Overview.ipynb b/notebooks/samples/Vowpal Wabbit Overview.ipynb index a4c624073c..2198550461 100644 --- a/notebooks/samples/Vowpal Wabbit Overview.ipynb +++ b/notebooks/samples/Vowpal Wabbit Overview.ipynb @@ -228,7 +228,9 @@ "source": [ "#### Model Training\n", "\n", - "We define a pipeline that includes feature engineering and training of a VW classifier. We use a featurizer provided by VW that hashes the feature names. Note that VW expects classification labels being -1 or 1. Thus, the income category is mapped to this space before feeding training data into the pipeline." + "We define a pipeline that includes feature engineering and training of a VW classifier. We use a featurizer provided by VW that hashes the feature names. Note that VW expects classification labels being -1 or 1. Thus, the income category is mapped to this space before feeding training data into the pipeline.\n", + "\n", + "Note: VW supports distributed learning, and it's controlled by number of partitions of dataset." 
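Concretely, the partition count of the training DataFrame sets how many VW instances learn in parallel; a sketch assuming the `train` DataFrame above:

    train_single = train.repartition(1)  # one VW instance, sequential updates
    train_multi = train.repartition(4)   # four coordinated VW instances (assumes a multi-node cluster)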
], "cell_type": "markdown", "metadata": {} @@ -328,112 +330,7 @@ }, { "source": [ - "## Quantile Regression for Drug Discovery with VowpalWabbitRegressor\n", - "\n", - "" - ], - "cell_type": "markdown", - "metadata": {} - }, - { - "source": [ - "#### Read dataset" - ], - "cell_type": "markdown", - "metadata": {} - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "triazines = spark.read.format(\"libsvm\")\\\n", - " .load(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/triazines.scale.svmlight\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "\n", - "# print some basic info\n", - "print(\"records read: \" + str(triazines.count()))\n", - "print(\"Schema: \")\n", - "triazines.printSchema()\n", - "display(triazines.limit(10))" - ] - }, - { - "source": [ - "#### Split dataset into train and test" - ], - "cell_type": "markdown", - "metadata": {} - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "train, test = triazines.randomSplit([0.85, 0.15], seed=1)" - ] - }, - { - "source": [ - "#### Model Training" - ], - "cell_type": "markdown", - "metadata": {} - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from mmlspark.vw import VowpalWabbitRegressor\n", - "model = (VowpalWabbitRegressor(numPasses=20, args=\"--holdout_off --loss_function quantile -q :: -l 0.1\")\n", - " .fit(train))" - ] - }, - { - "source": [ - "#### Model Prediction" - ], - "cell_type": "markdown", - "metadata": {} - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "scoredData = model.transform(test)\n", - "display(scoredData.limit(10))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from mmlspark.train import ComputeModelStatistics\n", - "metrics = ComputeModelStatistics(evaluationMetric='regression',\n", - " labelCol='label',\n", - " scoresCol='prediction') \\\n", - " .transform(scoredData)\n", - "display(metrics)" - ] - }, - { - "source": [ - "## Boston house price prediction with VowpalWabbitRegressor\n", + "## Boston house price prediction with VowpalWabbitRegressor - Quantile Regression\n", "\n", "In this example, we show how to build regression model with VW using Boston's house price." 
], @@ -653,6 +550,110 @@ "axe.set_ylabel('Actual values')\n", "axe.set_title(\"Vowpal Wabbit\")" ] + }, + { + "source": [ + "## Quantile Regression for Drug Discovery with VowpalWabbitRegressor\r\n", + "\r\n", + "" + ], + "cell_type": "markdown", + "metadata": {} + }, + { + "source": [ + "#### Read dataset" + ], + "cell_type": "markdown", + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "triazines = spark.read.format(\"libsvm\")\\\n", + " .load(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/triazines.scale.svmlight\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# print some basic info\n", + "print(\"records read: \" + str(triazines.count()))\n", + "print(\"Schema: \")\n", + "triazines.printSchema()\n", + "display(triazines.limit(10))" + ] + }, + { + "source": [ + "#### Split dataset into train and test" + ], + "cell_type": "markdown", + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "train, test = triazines.randomSplit([0.85, 0.15], seed=1)" + ] + }, + { + "source": [ + "#### Model Training" + ], + "cell_type": "markdown", + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from mmlspark.vw import VowpalWabbitRegressor\n", + "model = (VowpalWabbitRegressor(numPasses=20, args=\"--holdout_off --loss_function quantile -q :: -l 0.1\")\n", + " .fit(train))" + ] + }, + { + "source": [ + "#### Model Prediction" + ], + "cell_type": "markdown", + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "scoredData = model.transform(test)\n", + "display(scoredData.limit(10))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from mmlspark.train import ComputeModelStatistics\n", + "metrics = ComputeModelStatistics(evaluationMetric='regression',\n", + " labelCol='label',\n", + " scoresCol='prediction') \\\n", + " .transform(scoredData)\n", + "display(metrics)" + ] } ] } \ No newline at end of file From dcb7014540136195f2357a34302ed286fe209acc Mon Sep 17 00:00:00 2001 From: Serena Date: Wed, 12 May 2021 18:31:22 +0800 Subject: [PATCH 13/19] fix spelling error --- notebooks/samples/LightGBM Overview.ipynb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/notebooks/samples/LightGBM Overview.ipynb b/notebooks/samples/LightGBM Overview.ipynb index d978981eb8..a3ace33bd6 100644 --- a/notebooks/samples/LightGBM Overview.ipynb +++ b/notebooks/samples/LightGBM Overview.ipynb @@ -233,8 +233,8 @@ "print ('f_values:',f_values)\n", "\n", "# plo\n", - "x_index = list(range(len(F2)))\n", - "x_index = [x/len(F2) for x in x_index]\n", + "x_index = list(range(len(fi)))\n", + "x_index = [x/len(fi) for x in x_index]\n", "plt.rcParams['figure.figsize'] = (20,20)\n", "plt.barh(x_index,f_values,height = 0.028 ,align=\"center\",color = 'tan',tick_label=f_index)\n", "plt.xlabel('importances')\n", From 38bb9342b7444aa09b14ab44b86d30a9ca644929 Mon Sep 17 00:00:00 2001 From: Serena Date: Fri, 14 May 2021 14:58:09 +0800 Subject: [PATCH 14/19] update based on comments --- ...nb => Cognitive Services - Overview.ipynb} | 4 +- ...erview.ipynb => LightGBM - Overview.ipynb} | 111 +++--------------- ...w.ipynb => Vowpal Wabbit - Overview.ipynb} | 6 +- 3 files changed, 23 
insertions(+), 98 deletions(-) rename notebooks/samples/{Cognitive Services Overview.ipynb => Cognitive Services - Overview.ipynb} (98%) rename notebooks/samples/{LightGBM Overview.ipynb => LightGBM - Overview.ipynb} (79%) rename notebooks/samples/{Vowpal Wabbit Overview.ipynb => Vowpal Wabbit - Overview.ipynb} (99%) diff --git a/notebooks/samples/Cognitive Services Overview.ipynb b/notebooks/samples/Cognitive Services - Overview.ipynb similarity index 98% rename from notebooks/samples/Cognitive Services Overview.ipynb rename to notebooks/samples/Cognitive Services - Overview.ipynb index 5e76897714..bc90555cb6 100644 --- a/notebooks/samples/Cognitive Services Overview.ipynb +++ b/notebooks/samples/Cognitive Services - Overview.ipynb @@ -60,9 +60,9 @@ "- Anomaly status of latest point: generates a model using preceding points and determines whether the latest point is anomalous ([Scala](https://mmlspark.blob.core.windows.net/docs/1.0.0-rc3/scala/com/microsoft/ml/spark/cognitive/DetectLastAnomaly.html), [Python](https://mmlspark.blob.core.windows.net/docs/1.0.0-rc3/pyspark/mmlspark.cognitive.html#module-mmlspark.cognitive.DetectLastAnomaly))\n", "- Find anomalies: generates a model using an entire series and finds anomalies in the series ([Scala](https://mmlspark.blob.core.windows.net/docs/1.0.0-rc3/scala/com/microsoft/ml/spark/cognitive/DetectAnomalies.html), [Python](https://mmlspark.blob.core.windows.net/docs/1.0.0-rc3/pyspark/mmlspark.cognitive.html#module-mmlspark.cognitive.DetectAnomalies))\n", "\n", - "### Web Search\n", + "### Search\n", "- [Bing Image search](https://azure.microsoft.com/en-us/services/cognitive-services/bing-image-search-api/) ([Scala](https://mmlspark.blob.core.windows.net/docs/1.0.0-rc3/scala/com/microsoft/ml/spark/cognitive/BingImageSearch.html), [Python](https://mmlspark.blob.core.windows.net/docs/1.0.0-rc3/pyspark/mmlspark.cognitive.html#module-mmlspark.cognitive.BingImageSearch))\n", - "- [Azure Cognitive search](https://docs.microsoft.com/en-us/azure/search/search-what-is-azure-search)\n" + "- [Azure Cognitive search](https://docs.microsoft.com/en-us/azure/search/search-what-is-azure-search) ([Scala](https://mmlspark.blob.core.windows.net/docs/1.0.0-rc3/scala/index.html#com.microsoft.ml.spark.cognitive.AzureSearchWriter$), [Python](https://mmlspark.blob.core.windows.net/docs/1.0.0-rc3/scala/index.html#com.microsoft.ml.spark.cognitive.AzureSearchWriter$))\n" ], "cell_type": "markdown", "metadata": {} diff --git a/notebooks/samples/LightGBM Overview.ipynb b/notebooks/samples/LightGBM - Overview.ipynb similarity index 79% rename from notebooks/samples/LightGBM Overview.ipynb rename to notebooks/samples/LightGBM - Overview.ipynb index a3ace33bd6..5c2d607653 100644 --- a/notebooks/samples/LightGBM Overview.ipynb +++ b/notebooks/samples/LightGBM - Overview.ipynb @@ -144,15 +144,6 @@ "test_data = featurizer.transform(test)['Bankrupt?', 'features']" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "train_data.show(10)" - ] - }, { "source": [ "#### Check if the data is unbalanced" @@ -166,7 +157,7 @@ "metadata": {}, "outputs": [], "source": [ - "train_data.groupBy(\"Bankrupt?\").count().show()" + "display(train_data.groupBy(\"Bankrupt?\").count())" ] }, { @@ -195,6 +186,13 @@ "model = model.fit(train_data)" ] }, + { + "source": [ + "By calling \"saveNativeModel\", it allows you to extract the underlying lightGBM model for fast deployment after you train on Spark." 
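Since the saved file is a standard LightGBM text model, it can also be served outside Spark; a sketch assuming the plain `lightgbm` Python package and a local copy of the file written below:

    import lightgbm as lgb

    booster = lgb.Booster(model_file="lgbmclassifier.model")
    # booster.predict(X)  # X: numpy array of raw feature values, pre-assembly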
+ ], + "cell_type": "markdown", + "metadata": {} + }, { "cell_type": "code", "execution_count": null, @@ -202,8 +200,8 @@ "outputs": [], "source": [ "from mmlspark.lightgbm import LightGBMClassificationModel\n", - "model.saveNativeModel(\"/lgbmcmodel\")\n", - "model = LightGBMClassificationModel.loadNativeModelFromFile(\"/lgbmcmodel\")" + "model.saveNativeModel(\"/lgbmclassifier.model\")\n", + "model = LightGBMClassificationModel.loadNativeModelFromFile(\"/lgbmclassifier.model\")" ] }, { @@ -232,7 +230,7 @@ "print ('f_index:',f_index)\n", "print ('f_values:',f_values)\n", "\n", - "# plo\n", + "# plot\n", "x_index = list(range(len(fi)))\n", "x_index = [x/len(fi) for x in x_index]\n", "plt.rcParams['figure.figsize'] = (20,20)\n", @@ -407,67 +405,12 @@ "metadata": {}, "outputs": [], "source": [ - "from pyspark.sql.functions import monotonically_increasing_id\n", - "from pyspark.sql.functions import explode" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df1 = spark.read.format('libsvm') \\\n", - " .load(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/lightGBMRanker_rank_train.libsvm\") \\\n", - " .withColumn('iid', monotonically_increasing_id())\n", - "display(df1)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import numpy as np\n", - "from pyspark.sql.types import ArrayType, LongType\n", - "def create_rows(value, index):\n", - " arr = np.zeros([value])\n", - " arr.fill(index)\n", - " return arr.astype('int').tolist()\n", - "create_rows_udf = udf(create_rows, ArrayType(LongType()))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "query_col = 'query'\n", - "label_col = 'labels'\n", - "# read in CSV file\n", - "df2 = spark.read.format('csv').option('inferSchema', True) \\\n", - " .load(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/lightGBMRanker_rank_train_query.csv\") \\\n", - " .withColumn('index', monotonically_increasing_id()) \\\n", - " .withColumn(query_col, explode(create_rows_udf('_c0', 'index'))) \\\n", - " .withColumn('iid', monotonically_increasing_id()) \\\n", - " .drop('_c0', 'index') \\\n", - " .join(df1, 'iid').drop('iid') \\\n", - " .withColumnRenamed('label', label_col)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ + "df = spark.read.format(\"parquet\").load(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/lightGBMRanker_train.parquet\")\n", "# print some basic info\n", - "print(\"records read: \" + str(df1.count()))\n", + "print(\"records read: \" + str(df.count()))\n", "print(\"Schema: \")\n", - "df2.printSchema()\n", - "display(df2.limit(10))" + "df.printSchema()\n", + "display(df.limit(10))" ] }, { @@ -505,7 +448,7 @@ "metadata": {}, "outputs": [], "source": [ - "lgbm_ranker_model = lgbm_ranker.fit(df2)" + "lgbm_ranker_model = lgbm_ranker.fit(df)" ] }, { @@ -521,26 +464,8 @@ "metadata": {}, "outputs": [], "source": [ - "dt1 = spark.read.format('libsvm') \\\n", - " .load(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/lightGBMRanker_rank_test.libsvm\") \\\n", - " .withColumn('iid', monotonically_increasing_id())\n", - "dt2 = spark.read.format('csv').option('inferSchema', True) \\\n", - " .load(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/lightGBMRanker_rank_test_query.csv\") \\\n", - " .withColumn('index', monotonically_increasing_id()) \\\n", - " .withColumn(query_col, 
explode(create_rows_udf('_c0', 'index'))) \\\n", - " .withColumn('iid', monotonically_increasing_id()) \\\n", - " .drop('_c0', 'index') \\\n", - " .join(dt1, 'iid').drop('iid') \\\n", - " .withColumnRenamed('label', label_col)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "predictions = lgbm_ranker_model.transform(dt2)\n", + "dt = spark.read.format(\"parquet\").load(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/lightGBMRanker_test.parquet\")\n", + "predictions = lgbm_ranker_model.transform(dt)\n", "predictions.limit(10).toPandas()" ] } diff --git a/notebooks/samples/Vowpal Wabbit Overview.ipynb b/notebooks/samples/Vowpal Wabbit - Overview.ipynb similarity index 99% rename from notebooks/samples/Vowpal Wabbit Overview.ipynb rename to notebooks/samples/Vowpal Wabbit - Overview.ipynb index 2198550461..04a8f05778 100644 --- a/notebooks/samples/Vowpal Wabbit Overview.ipynb +++ b/notebooks/samples/Vowpal Wabbit - Overview.ipynb @@ -148,7 +148,7 @@ "metadata": {}, "outputs": [], "source": [ - "train_data.groupBy(\"target\").count().show()" + "display(train_data.groupBy(\"target\").count())" ] }, { @@ -221,7 +221,7 @@ "data = spark.read.parquet(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/AdultCensusIncome.parquet\")\n", "data = data.select([\"education\", \"marital-status\", \"hours-per-week\", \"income\"])\n", "train, test = data.randomSplit([0.75, 0.25], seed=123)\n", - "display(train.limit(10))" + "display(train)" ] }, { @@ -305,7 +305,7 @@ "# Making predictions\n", "test = test.withColumn(\"label\", when(col(\"income\").contains(\"<\"), 0.0).otherwise(1.0))\n", "prediction = vw_trained.transform(test)\n", - "display(prediction.limit(10))" + "display(prediction)" ] }, { From bd7fde0646f942dea49956ffa191d75aae1b845f Mon Sep 17 00:00:00 2001 From: Serena Date: Fri, 14 May 2021 15:02:34 +0800 Subject: [PATCH 15/19] remove cache --- notebooks/samples/Vowpal Wabbit - Overview.ipynb | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/notebooks/samples/Vowpal Wabbit - Overview.ipynb b/notebooks/samples/Vowpal Wabbit - Overview.ipynb index 04a8f05778..267257035f 100644 --- a/notebooks/samples/Vowpal Wabbit - Overview.ipynb +++ b/notebooks/samples/Vowpal Wabbit - Overview.ipynb @@ -246,7 +246,7 @@ "from mmlspark.vw import VowpalWabbitFeaturizer, VowpalWabbitClassifier\n", "\n", "# Define classification label\n", - "train = train.withColumn(\"label\", when(col(\"income\").contains(\"<\"), 0.0).otherwise(1.0)).repartition(1).cache()\n", + "train = train.withColumn(\"label\", when(col(\"income\").contains(\"<\"), 0.0).otherwise(1.0)).repartition(1)\n", "print(train.count())\n", "\n", "# Specify featurizer\n", @@ -389,9 +389,7 @@ "metadata": {}, "outputs": [], "source": [ - "train_data, test_data = df.randomSplit([0.75, 0.25], seed=42)\n", - "train_data.cache()\n", - "test_data.cache()" + "train_data, test_data = df.randomSplit([0.75, 0.25], seed=42)" ] }, { @@ -495,7 +493,7 @@ ")\n", "\n", "# To reduce number of partitions (which will effect performance), use `vw_train_data.repartition(1)`\n", - "vw_train_data_2 = vw_train_data.repartition(1).cache()\n", + "vw_train_data_2 = vw_train_data.repartition(1)\n", "print(vw_train_data_2.count())\n", "vw_model = vwr.fit(vw_train_data_2.repartition(1))\n", "vw_predictions = vw_model.transform(vw_test_data)\n", From 8cae61acd0c51cd88b6977266cbc1f01ddd82bcd Mon Sep 17 00:00:00 2001 From: Serena Date: Fri, 14 May 2021 16:09:54 +0800 Subject: [PATCH 16/19] fix error 
--- notebooks/samples/LightGBM - Overview.ipynb | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/notebooks/samples/LightGBM - Overview.ipynb b/notebooks/samples/LightGBM - Overview.ipynb
index 5c2d607653..8af166b1a7 100644
--- a/notebooks/samples/LightGBM - Overview.ipynb
+++ b/notebooks/samples/LightGBM - Overview.ipynb
@@ -429,6 +429,8 @@
 "from mmlspark.lightgbm import LightGBMRanker\n",
 "\n",
 "features_col = 'features'\n",
+ "query_col = 'query'\n",
+ "label_col = 'labels'\n",
 "lgbm_ranker = LightGBMRanker(labelCol=label_col,\n",
 " featuresCol=features_col,\n",
 " groupCol=query_col,\n",

From a9316726b3edee0f689a007555f007956db716ad Mon Sep 17 00:00:00 2001
From: Serena
Date: Mon, 17 May 2021 13:15:09 +0800
Subject: [PATCH 17/19] add VW contextual bandit sample

--- .../samples/Vowpal Wabbit - Overview.ipynb | 115 ++++++++++++++++++
 1 file changed, 115 insertions(+)

diff --git a/notebooks/samples/Vowpal Wabbit - Overview.ipynb b/notebooks/samples/Vowpal Wabbit - Overview.ipynb
index 267257035f..3bfaf98bae 100644
--- a/notebooks/samples/Vowpal Wabbit - Overview.ipynb
+++ b/notebooks/samples/Vowpal Wabbit - Overview.ipynb
@@ -652,6 +652,121 @@
 " .transform(scoredData)\n",
 "display(metrics)"
 ]
+ },
+ {
+ "source": [
+ "## VW Contextual Bandit"
+ ],
+ "cell_type": "markdown",
+ "metadata": {}
+ },
+ {
+ "source": [
+ "#### Read dataset"
+ ],
+ "cell_type": "markdown",
+ "metadata": {}
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "data = spark.read.format(\"json\").option(\"inferSchema\", True).load(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/vwcb_input.dsjson\")"
+ ]
+ },
+ {
+ "source": [
+ "Note: the actions are the five TAction_x_topic columns."
+ ],
+ "cell_type": "markdown",
+ "metadata": {}
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from pyspark.sql.functions import col \n",
+ "from pyspark.sql.types import IntegerType, DoubleType \n",
+ "data = data.withColumn('GUser_id', col('c.GUser.id'))\\\n",
+ " .withColumn('GUser_major', col('c.GUser.major'))\\\n",
+ " .withColumn('GUser_hobby', col('c.GUser.hobby'))\\\n",
+ " .withColumn('GUser_favorite_character', col('c.GUser.favorite_character'))\\\n",
+ " .withColumn('TAction_0_topic', col('c._multi.TAction.topic')[0])\\\n",
+ " .withColumn('TAction_1_topic', col('c._multi.TAction.topic')[1])\\\n",
+ " .withColumn('TAction_2_topic', col('c._multi.TAction.topic')[2])\\\n",
+ " .withColumn('TAction_3_topic', col('c._multi.TAction.topic')[3])\\\n",
+ " .withColumn('TAction_4_topic', col('c._multi.TAction.topic')[4])\\\n",
+ " .withColumn('chosenAction', col('_label_Action').cast(IntegerType()))\\\n",
+ " .withColumn('label', col('_labelIndex').cast(DoubleType()))\\\n",
+ " .withColumn('probability', col('_label_probability'))\\\n",
+ " .select('GUser_id', 'GUser_major', 'GUser_hobby', 'GUser_favorite_character', 'TAction_0_topic', 'TAction_1_topic', 'TAction_2_topic', 'TAction_3_topic', 'TAction_4_topic', 'chosenAction', 'label', 'probability')\n",
+ "\n",
+ "print(\"Schema: \") \n",
+ "data.printSchema()"
+ ]
+ },
+ {
+ "source": [
+ "Add a pipeline of featurizers to convert all the feature columns into vectors."
+ ],
+ "cell_type": "markdown",
+ "metadata": {}
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from mmlspark.vw import VowpalWabbitFeaturizer, VowpalWabbitContextualBandit, VectorZipper\n",
+ "from pyspark.ml import Pipeline\n",
+ "pipeline = Pipeline(stages=[\n",
+ " VowpalWabbitFeaturizer(inputCols=['GUser_id'], outputCol='GUser_id_feature'),\n",
+ " VowpalWabbitFeaturizer(inputCols=['GUser_major'], outputCol='GUser_major_feature'),\n",
+ " VowpalWabbitFeaturizer(inputCols=['GUser_hobby'], outputCol='GUser_hobby_feature'),\n",
+ " VowpalWabbitFeaturizer(inputCols=['GUser_favorite_character'], outputCol='GUser_favorite_character_feature'),\n",
+ " VowpalWabbitFeaturizer(inputCols=['TAction_0_topic'], outputCol='TAction_0_topic_feature'),\n",
+ " VowpalWabbitFeaturizer(inputCols=['TAction_1_topic'], outputCol='TAction_1_topic_feature'),\n",
+ " VowpalWabbitFeaturizer(inputCols=['TAction_2_topic'], outputCol='TAction_2_topic_feature'),\n",
+ " VowpalWabbitFeaturizer(inputCols=['TAction_3_topic'], outputCol='TAction_3_topic_feature'),\n",
+ " VowpalWabbitFeaturizer(inputCols=['TAction_4_topic'], outputCol='TAction_4_topic_feature'),\n",
+ " VectorZipper(inputCols=['TAction_0_topic_feature', 'TAction_1_topic_feature', 'TAction_2_topic_feature', 'TAction_3_topic_feature','TAction_4_topic_feature'], outputCol='features')\n",
+ "])\n",
+ "transformation_pipeline = pipeline.fit(data)\n",
+ "transformed_data = transformation_pipeline.transform(data)\n",
+ "\n",
+ "display(transformed_data)"
+ ]
+ },
+ {
+ "source": [
+ "Build the VowpalWabbit Contextual Bandit model and compute performance statistics."
+ ],
+ "cell_type": "markdown",
+ "metadata": {}
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "estimator = VowpalWabbitContextualBandit() \\\n",
+ " .setArgs(\"--cb_explore_adf --epsilon 0.2 --quiet\") \\\n",
+ " .setSharedCol('GUser_id_feature') \\\n",
+ " .setAdditionalSharedFeatures([\"GUser_major_feature\", \"GUser_hobby_feature\", \"GUser_favorite_character_feature\"]) \\\n",
+ " .setFeaturesCol('features') \\\n",
+ " .setUseBarrierExecutionMode(False)\\\n",
+ " .setChosenActionCol('chosenAction')\\\n",
+ " .setLabelCol('label')\\\n",
+ " .setProbabilityCol('probability')\n",
+ "model = estimator.fit(transformed_data)\n",
+ "display(model.getPerformanceStatistics())"
+ ]
+ }
 ]
}
\ No newline at end of file

From c497742b0ac98fbe12c8a0d8de5199ea610a6caf Mon Sep 17 00:00:00 2001
From: Serena
Date: Wed, 19 May 2021 14:28:56 +0800
Subject: [PATCH 18/19] update notebook samples

--- .../Cognitive Services - Overview.ipynb | 132 +++----------
 ...antile Regression for Drug Discovery.ipynb | 178 ------------------
 .../samples/Vowpal Wabbit - Overview.ipynb | 10 +-
 ...antile Regression for Drug Discovery.ipynb | 146 --------------
 4 files changed, 31 insertions(+), 435 deletions(-)
 delete mode 100644 notebooks/samples/LightGBM - Quantile Regression for Drug Discovery.ipynb
 delete mode 100644 notebooks/samples/Vowpal Wabbit - Quantile Regression for Drug Discovery.ipynb

diff --git a/notebooks/samples/Cognitive Services - Overview.ipynb b/notebooks/samples/Cognitive Services - Overview.ipynb
index bc90555cb6..3a586efe84 100644
--- a/notebooks/samples/Cognitive Services - Overview.ipynb
+++ b/notebooks/samples/Cognitive Services - Overview.ipynb
@@ -348,9 +348,9 @@
 },
 {
 "source": [
- "## Azure Cognitive search - Creating a searchable Art Database with The MET's open-access collection 
sample\n", + "## Azure Cognitive search sample\n", "\n", - "In this example, we show how you can enrich data using Cognitive Skills and write to an Azure Search Index using MMLSpark. We use a subset of The MET's open-access collection and enrich it by passing it through 'Describe Image' and a custom 'Image Similarity' skill. The results are then written to a searchable index." + "In this example, we show how you can enrich data using Cognitive Skills and write to an Azure Search Index using MMLSpark." ], "cell_type": "markdown", "metadata": {} @@ -361,115 +361,37 @@ "metadata": {}, "outputs": [], "source": [ - "import os, sys, time, json, requests\n", - "from pyspark.ml import Transformer, Estimator, Pipeline\n", - "from pyspark.ml.feature import SQLTransformer\n", - "from pyspark.sql.functions import lit, udf, col, split" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ + "# import os, sys, time, json, requests\n", + "# from pyspark.ml import Transformer, Estimator, Pipeline\n", + "# from pyspark.ml.feature import SQLTransformer\n", + "# from pyspark.sql.functions import lit, udf, col, split\n", + "from mmlspark.cognitive import *\n", + "\n", "VISION_API_KEY = os.environ['VISION_API_KEY']\n", "AZURE_SEARCH_KEY = os.environ['AZURE_SEARCH_KEY']\n", "search_service = \"mmlspark-azure-search\"\n", - "search_index = \"test\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "data = spark.read\\\n", - " .format(\"csv\")\\\n", - " .option(\"header\", True)\\\n", - " .load(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/metartworks_sample.csv\")\\\n", - " .withColumn(\"searchAction\", lit(\"upload\"))\\\n", - " .withColumn(\"Neighbors\", split(col(\"Neighbors\"), \",\").cast(\"array\"))\\\n", - " .withColumn(\"Tags\", split(col(\"Tags\"), \",\").cast(\"array\"))\\\n", - " .limit(25)" - ] - }, - { - "source": [ - "" - ], - "cell_type": "markdown", - "metadata": {} - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from mmlspark.cognitive import AnalyzeImage\n", - "from mmlspark.stages import SelectColumns\n", - "\n", - "#define pipeline\n", - "describeImage = (AnalyzeImage()\n", - " .setSubscriptionKey(VISION_API_KEY)\n", - " .setLocation(\"eastus\")\n", - " .setImageUrlCol(\"PrimaryImageUrl\")\n", - " .setOutputCol(\"RawImageDescription\")\n", - " .setErrorCol(\"Errors\")\n", - " .setVisualFeatures([\"Categories\", \"Tags\", \"Description\", \"Faces\", \"ImageType\", \"Color\", \"Adult\"])\n", - " .setConcurrency(5))\n", - "\n", - "df2 = describeImage.transform(data)\\\n", - " .select(\"*\", \"RawImageDescription.*\").drop(\"Errors\", \"RawImageDescription\")" - ] - }, - { - "source": [ - "" - ], - "cell_type": "markdown", - "metadata": {} - }, - { - "source": [ - "Before writing the results to a Search Index, you must define a schema which must specify the name, type, and attributes of each field in your index. Refer [Create a basic index in Azure Search](https://docs.microsoft.com/en-us/azure/search/search-what-is-an-index) for more information." 
- ], - "cell_type": "markdown", - "metadata": {} - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from mmlspark.cognitive import *\n", - "df2.writeToAzureSearch(\n", - " subscriptionKey=AZURE_SEARCH_KEY,\n", + "search_index = \"test-33467690\"\n", + "\n", + "df = spark.createDataFrame([(\"upload\", \"0\", \"https://mmlspark.blob.core.windows.net/datasets/DSIR/test1.jpg\"), \n", + " (\"upload\", \"1\", \"https://mmlspark.blob.core.windows.net/datasets/DSIR/test2.jpg\")], \n", + " [\"searchAction\", \"id\", \"url\"])\n", + "\n", + "tdf = AnalyzeImage()\\\n", + " .setSubscriptionKey(VISION_API_KEY)\\\n", + " .setLocation(\"eastus\")\\\n", + " .setImageUrlCol(\"url\")\\\n", + " .setOutputCol(\"analyzed\")\\\n", + " .setErrorCol(\"errors\")\\\n", + " .setVisualFeatures([\"Categories\", \"Tags\", \"Description\", \"Faces\", \"ImageType\", \"Color\", \"Adult\"])\\\n", + " .transform(df)\\\n", + " .select(\"*\", \"analyzed.*\")\\\n", + " .drop(\"errors\", \"analyzed\")\n", + "\n", + "tdf.writeToAzureSearch(subscriptionKey=AZURE_SEARCH_KEY,\n", " actionCol=\"searchAction\",\n", " serviceName=search_service,\n", " indexName=search_index,\n", - " keyCol=\"ObjectID\"\n", - ")" - ] - }, - { - "source": [ - "The Search Index can be queried using the [Azure Search REST API](https://docs.microsoft.com/rest/api/searchservice/) by sending GET or POST requests and specifying query parameters that give the criteria for selecting matching documents. For more information on querying refer [Query your Azure Search index using the REST API](https://docs.microsoft.com/en-us/rest/api/searchservice/Search-Documents)" - ], - "cell_type": "markdown", - "metadata": {} - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "url = 'https://{}.search.windows.net/indexes/{}/docs/search?api-version=2019-05-06'.format(search_service, search_index)\n", - "requests.post(url, json={\"search\": \"Glass\"}, headers = {\"api-key\": AZURE_SEARCH_KEY}).json()" + " keyCol=\"id\")" ] }, { diff --git a/notebooks/samples/LightGBM - Quantile Regression for Drug Discovery.ipynb b/notebooks/samples/LightGBM - Quantile Regression for Drug Discovery.ipynb deleted file mode 100644 index ceece82376..0000000000 --- a/notebooks/samples/LightGBM - Quantile Regression for Drug Discovery.ipynb +++ /dev/null @@ -1,178 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## LightGBM - Quantile Regression for Drug Discovery\n", - "\n", - "We will demonstrate how to use the LightGBM quantile regressor with\n", - "TrainRegressor and ComputeModelStatistics on the Triazines dataset.\n", - "\n", - "\n", - "This sample demonstrates how to use the following APIs:\n", - "- [`TrainRegressor`\n", - " ](http://mmlspark.azureedge.net/docs/pyspark/TrainRegressor.html)\n", - "- [`LightGBMRegressor`\n", - " ](http://mmlspark.azureedge.net/docs/pyspark/LightGBMRegressor.html)\n", - "- [`ComputeModelStatistics`\n", - " ](http://mmlspark.azureedge.net/docs/pyspark/ComputeModelStatistics.html)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "triazines = spark.read.format(\"libsvm\")\\\n", - " .load(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/triazines.scale.svmlight\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# print some basic info\n", - 
"print(\"records read: \" + str(triazines.count()))\n", - "print(\"Schema: \")\n", - "triazines.printSchema()\n", - "triazines.limit(10).toPandas()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Split the dataset into train and test" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "train, test = triazines.randomSplit([0.85, 0.15], seed=1)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Train the quantile regressor on the training data." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from mmlspark.lightgbm import LightGBMRegressor\n", - "model = LightGBMRegressor(objective='quantile',\n", - " alpha=0.2,\n", - " learningRate=0.3,\n", - " numLeaves=31).fit(train)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can save and load LightGBM to a file using the LightGBM native representation" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from mmlspark.lightgbm import LightGBMRegressionModel\n", - "model.saveNativeModel(\"/mymodel\")\n", - "model = LightGBMRegressionModel.loadNativeModelFromFile(\"/mymodel\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "View the feature importances of the trained model." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "print(model.getFeatureImportances())" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Score the regressor on the test data." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "scoredData = model.transform(test)\n", - "scoredData.limit(10).toPandas()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Compute metrics using ComputeModelStatistics" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from mmlspark.train import ComputeModelStatistics\n", - "metrics = ComputeModelStatistics(evaluationMetric='regression',\n", - " labelCol='label',\n", - " scoresCol='prediction') \\\n", - " .transform(scoredData)\n", - "metrics.toPandas()" - ] - } - ], - "metadata": { - "anaconda-cloud": {}, - "kernelspec": { - "display_name": "Python [default]", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.3" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} \ No newline at end of file diff --git a/notebooks/samples/Vowpal Wabbit - Overview.ipynb b/notebooks/samples/Vowpal Wabbit - Overview.ipynb index 3bfaf98bae..d9180b303f 100644 --- a/notebooks/samples/Vowpal Wabbit - Overview.ipynb +++ b/notebooks/samples/Vowpal Wabbit - Overview.ipynb @@ -466,7 +466,7 @@ ")\n", "vw_train_data = vw_featurizer.transform(train_data)['target', 'features']\n", "vw_test_data = vw_featurizer.transform(test_data)['target', 'features']\n", - "display(vw_train_data.limit(10).toPandas())" + "display(vw_train_data)" ] }, { @@ -493,9 +493,7 @@ ")\n", "\n", "# To reduce number of partitions (which will effect performance), use `vw_train_data.repartition(1)`\n", - "vw_train_data_2 = 
vw_train_data.repartition(1)\n", - "print(vw_train_data_2.count())\n", - "vw_model = vwr.fit(vw_train_data_2.repartition(1))\n", + "vw_model = vwr.fit(vw_train_data.repartition(1))\n", "vw_predictions = vw_model.transform(vw_test_data)\n", "\n", "display(vw_predictions.limit(20).toPandas())" @@ -673,7 +671,7 @@ "metadata": {}, "outputs": [], "source": [ - "data = spark.read.format(\"json\").option(\"inferSchema\", True).load(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/vwcb_input.dsjson\")" + "data = spark.read.format(\"json\").load(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/vwcb_input.dsjson\")" ] }, { @@ -744,7 +742,7 @@ }, { "source": [ - "Buiild VowpalWabbit Contextual Bandit model and compute performance statistics." + "Build VowpalWabbit Contextual Bandit model and compute performance statistics." ], "cell_type": "markdown", "metadata": {} diff --git a/notebooks/samples/Vowpal Wabbit - Quantile Regression for Drug Discovery.ipynb b/notebooks/samples/Vowpal Wabbit - Quantile Regression for Drug Discovery.ipynb deleted file mode 100644 index 4ffafef149..0000000000 --- a/notebooks/samples/Vowpal Wabbit - Quantile Regression for Drug Discovery.ipynb +++ /dev/null @@ -1,146 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Vowpal Wabbit - Quantile Regression for Drug Discovery\n", - "\n", - "We will demonstrate how to use the VowpalWabbit quantile regressor with\n", - "TrainRegressor and ComputeModelStatistics on the Triazines dataset.\n", - "\n", - "\n", - "This sample demonstrates how to use the following APIs:\n", - "- [`TrainRegressor`\n", - " ](http://mmlspark.azureedge.net/docs/pyspark/TrainRegressor.html)\n", - "- [`VowpalWabbitRegressor`\n", - " ](http://mmlspark.azureedge.net/docs/pyspark/VowpalWabbitRegressor.html)\n", - "- [`ComputeModelStatistics`\n", - " ](http://mmlspark.azureedge.net/docs/pyspark/ComputeModelStatistics.html)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "triazines = spark.read.format(\"libsvm\")\\\n", - " .load(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/triazines.scale.svmlight\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# print some basic info\n", - "print(\"records read: \" + str(triazines.count()))\n", - "print(\"Schema: \")\n", - "triazines.printSchema()\n", - "triazines.limit(10).toPandas()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Split the dataset into train and test" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "train, test = triazines.randomSplit([0.85, 0.15], seed=1)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Train the quantile regressor on the training data.\n", - "\n", - "Note: have a look at stderr for the task to see VW's output\n", - "\n", - "Full command line argument docs can be found [here](https://github.com/VowpalWabbit/vowpal_wabbit/wiki/Command-Line-Arguments).\n", - "\n", - "Learning rate, numPasses and power_t are exposed to support grid search." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from mmlspark.vw import VowpalWabbitRegressor\n", - "model = (VowpalWabbitRegressor(numPasses=20, args=\"--holdout_off --loss_function quantile -q :: -l 0.1\")\n", - " .fit(train))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Score the regressor on the test data." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "scoredData = model.transform(test)\n", - "scoredData.limit(10).toPandas()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Compute metrics using ComputeModelStatistics" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from mmlspark.train import ComputeModelStatistics\n", - "metrics = ComputeModelStatistics(evaluationMetric='regression',\n", - " labelCol='label',\n", - " scoresCol='prediction') \\\n", - " .transform(scoredData)\n", - "metrics.toPandas()" - ] - } - ], - "metadata": { - "anaconda-cloud": {}, - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.5" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} \ No newline at end of file From 31a66cbac2c663ec89711817f625b990cc24f2c2 Mon Sep 17 00:00:00 2001 From: Serena Date: Wed, 19 May 2021 14:31:14 +0800 Subject: [PATCH 19/19] tiny modification --- notebooks/samples/Cognitive Services - Overview.ipynb | 4 ---- 1 file changed, 4 deletions(-) diff --git a/notebooks/samples/Cognitive Services - Overview.ipynb b/notebooks/samples/Cognitive Services - Overview.ipynb index 3a586efe84..26a464f62c 100644 --- a/notebooks/samples/Cognitive Services - Overview.ipynb +++ b/notebooks/samples/Cognitive Services - Overview.ipynb @@ -361,10 +361,6 @@ "metadata": {}, "outputs": [], "source": [ - "# import os, sys, time, json, requests\n", - "# from pyspark.ml import Transformer, Estimator, Pipeline\n", - "# from pyspark.ml.feature import SQLTransformer\n", - "# from pyspark.sql.functions import lit, udf, col, split\n", "from mmlspark.cognitive import *\n", "\n", "VISION_API_KEY = os.environ['VISION_API_KEY']\n",