diff --git a/.vsts-dotnet-ci.yml b/.vsts-dotnet-ci.yml index 03fc2dbdd..9f7e2cf9f 100644 --- a/.vsts-dotnet-ci.yml +++ b/.vsts-dotnet-ci.yml @@ -7,6 +7,14 @@ phases: inputs: projects: '.\samples\csharp\getting-started\GettingStarted.sln' +- phase: FSharpGettingStarted + queue: Hosted VS2017 + steps: + - task: DotNetCoreCLI@2 + displayName: Build F# GettingStarted + inputs: + projects: '.\samples\fsharp\getting-started\GettingStarted.sln' + - phase: BinaryClasification_Titanic queue: Hosted VS2017 steps: diff --git a/README.md b/README.md index 52dc7df18..567293fcd 100644 --- a/README.md +++ b/README.md @@ -5,9 +5,10 @@ [ML.NET](https://www.microsoft.com/net/learn/apps/machine-learning-and-ai/ml-dotnet) is a cross-platform open-source machine learning framework that makes machine learning accessible to .NET developers. ML.NET samples are divided in three categories: -* **Getting started** - basic "hello world" samples for each ML task. +* **Getting started (C#)** - basic "hello world" samples for each ML task, in C# +* **Getting started (F#)** - basic "hello world" samples for each ML task, in F# * **Examples** - examples of how you can use various ML.NET components (learners, transforms, ...). -* **End-to-end apps** - real world examples of web, desktop, mobile, and other applications infused with ML solutions via [ML.NET APIs](https://docs.microsoft.com/dotnet/api/?view=ml-dotnet). +* **End-to-end (C#)** - real world examples of web, desktop, mobile, and other applications infused with ML solutions via [ML.NET APIs](https://docs.microsoft.com/dotnet/api/?view=ml-dotnet). All samples in this repo are using the latest released [Microsoft.ML](https://www.nuget.org/packages/Microsoft.ML/) NuGet package. If you would like to see the examples referencing the source code, check out [scenario tests](https://github.com/dotnet/machinelearning/tree/master/test/Microsoft.ML.Tests/Scenarios) in [ML.NET repository](https://github.com/dotnet/machinelearning). 
diff --git a/samples/Directory.Build.props b/samples/Directory.Build.props index 029a82325..ef31a207e 100644 --- a/samples/Directory.Build.props +++ b/samples/Directory.Build.props @@ -1,7 +1,7 @@ - 0.3.0 + 0.4.0 diff --git a/samples/fsharp/getting-started/BinaryClassification_SentimentAnalysis/BinaryClassification_SentimentAnalysis.fsproj b/samples/fsharp/getting-started/BinaryClassification_SentimentAnalysis/BinaryClassification_SentimentAnalysis.fsproj new file mode 100644 index 000000000..7169b9854 --- /dev/null +++ b/samples/fsharp/getting-started/BinaryClassification_SentimentAnalysis/BinaryClassification_SentimentAnalysis.fsproj @@ -0,0 +1,19 @@ + + + + Exe + netcoreapp2.0 + + + + + + + + + + + + + + diff --git a/samples/fsharp/getting-started/BinaryClassification_SentimentAnalysis/Program.fs b/samples/fsharp/getting-started/BinaryClassification_SentimentAnalysis/Program.fs new file mode 100644 index 000000000..83c69967c --- /dev/null +++ b/samples/fsharp/getting-started/BinaryClassification_SentimentAnalysis/Program.fs @@ -0,0 +1,103 @@ +module BinaryClassification_SentimentAnalysis + +open System +open System.IO +open Microsoft.ML +open Microsoft.ML.Data +open Microsoft.ML.Models +open Microsoft.ML.Runtime.Api +open Microsoft.ML.Trainers +open Microsoft.ML.Transforms + +type SentimentData() = + [] + member val SentimentText: string = "" with get, set + + [] + member val Sentiment : double = 0.0 with get, set + +type SentimentPrediction() = + [] + member val Sentiment : bool = false with get, set + +let sentiments = + [| SentimentData(SentimentText = "Contoso's 11 is a wonderful experience", Sentiment = 1.0) + SentimentData(SentimentText = "The acting in this movie is very bad", Sentiment = 0.0) + SentimentData(SentimentText = "Joe versus the Volcano Coffee Company is a great film.", Sentiment = 1.0) |] + +let AppPath = Path.Combine(__SOURCE_DIRECTORY__, "../../../..") +let TrainDataPath = Path.Combine(AppPath, "datasets", "sentiment-imdb-train.txt") +let 
TestDataPath = Path.Combine(AppPath, "datasets", "sentiment-yelp-test.txt") +let modelPath = Path.Combine(AppPath, "SentimentModel.zip") + +let TrainAsync() = + // LearningPipeline holds all steps of the learning process: data, transforms, learners. + let pipeline = LearningPipeline() + + // The TextLoader loads a dataset. The schema of the dataset is specified by passing a class containing + // all the column names and their types. + pipeline.Add(TextLoader(TrainDataPath).CreateFrom()) + + // TextFeaturizer is a transform that will be used to featurize an input column to format and clean the data. + pipeline.Add(TextFeaturizer("Features", "SentimentText")) + + // FastTreeBinaryClassifier is an algorithm that will be used to train the model. + // Three of its hyperparameters are set explicitly here to tune decision tree performance. + pipeline.Add(FastTreeBinaryClassifier(NumLeaves = 5, NumTrees = 5, MinDocumentsInLeafs = 2)) + + Console.WriteLine("=============== Training model ===============") + // The pipeline is trained on the dataset that has been loaded and transformed. + let model = pipeline.Train() + + // Save the model as a .zip file; the asynchronous write is awaited synchronously, so this blocks until it finishes. + model.WriteAsync(modelPath) |> Async.AwaitTask |> Async.RunSynchronously + + Console.WriteLine("=============== End training ===============") + Console.WriteLine(sprintf "The model is saved to %s" modelPath) + + model + +let Evaluate(model: PredictionModel ) = + // To evaluate how well the model predicts values, the model is run against a new set + // of data (test data) that was not involved in training. + let testData = TextLoader(TestDataPath).CreateFrom() + + // BinaryClassificationEvaluator performs evaluation for Binary Classification type of ML problems. 
+ let evaluator = BinaryClassificationEvaluator() + + Console.WriteLine("=============== Evaluating model ===============") + + let metrics = evaluator.Evaluate(model, testData) + // BinaryClassificationMetrics contains the overall metrics computed by binary classification evaluators + // The Accuracy metric gets the accuracy of a classifier which is the proportion + // of correct predictions in the test set. + + // The Auc metric gets the area under the ROC curve. + // The area under the ROC curve is equal to the probability that the classifier ranks + // a randomly chosen positive instance higher than a randomly chosen negative one + // (assuming 'positive' ranks higher than 'negative'). + + // The F1Score metric gets the classifier's F1 score. + // The F1 score is the harmonic mean of precision and recall: + // 2 * precision * recall / (precision + recall). + + Console.WriteLine(sprintf "Accuracy: %0.2f" metrics.Accuracy) + Console.WriteLine(sprintf "Auc: %0.2f" metrics.Auc) + Console.WriteLine(sprintf "F1Score: %0.2f" metrics.F1Score) + Console.WriteLine("=============== End evaluating ===============") + Console.WriteLine() + +// STEP 1: Create a model +let model = TrainAsync() + +// STEP 2: Test accuracy +Evaluate(model) + +// STEP 3: Make a prediction +let predictions = model.Predict(sentiments) + +for (sentiment, prediction) in Seq.zip sentiments predictions do + Console.WriteLine( sprintf "Sentiment: %s | Prediction: %s sentiment" sentiment.SentimentText (if prediction.Sentiment then "Positive" else "Negative")) + +Console.ReadLine() |> ignore + diff --git a/samples/fsharp/getting-started/BinaryClassification_SentimentAnalysis/README.md b/samples/fsharp/getting-started/BinaryClassification_SentimentAnalysis/README.md new file mode 100644 index 000000000..b7bf9e896 --- /dev/null +++ b/samples/fsharp/getting-started/BinaryClassification_SentimentAnalysis/README.md @@ -0,0 +1,72 @@ +# Sentiment Analysis for User Reviews +In this introductory sample, you'll see 
how to use [ML.NET](https://www.microsoft.com/net/learn/apps/machine-learning-and-ai/ml-dotnet) to predict a sentiment (positive or negative) for customer reviews. In the world of machine learning, this type of prediction is known as **binary classification**. + +## Problem +This problem is centered around predicting if a customer's review has positive or negative sentiment. We will use IMDB and Yelp comments that were processed by humans and each comment has been assigned a label: +* 0 - negative +* 1 - positive + +Using those datasets we will build a model that will analyze a string and predict a sentiment value of 0 or 1. + +## ML task - Binary classification +The generalized problem of **binary classification** is to classify items into one of two classes (classifying items into more than two classes is called **multiclass classification**). + +* predict if an insurance claim is valid or not. +* predict if a plane will be delayed or will arrive on time. +* predict if a face ID (photo) belongs to the owner of a device. + +The common feature for all those examples is that the parameter we want to predict can take only one of two values. In other words, this value is represented by `boolean` type. + +## Solution +To solve this problem, first we will build an ML model. Then we will train the model on existing data, evaluate how good it is, and lastly we'll consume the model to predict a sentiment for new reviews. + +![Build -> Train -> Evaluate -> Consume](https://github.com/dotnet/machinelearning-samples/raw/master/samples/getting-started/shared_content/modelpipeline.png) + +### 1. Build model + +Building a model includes: uploading data (`sentiment-imdb-train.txt` with `TextLoader`), transforming the data so it can be used effectively by an ML algorithm (with `TextFeaturizer`), and choosing a learning algorithm (`FastTreeBinaryClassifier`). 
All of those steps are stored in a `LearningPipeline`: +```fsharp +// LearningPipeline holds all steps of the learning process: data, transforms, learners. +let pipeline = LearningPipeline() +// The TextLoader loads a dataset. The schema of the dataset is specified by passing a class containing +// all the column names and their types. +pipeline.Add(TextLoader(TrainDataPath).CreateFrom()) +// TextFeaturizer is a transform that will be used to featurize an input column to format and clean the data. +pipeline.Add(TextFeaturizer("Features", "SentimentText")) +// FastTreeBinaryClassifier is an algorithm that will be used to train the model. +// It has three hyperparameters for tuning decision tree performance. +pipeline.Add(FastTreeBinaryClassifier(NumLeaves = 5, NumTrees = 5, MinDocumentsInLeafs = 2)) +``` +### 2. Train model +Training the model is a process of running the chosen algorithm on training data (with known sentiment values) to tune the parameters of the model. It is implemented in the `Train()` API. To perform training we just call the method and provide the types for our data object `SentimentData` and prediction object `SentimentPrediction`. +```fsharp +let model = pipeline.Train() +``` +### 3. Evaluate model +We need this step to determine how accurately our model performs on new data. To do so, the model from the previous step is run against another dataset that was not used in training (`sentiment-yelp-test.txt`). This dataset also contains known sentiments. `BinaryClassificationEvaluator` calculates the difference between known sentiments and values predicted by the model in various metrics. 
+```fsharp + let testData = TextLoader(TestDataPath).CreateFrom() + + let evaluator = BinaryClassificationEvaluator() + let metrics = evaluator.Evaluate(model, testData) +``` +>*To learn more on how to understand the metrics, check out the Machine Learning glossary from the [ML.NET Guide](https://docs.microsoft.com/en-us/dotnet/machine-learning/) or use any available materials on data science and machine learning*. + +If you are not satisfied with the quality of the model, there are a variety of ways to improve it, which will be covered in the *examples* category. + +>*Keep in mind that for this sample the quality is lower than it could be because the datasets were reduced in size for performance purposes. You can use bigger labeled sentiment datasets available online to significantly improve the quality.* + +### 4. Consume model +After the model is trained, we can use the `Predict()` API to predict the sentiment for new reviews. + +```fsharp +let predictions = model.Predict(sentiments) +``` +Where `sentiments` contains new user reviews that we want to analyze. 
+ +```fsharp +let sentiments = + [| SentimentData(SentimentText = "Contoso's 11 is a wonderful experience", Sentiment = 1.0) + SentimentData(SentimentText = "The acting in this movie is very bad", Sentiment = 0.0) + SentimentData(SentimentText = "Joe versus the Volcano Coffee Company is a great film.", Sentiment = 1.0) |] +``` \ No newline at end of file diff --git a/samples/fsharp/getting-started/Clustering_Iris/Clustering_Iris.fsproj b/samples/fsharp/getting-started/Clustering_Iris/Clustering_Iris.fsproj new file mode 100644 index 000000000..ce0fdc839 --- /dev/null +++ b/samples/fsharp/getting-started/Clustering_Iris/Clustering_Iris.fsproj @@ -0,0 +1,18 @@ + + + + Exe + netcoreapp2.0 + + + + + + + + + + + + + diff --git a/samples/fsharp/getting-started/Clustering_Iris/Program.fs b/samples/fsharp/getting-started/Clustering_Iris/Program.fs new file mode 100644 index 000000000..cebe37741 --- /dev/null +++ b/samples/fsharp/getting-started/Clustering_Iris/Program.fs @@ -0,0 +1,94 @@ +module Clustering_Iris + +open System +open System.IO +open Microsoft.ML +open Microsoft.ML.Runtime.Api +open Microsoft.ML.Data +open Microsoft.ML.Trainers +open Microsoft.ML.Transforms + +let AppPath = Path.Combine(__SOURCE_DIRECTORY__, "../../../..") +let DataPath = Path.Combine(AppPath, "datasets", "iris-full.txt") +let ModelPath = Path.Combine(AppPath, "IrisClustersModel.zip") + +type IrisData() = + [] + member val Label = 0.0 with get,set + + [] + member val SepalLength = 0.0 with get, set + + [] + member val SepalWidth = 0.0 with get, set + + [] + member val PetalLength = 0.0 with get, set + + [] + member val PetalWidth = 0.0 with get, set + +type ClusterPrediction() = + [] + member val SelectedClusterId = 0 with get, set + + [] + member val Distance : float[] = null with get, set + +let Train() = + // LearningPipeline holds all steps of the learning process: data, transforms, learners. + let pipeline = LearningPipeline() + // The TextLoader loads a dataset. 
The schema of the dataset is specified by passing a class containing + // all the column names and their types. + pipeline.Add(TextLoader(DataPath).CreateFrom(useHeader=true)) + // ColumnConcatenator concatenates the four measurement columns into the Features column (Label is not included) + pipeline.Add(ColumnConcatenator("Features", + "SepalLength", + "SepalWidth", + "PetalLength", + "PetalWidth")) + // KMeansPlusPlusClusterer is an algorithm that will be used to build clusters. We set the number of clusters to 3. + pipeline.Add(KMeansPlusPlusClusterer(K = 3)) + + Console.WriteLine("=============== Training model ===============") + let model = pipeline.Train() + Console.WriteLine("=============== End training ===============") + + // Save the model as a .zip file; the asynchronous write is awaited synchronously, so this blocks until it finishes. + model.WriteAsync(ModelPath) |> Async.AwaitTask |> Async.RunSynchronously + Console.WriteLine("The model is saved to {0}", ModelPath) + + model + +module TestIrisData = + let Setosa1 = IrisData(SepalLength = 5.1, SepalWidth = 3.3, PetalLength = 1.6, PetalWidth = 0.2) + let Setosa2 = IrisData(SepalLength = 0.2, SepalWidth = 5.1, PetalLength = 3.5, PetalWidth = 1.4) + let Virginica1 = IrisData(SepalLength = 6.4, SepalWidth = 3.1, PetalLength = 5.5, PetalWidth = 2.2) + let Virginica2 = IrisData(SepalLength = 2.5, SepalWidth = 6.3, PetalLength = 3.3, PetalWidth = 6.0) + let Versicolor1 = IrisData(SepalLength = 6.4, SepalWidth = 3.1, PetalLength = 4.5, PetalWidth = 1.5) + let Versicolor2 = IrisData(SepalLength = 7.0, SepalWidth = 3.2, PetalLength = 4.7, PetalWidth = 1.4) + +// STEP 1: Create a model +let model = Train() + +// STEP 2: Make a prediction (F# format strings treat braces literally, so the cluster id is printed with a plain %d) +Console.WriteLine() +let prediction1 = model.Predict(TestIrisData.Setosa1) +let prediction2 = model.Predict(TestIrisData.Setosa2) +Console.WriteLine("Clusters assigned for setosa flowers:") +Console.WriteLine(sprintf " %d" prediction1.SelectedClusterId) +Console.WriteLine(sprintf " %d" prediction2.SelectedClusterId) + +let prediction3 = model.Predict(TestIrisData.Virginica1) +let 
prediction4 = model.Predict(TestIrisData.Virginica2) +Console.WriteLine("Clusters assigned for virginica flowers:") +Console.WriteLine(sprintf " %d" prediction3.SelectedClusterId) +Console.WriteLine(sprintf " %d" prediction4.SelectedClusterId) + +let prediction5 = model.Predict(TestIrisData.Versicolor1) +let prediction6 = model.Predict(TestIrisData.Versicolor2) +Console.WriteLine("Clusters assigned for versicolor flowers:") +Console.WriteLine(sprintf " %d" prediction5.SelectedClusterId) +Console.WriteLine(sprintf " %d" prediction6.SelectedClusterId) +Console.ReadLine() |> ignore + diff --git a/samples/fsharp/getting-started/Clustering_Iris/README.md b/samples/fsharp/getting-started/Clustering_Iris/README.md new file mode 100644 index 000000000..d45ff6430 --- /dev/null +++ b/samples/fsharp/getting-started/Clustering_Iris/README.md @@ -0,0 +1,68 @@ +# Clustering Iris Data +In this introductory sample, you'll see how to use [ML.NET](https://www.microsoft.com/net/learn/apps/machine-learning-and-ai/ml-dotnet) to divide iris flowers into different groups that correspond to different types of iris. In the world of machine learning, this task is known as **clustering**. + +## Problem +To demonstrate clustering API in action, we will use three types of iris flowers: setosa, versicolor, and virginica. All of them are stored in the same dataset. Even though the type of these flowers is known, we will not use it and run clustering algorithm only on flower parameters such as petal length, petal width, etc. The task is to group all flowers into three different clusters. We would expect the flowers of different types belong to different clusters. 
+ +The inputs of the model are following iris parameters: +* petal length +* petal width +* sepal length +* sepal width + +## ML task - Clustering +The generalized problem of **clustering** is to group a set of objects in such a way that objects in the same group are more similar to each other than to those in other groups. + +Some other examples of clustering: +* group news articles into topics: sports, politics, tech, etc. +* group customers by purchase preferences. +* divide a digital image into distinct regions for border detection or object recognition. + +Clustering can look similar to multiclass classification, but the difference is that for clustering tasks we don't know the answers for the past data. So there is no "tutor"/"supervisor" that can tell if our algorithm's prediction was right or wrong. This type of ML task is called **unsupervised learning**. + +## Solution +To solve this problem, first we will build and train an ML model. Then we will use trained model for predicting a cluster for iris flowers. + +### 1. Build model + +Building a model includes: uploading data (`iris-full.txt` with `TextLoader`), transforming the data so it can be used effectively by an ML algorithm (with `ColumnConcatenator`), and choosing a learning algorithm (`KMeansPlusPlusClusterer`). All of those steps are stored in a `LearningPipeline`: + +```fsharp +// LearningPipeline holds all steps of the learning process: data, transforms, learners. +let pipeline = LearningPipeline() +// The TextLoader loads a dataset. The schema of the dataset is specified by passing a class containing +// all the column names and their types. +pipeline.Add(TextLoader(DataPath).CreateFrom(useHeader=true)) + +// ColumnConcatenator concatenates all columns into Features column +pipeline.Add(ColumnConcatenator("Features", + "SepalLength", + "SepalWidth", + "PetalLength", + "PetalWidth")) + +// KMeansPlusPlusClusterer is an algorithm that will be used to build clusters. 
We set the number of clusters to 3. +pipeline.Add(KMeansPlusPlusClusterer(K = 3)) +``` + +### 2. Train model +Training the model is a process of running the chosen algorithm on the given data. It is implemented in the `Train()` API. To perform training we just call the method and provide our data object `IrisData` and prediction object `ClusterPrediction`. + +```fsharp +let model = pipeline.Train() +``` + +### 3. Consume model +After the model is built and trained, we can use the `Predict()` API to predict the cluster for an iris flower and calculate the distance from given flower parameters to each cluster (each centroid of a cluster). + +```fsharp +let prediction1 = model.Predict(TestIrisData.Setosa1) +``` + +Where `TestIrisData.Setosa1` stores the information about a setosa iris flower. + +```fsharp +module TestIrisData = + let Setosa1 = IrisData(SepalLength = 5.1, SepalWidth = 3.3, PetalLength = 1.6, PetalWidth = 0.2) + ... +``` \ No newline at end of file diff --git a/samples/fsharp/getting-started/GettingStarted.sln b/samples/fsharp/getting-started/GettingStarted.sln new file mode 100644 index 000000000..b4e21cf1b --- /dev/null +++ b/samples/fsharp/getting-started/GettingStarted.sln @@ -0,0 +1,43 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio 15 +VisualStudioVersion = 15.0.27703.2000 +MinimumVisualStudioVersion = 10.0.40219.1 +Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Regression_TaxiFarePrediction", "Regression_TaxiFarePrediction\Regression_TaxiFarePrediction.fsproj", "{C7301D08-10E3-4A51-A70D-7C0BCB39F6E6}" +EndProject +Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "BinaryClassification_SentimentAnalysis", "BinaryClassification_SentimentAnalysis\BinaryClassification_SentimentAnalysis.fsproj", "{ED877F56-5304-4F0D-A75C-4C77219C8D0E}" +EndProject +Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "MulticlassClassification_Iris", "MulticlassClassification_Iris\MulticlassClassification_Iris.fsproj", 
"{EEC2E07E-7482-4F37-8F7A-135EBDEC75B4}" +EndProject +Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Clustering_Iris", "Clustering_Iris\Clustering_Iris.fsproj", "{F7E57F21-1CD6-4808-98F3-6D367672D4A5}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|Any CPU = Debug|Any CPU + Release|Any CPU = Release|Any CPU + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {C7301D08-10E3-4A51-A70D-7C0BCB39F6E6}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {C7301D08-10E3-4A51-A70D-7C0BCB39F6E6}.Debug|Any CPU.Build.0 = Debug|Any CPU + {C7301D08-10E3-4A51-A70D-7C0BCB39F6E6}.Release|Any CPU.ActiveCfg = Release|Any CPU + {C7301D08-10E3-4A51-A70D-7C0BCB39F6E6}.Release|Any CPU.Build.0 = Release|Any CPU + {ED877F56-5304-4F0D-A75C-4C77219C8D0E}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {ED877F56-5304-4F0D-A75C-4C77219C8D0E}.Debug|Any CPU.Build.0 = Debug|Any CPU + {ED877F56-5304-4F0D-A75C-4C77219C8D0E}.Release|Any CPU.ActiveCfg = Release|Any CPU + {ED877F56-5304-4F0D-A75C-4C77219C8D0E}.Release|Any CPU.Build.0 = Release|Any CPU + {EEC2E07E-7482-4F37-8F7A-135EBDEC75B4}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {EEC2E07E-7482-4F37-8F7A-135EBDEC75B4}.Debug|Any CPU.Build.0 = Debug|Any CPU + {EEC2E07E-7482-4F37-8F7A-135EBDEC75B4}.Release|Any CPU.ActiveCfg = Release|Any CPU + {EEC2E07E-7482-4F37-8F7A-135EBDEC75B4}.Release|Any CPU.Build.0 = Release|Any CPU + {F7E57F21-1CD6-4808-98F3-6D367672D4A5}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {F7E57F21-1CD6-4808-98F3-6D367672D4A5}.Debug|Any CPU.Build.0 = Debug|Any CPU + {F7E57F21-1CD6-4808-98F3-6D367672D4A5}.Release|Any CPU.ActiveCfg = Release|Any CPU + {F7E57F21-1CD6-4808-98F3-6D367672D4A5}.Release|Any CPU.Build.0 = Release|Any CPU + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection + GlobalSection(ExtensibilityGlobals) = postSolution + SolutionGuid = {B84E804C-06CA-45C8-9B9F-8F69CA930535} + EndGlobalSection 
+EndGlobal diff --git a/samples/fsharp/getting-started/MulticlassClassification_Iris/MulticlassClassification_Iris.fsproj b/samples/fsharp/getting-started/MulticlassClassification_Iris/MulticlassClassification_Iris.fsproj new file mode 100644 index 000000000..0ed209c6d --- /dev/null +++ b/samples/fsharp/getting-started/MulticlassClassification_Iris/MulticlassClassification_Iris.fsproj @@ -0,0 +1,19 @@ + + + + Exe + netcoreapp2.0 + + + + + + + + + + + + + + diff --git a/samples/fsharp/getting-started/MulticlassClassification_Iris/Program.fs b/samples/fsharp/getting-started/MulticlassClassification_Iris/Program.fs new file mode 100644 index 000000000..aa4037b75 --- /dev/null +++ b/samples/fsharp/getting-started/MulticlassClassification_Iris/Program.fs @@ -0,0 +1,136 @@ +module MulticlassClassification_Iris + +open System +open System.IO +open Microsoft.ML +open Microsoft.ML.Data +open Microsoft.ML.Runtime.Api +open Microsoft.ML.Models +open Microsoft.ML.Trainers +open Microsoft.ML.Transforms + +let AppPath = Path.Combine(__SOURCE_DIRECTORY__, "../../../..") +let TrainDataPath= Path.Combine(AppPath, "datasets", "iris-train.txt") +let TestDataPath= Path.Combine(AppPath, "datasets", "iris-test.txt") +let ModelPath= Path.Combine(AppPath, "IrisModel.zip") + +type IrisData() = + [] + member val Label = 0.0 with get,set + + [] + member val SepalLength = 0.0 with get, set + + [] + member val SepalWidth = 0.0 with get, set + + [] + member val PetalLength = 0.0 with get, set + + [] + member val PetalWidth = 0.0 with get, set + +type IrisPrediction() = + + [] + member val Score : float[] = null with get, set + + +let TrainAsync() = + // LearningPipeline holds all steps of the learning process: data, transforms, learners. + let pipeline = LearningPipeline() + // The TextLoader loads a dataset. The schema of the dataset is specified by passing a class containing + // all the column names and their types. 
+ pipeline.Add(TextLoader(TrainDataPath).CreateFrom()) + + // Transforms + // When ML model starts training, it looks for two columns: Label and Features. + // Label: values that should be predicted. If you have a field named Label in your data type, + // like in this example, no extra actions required. + // If you don’t have it, copy the column you want to predict with ColumnCopier transform: + // new ColumnCopier(("FareAmount", "Label")) + // Features: all data used for prediction. At the end of all transforms you need to concatenate + // all columns except the one you want to predict into Features column with + // ColumnConcatenator transform: + pipeline.Add(ColumnConcatenator("Features", + "SepalLength", + "SepalWidth", + "PetalLength", + "PetalWidth")) + + // StochasticDualCoordinateAscentClassifier is an algorithm that will be used to train the model. + pipeline.Add(StochasticDualCoordinateAscentClassifier()) + + Console.WriteLine("=============== Training model ===============") + // The pipeline is trained on the dataset that has been loaded and transformed. + let model = pipeline.Train() + + // Save the model as a .zip file; the asynchronous write is awaited synchronously, so this blocks until it finishes. + model.WriteAsync(ModelPath) |> Async.AwaitTask |> Async.RunSynchronously + + Console.WriteLine("=============== End training ===============") + Console.WriteLine("The model is saved to {0}", ModelPath) + + model + +module TestIrisData = + let Iris1 = IrisData(SepalLength = 5.1, SepalWidth = 3.3, PetalLength = 1.6, PetalWidth= 0.2) + let Iris2 = IrisData(SepalLength = 6.4, SepalWidth = 3.1, PetalLength = 5.5, PetalWidth = 2.2) + let Iris3 = IrisData(SepalLength = 4.4, SepalWidth = 3.1, PetalLength = 2.5, PetalWidth = 1.2) + +let Evaluate(model : PredictionModel) = + // To evaluate how well the model predicts values, the model is run against a new set + // of data (test data) that was not involved in training. 
+ let testData = TextLoader(TestDataPath).CreateFrom() + + // ClassificationEvaluator performs evaluation for Multiclass Classification type of ML problems. + let evaluator = ClassificationEvaluator(OutputTopKAcc = Nullable(3)) + + Console.WriteLine("=============== Evaluating model ===============") + + let metrics = evaluator.Evaluate(model, testData) + Console.WriteLine("Metrics:") + Console.WriteLine(sprintf " AccuracyMacro = %0.4f, a value between 0 and 1, the closer to 1, the better" metrics.AccuracyMacro) + Console.WriteLine(sprintf " AccuracyMicro = %0.4f, a value between 0 and 1, the closer to 1, the better" metrics.AccuracyMicro) + Console.WriteLine(sprintf " LogLoss = %0.4f, the closer to 0, the better" metrics.LogLoss) + Console.WriteLine(sprintf " LogLoss for class 1 = %0.4f, the closer to 0, the better" metrics.PerClassLogLoss.[0]) + Console.WriteLine(sprintf " LogLoss for class 2 = %0.4f, the closer to 0, the better" metrics.PerClassLogLoss.[1]) + Console.WriteLine(sprintf " LogLoss for class 3 = %0.4f, the closer to 0, the better" metrics.PerClassLogLoss.[2]) + Console.WriteLine() + Console.WriteLine(sprintf " ConfusionMatrix:") + + // Print the confusion matrix: one row per line, cells separated by tabs + for i in 0 .. metrics.ConfusionMatrix.Order - 1 do + for j in 0 .. metrics.ConfusionMatrix.ClassNames.Count - 1 do + Console.Write("\t" + string metrics.ConfusionMatrix.[i, j]) + Console.WriteLine() + + Console.WriteLine("=============== End evaluating ===============") + Console.WriteLine() + +// STEP 1: Create a model +let model = TrainAsync() + +// STEP2: Test accuracy +Evaluate(model) + +// STEP 3: Make a prediction +Console.WriteLine() +let prediction1 = model.Predict(TestIrisData.Iris1) +Console.WriteLine(sprintf "Actual: setosa. 
Predicted probability: setosa: %0.4f" prediction1.Score.[0]) +Console.WriteLine(sprintf " versicolor: %0.4f" prediction1.Score.[1]) +Console.WriteLine(sprintf " virginica: %0.4f" prediction1.Score.[2]) +Console.WriteLine() + +let prediction2 = model.Predict(TestIrisData.Iris2) +Console.WriteLine(sprintf "Actual: virginica. Predicted probability: setosa: %0.4f" prediction2.Score.[0]) +Console.WriteLine(sprintf " versicolor: %0.4f" prediction2.Score.[1]) +Console.WriteLine(sprintf " virginica: %0.4f" prediction2.Score.[2]) +Console.WriteLine() + +let prediction3 = model.Predict(TestIrisData.Iris3) +Console.WriteLine(sprintf "Actual: versicolor. Predicted probability: setosa: %0.4f" prediction3.Score.[0]) +Console.WriteLine(sprintf " versicolor: %0.4f" prediction3.Score.[1]) +Console.WriteLine(sprintf " virginica: %0.4f" prediction3.Score.[2]) + +Console.ReadLine() |> ignore diff --git a/samples/fsharp/getting-started/MulticlassClassification_Iris/README.md b/samples/fsharp/getting-started/MulticlassClassification_Iris/README.md new file mode 100644 index 000000000..9724879d9 --- /dev/null +++ b/samples/fsharp/getting-started/MulticlassClassification_Iris/README.md @@ -0,0 +1,103 @@ +# Iris Classification +In this introductory sample, you'll see how to use [ML.NET](https://www.microsoft.com/net/learn/apps/machine-learning-and-ai/ml-dotnet) to predict the type of iris flower. In the world of machine learning, this type of prediction is known as **multiclass classification**. + +## Problem +This problem is centered around predicting the type of an iris flower (setosa, versicolor, or virginica) based on the flower's parameters such as petal length, petal width, etc. 
+ +To solve this problem, we will build an ML model that takes as inputs 4 parameters: +* petal length +* petal width +* sepal length +* sepal width + +and predicts which iris type the flower belongs to: +* setosa +* versicolor +* virginica + +To be precise, the model will return probabilities for the flower to belong to each type. + +## ML task - Multiclass classification +The generalized problem of **multiclass classification** is to classify items into one of three or more classes. (Classifying items into one of the two classes is called **binary classification**). + +Some other examples of multiclass classification are: +* handwriting digit recognition: predict which of 10 digits (0-9) an image contains. +* issues labeling: predict which category (UI, back end, documentation) an issue belongs to. +* disease stage prediction based on patient's test results. + +The common feature for all those examples is that the parameter we want to predict can take one of a few (more than two) values. In other words, this value is represented by `enum`, not by `integer`, `float`/`double` or `boolean` types. + +## Solution +To solve this problem, first we will build an ML model. Then we will train the model on existing data, evaluate how good it is, and lastly we'll consume the model to predict an iris type. + +![Build -> Train -> Evaluate -> Consume](https://github.com/dotnet/machinelearning-samples/raw/master/samples/getting-started/shared_content/modelpipeline.png) + +### 1. Build model + +Building a model includes: uploading data (`iris-train.txt` with `TextLoader`), transforming the data so it can be used effectively by an ML algorithm (with `ColumnConcatenator`), and choosing a learning algorithm (`StochasticDualCoordinateAscentClassifier`). All of those steps are stored in a `LearningPipeline`: +```fsharp +// LearningPipeline holds all steps of the learning process: data, transforms, learners. +let pipeline = LearningPipeline() + +// The TextLoader loads a dataset. 
The schema of the dataset is specified by passing a class containing +// all the column names and their types. +pipeline.Add(TextLoader(TrainDataPath).CreateFrom()) + +// Transforms +// When ML model starts training, it looks for two columns: Label and Features. +// Label: values that should be predicted. If you have a field named Label in your data type, +// like in this example, no extra actions required. +// If you don’t have it, copy the column you want to predict with ColumnCopier transform: +// ColumnCopier(struct ("FareAmount", "Label")) +// Features: all data used for prediction. At the end of all transforms you need to concatenate +// all columns except the one you want to predict into Features column with +// ColumnConcatenator transform: +pipeline.Add(ColumnConcatenator("Features", + "SepalLength", + "SepalWidth", + "PetalLength", + "PetalWidth")) +// StochasticDualCoordinateAscentClassifier is an algorithm that will be used to train the model. +pipeline.Add(StochasticDualCoordinateAscentClassifier()) +``` + +### 2. Train model +Training the model is a process of running the chosen algorithm on training data (with known iris types) to tune the parameters of the model. It is implemented in the `Train()` API. To perform training we just call the method and provide the types for our data object `IrisData` and prediction object `IrisPrediction`. + +```fsharp +let model = pipeline.Train() +``` + +### 3. Evaluate model +We need this step to determine how accurately our model performs on new data. To do so, the model from the previous step is run against another dataset that was not used in training (`iris-test.txt`). This dataset also contains known iris types. `ClassificationEvaluator` calculates the difference between known types and values predicted by the model in various metrics. 
+ +```fsharp +let testData = TextLoader(TestDataPath).CreateFrom() + +let evaluator = ClassificationEvaluator(OutputTopKAcc=3.0) +let metrics = evaluator.Evaluate(model, testData) +``` + +>*To learn more on how to understand the metrics, check out the Machine Learning glossary from the [ML.NET Guide](https://docs.microsoft.com/en-us/dotnet/machine-learning/) or use any available materials on data science and machine learning*. + +If you are not satisfied with the quality of the model, there are a variety of ways to improve it, which will be covered in the *examples* category. + +### 4. Consume model + +After the model is trained, we can use the `Predict()` API to predict the probability that this flower belongs to each iris type. + +```fsharp +let prediction1 = model.Predict(TestIrisData.Iris1) +Console.WriteLine(sprintf "Actual: setosa. Predicted probability: setosa: %0.4f" prediction1.Score.[0]) +Console.WriteLine(sprintf " versicolor: %0.4f" prediction1.Score.[1]) +Console.WriteLine(sprintf " virginica: %0.4f" prediction1.Score.[2]) +Console.WriteLine() +``` + +Where `TestIrisData.Iris1` stores the information about the flower we'd like to predict the type for. + +```fsharp +module TestIrisData = + let Iris1 = IrisData(SepalLength = 5.1, SepalWidth = 3.3, PetalLength = 1.6, PetalWidth= 0.2) + ... 
+``` diff --git a/samples/fsharp/getting-started/Regression_TaxiFarePrediction/Program.fs b/samples/fsharp/getting-started/Regression_TaxiFarePrediction/Program.fs new file mode 100644 index 000000000..30a3a15b1 --- /dev/null +++ b/samples/fsharp/getting-started/Regression_TaxiFarePrediction/Program.fs @@ -0,0 +1,280 @@ +module Regression_TaxiFarePrediction + +open Microsoft.ML.Runtime.Api +open System +open System.Diagnostics +open System.IO +open System.Linq +open Microsoft.ML +open Microsoft.ML.Data +open Microsoft.ML.Models +open Microsoft.ML.Trainers +open Microsoft.ML.Transforms + +open PLplot + +let AppPath = Path.Combine(__SOURCE_DIRECTORY__, "../../../..") +let TrainDataPath= Path.Combine(AppPath, "datasets", "taxi-fare-train.csv") +let TestDataPath= Path.Combine(AppPath, "datasets", "taxi-fare-test.csv") +let ModelPath= Path.Combine(AppPath, "TaxiFareModel.zip") + +type TaxiTrip() = + [] + member val VendorId = "" with get, set + + [] + member val RateCode = "" with get, set + + [] + member val PassengerCount = 0.0 with get, set + + [] + member val TripTime = 0.0 with get, set + + [] + member val TripDistance = 0.0 with get, set + + [] + member val PaymentType = "" with get, set + + [] + member val FareAmount = 0.0 with get,set + +type TaxiTripFarePrediction() = + [] + member val FareAmount = 0.0 with get, set + +module TestTaxiTrips = + let Trip1 = + TaxiTrip( + VendorId = "VTS", + RateCode = "1", + PassengerCount = 1.0, + TripDistance = 10.33, + PaymentType = "CSH", + FareAmount = 0.0 // predict it. actual = 29.5 + ) + + +let Train() = + // LearningPipeline holds all steps of the learning process: data, transforms, learners. + let pipeline = LearningPipeline() + + // The TextLoader loads a dataset. The schema of the dataset is specified by passing a class containing + // all the column names and their types. 
+ pipeline.Add (TextLoader(TrainDataPath).CreateFrom(separator=',')) + + // Transforms + // When ML model starts training, it looks for two columns: Label and Features. + // Label: values that should be predicted. If you have a field named Label in your data type, + // no extra actions required. + // If you don't have it, like in this example, copy the column you want to predict with + // ColumnCopier transform: + pipeline.Add(ColumnCopier(struct ("FareAmount", "Label"))) + + // CategoricalOneHotVectorizer transforms categorical (string) values into 0/1 vectors + pipeline.Add(CategoricalOneHotVectorizer("VendorId", "RateCode", "PaymentType")) + + // Features: all data used for prediction. At the end of all transforms you need to concatenate + // all columns except the one you want to predict into Features column with + // ColumnConcatenator transform: + pipeline.Add(ColumnConcatenator("Features", + "VendorId", + "RateCode", + "PassengerCount", + "TripDistance", + "PaymentType")) + //FastTreeRegressor is an algorithm that will be used to train the model. + pipeline.Add(FastTreeRegressor()) + + Console.WriteLine("=============== Training model ===============") + // The pipeline is trained on the dataset that has been loaded and transformed. + let model = pipeline.Train() + + // Saving the model as a .zip file. + model.WriteAsync(ModelPath) |> Async.AwaitTask |> Async.RunSynchronously + + Console.WriteLine("=============== End training ===============") + Console.WriteLine("The model is saved to {0}", ModelPath) + + model + +let Evaluate(model: PredictionModel) = + + // To evaluate how well the model predicts values, it is run against a new set + // of data (test data) that was not involved in training. + let testData = TextLoader(TestDataPath).CreateFrom(separator=',') + + // RegressionEvaluator calculates the differences (in various metrics) between predicted and actual + // values in the test dataset. 
+ let evaluator = RegressionEvaluator() + + Console.WriteLine("=============== Evaluating model ===============") + + let metrics = evaluator.Evaluate(model, testData) + + Console.WriteLine("Rms = {0}, ideally should be around 2.8, can be improved with larger dataset", metrics.Rms) + Console.WriteLine("RSquared = {0}, a value between 0 and 1, the closer to 1, the better", metrics.RSquared) + Console.WriteLine("=============== End evaluating ===============") + Console.WriteLine() + +let GetDataFromCsv(dataLocation: string, numMaxRecords: int) = + File.ReadAllLines(dataLocation) + .Skip(1) + .Select(fun x -> x.Split(',')) + .Select(fun x -> + TaxiTrip( + VendorId = x.[0], + RateCode = x.[1], + PassengerCount = Double.Parse(x.[2], System.Globalization.CultureInfo.InvariantCulture), + TripTime = Double.Parse(x.[3], System.Globalization.CultureInfo.InvariantCulture), + TripDistance = Double.Parse(x.[4], System.Globalization.CultureInfo.InvariantCulture), + PaymentType = x.[5], + FareAmount = Double.Parse(x.[6], System.Globalization.CultureInfo.InvariantCulture) + ) + ) + .Take(numMaxRecords) + +let PaintChart(model: PredictionModel, + testDataSetPath: string, + numberOfRecordsToRead: int, + args: string[]) = + + use pl = new PLStream() + // pick the output device: SVG when the only argument is "svg", otherwise PNG (pngcairo); the chart file name is relative + let chartFileName = + if (args.Length = 1 && args.[0] = "svg") then + pl.sdev("svg") + let chartFileName = "TaxiRegressionDistribution.svg" + pl.sfnam(chartFileName) + chartFileName + else + pl.sdev("pngcairo") + let chartFileName = "TaxiRegressionDistribution.png" + pl.sfnam(chartFileName) + chartFileName + + // use white background with black foreground + pl.spal0("cmap0_alternate.pal") + + // Initialize plplot + pl.init() + + // set axis limits + let xMinLimit = 0.0 + let xMaxLimit = 40.0 //Rides larger than $40 are not shown in the chart + let yMinLimit = 0.0 + let yMaxLimit = 40.0 //Rides larger than $40 are not shown in 
the chart + pl.env(xMinLimit, xMaxLimit, yMinLimit, yMaxLimit, AxesScale.Independent, AxisBox.BoxTicksLabelsAxes) + + // Set scaling for main title text 125% size of default + pl.schr(0.0, 1.25) + + // The main title + 
pl.lab("Measured", "Predicted", "Distribution of Taxi Fare Prediction") + + // select the drawing color by palette index + // see http://plplot.sourceforge.net/examples.php?demo=02 for palette indices + pl.col0(1) + + let totalNumber = numberOfRecordsToRead + let testData = GetDataFromCsv(testDataSetPath, totalNumber).ToList() + + //This code is the symbol to paint + let code = (char)9 + + // switch to another color for the dots + //pl.col0(9) //Light Green + //pl.col0(4) //Red + pl.col0(2) //Blue + + let mutable yTotal = 0.0 + let mutable xTotal = 0.0 + let mutable xyMultiTotal = 0.0 + let mutable xSquareTotal = 0.0 + + for i in 0 .. testData.Count-1 do + let farePrediction = model.Predict(testData.[i]) + + let x = [| testData.[i].FareAmount |] + let y = [| farePrediction.FareAmount |] + + //Paint a dot + pl.poin(x, y, code) + + xTotal <- xTotal + x.[0] + yTotal <- yTotal + y.[0] + + let multi = x.[0] * y.[0] + xyMultiTotal <- xyMultiTotal + multi + + let xSquare = x.[0] * x.[0] + xSquareTotal <- xSquareTotal + xSquare + + let ySquare = y.[0] * y.[0] + + Console.WriteLine(sprintf "-------------------------------------------------") + Console.WriteLine("Predicted : {0}", farePrediction.FareAmount) + Console.WriteLine("Actual: {0}", testData.[i].FareAmount) + Console.WriteLine(sprintf "-------------------------------------------------") + + // Regression line calculation (means are taken over the rows actually read, which may be fewer than requested): + // https://www.khanacademy.org/math/statistics-probability/describing-relationships-quantitative-data/more-on-regression/v/regression-line-example + + let meanY = yTotal / double testData.Count + let meanX = xTotal / double testData.Count + let meanXY = xyMultiTotal / double testData.Count + let meanXsquare = xSquareTotal / double testData.Count + + let m = ((meanX * meanY) - meanXY) / ((meanX * meanX) - meanXsquare) + + let b = meanY - (m * meanX) + + //Generic function for Y for the regression line + // y = (m * x) + b + 
+ let x1 = 1.0 + //Function for Y1 in the line + let y1 = (m * x1) + b + + let x2 = 39.0 + //Function for 
Y2 in the line + let y2 = (m * x2) + b + + let xArray = [| x1; x2 |] + let yArray = [| y1; y2 |] + + pl.col0(4) + pl.line(xArray, yArray) + + // end page (writes output to disk) + pl.eop() + + // output version of PLplot + let verText = pl.gver() + Console.WriteLine("PLplot version " + verText) + + // Open the chart file with the shell's default application for its type + + Console.WriteLine("Showing chart...") + let chartFileNamePath = Path.Combine(".", chartFileName) + let p = new Process(StartInfo=ProcessStartInfo(chartFileNamePath, UseShellExecute = true)) + p.Start() |> ignore + + +// STEP 1: Create a model +let model = Train() + +// STEP2: Test accuracy +Evaluate(model) + +// STEP 3: Make a test prediction +let prediction = model.Predict(TestTaxiTrips.Trip1) +Console.WriteLine("Predicted fare: {0:0.####}, actual fare: 29.5", prediction.FareAmount) + +//STEP 4: Paint regression distribution chart for a number of elements read from a Test DataSet file +let args = Environment.GetCommandLineArgs().[1..] +PaintChart(model, TestDataPath, 100, args) + +Console.WriteLine("Press any key to exit..") +Console.ReadLine() |> ignore diff --git a/samples/fsharp/getting-started/Regression_TaxiFarePrediction/README.md b/samples/fsharp/getting-started/Regression_TaxiFarePrediction/README.md new file mode 100644 index 000000000..224cb2196 --- /dev/null +++ b/samples/fsharp/getting-started/Regression_TaxiFarePrediction/README.md @@ -0,0 +1,110 @@ +# Taxi Fare Prediction +In this introductory sample, you'll see how to use [ML.NET](https://www.microsoft.com/net/learn/apps/machine-learning-and-ai/ml-dotnet) to predict taxi fares. In the world of machine learning, this type of prediction is known as **regression**. + +## Problem +This problem is centered around predicting the fare of a taxi trip in New York City. At first glance, it may seem to depend simply on the distance traveled. 
However, taxi vendors in New York charge varying amounts for other factors such as additional passengers, paying with a credit card instead of cash and so on. This prediction can be used in applications for taxi providers to give users and drivers an estimate of ride fares. + +To solve this problem, we will build an ML model that takes as inputs: +* vendor ID +* rate code +* passenger count +* trip time +* trip distance +* payment type + +and predicts the fare of the ride. + +## ML task - Regression +The generalized problem of **regression** is to predict some continuous value for given parameters, for example: +* predict a house price based on number of rooms, location, year built, etc. +* predict a car fuel consumption based on fuel type and car parameters. +* predict a time estimate for fixing an issue based on issue attributes. + +The common feature for all those examples is that the parameter we want to predict can take any numeric value in a certain range. In other words, this value is represented by `integer` or `float`/`double`, not by `enum` or `boolean` types. + +## Solution +To solve this problem, first we will build an ML model. Then we will train the model on existing data, evaluate how good it is, and lastly we'll consume the model to predict taxi fares. + +![Build -> Train -> Evaluate -> Consume](https://github.com/dotnet/machinelearning-samples/raw/master/samples/getting-started/shared_content/modelpipeline.png) + +### 1. Build model + +Building a model includes: uploading data (`taxi-fare-train.csv` with `TextLoader`), transforming the data so it can be used effectively by an ML algorithm (with `ColumnCopier`,`CategoricalOneHotVectorizer`,`ColumnConcatenator`), and choosing a learning algorithm (`FastTreeRegressor`). All of those steps are stored in a `LearningPipeline`: +```fsharp +// LearningPipeline holds all steps of the learning process: data, transforms, learners. +let pipeline = LearningPipeline() + +// The TextLoader loads a dataset. 
The schema of the dataset is specified by passing a class containing +// all the column names and their types. This will be used to create the model, and train it. +pipeline.Add(TextLoader(TrainDataPath).CreateFrom(separator=',')) +// Transforms +// When ML model starts training, it looks for two columns: Label and Features. +// Label: values that should be predicted. If you have a field named Label in your data type, +// no extra actions required. +// If you don’t have it, like in this example, copy the column you want to predict with +// ColumnCopier transform: +pipeline.Add(ColumnCopier(struct ("FareAmount", "Label"))) + +// CategoricalOneHotVectorizer transforms categorical (string) values into 0/1 vectors +pipeline.Add(CategoricalOneHotVectorizer("VendorId", + "RateCode", + "PaymentType")) + +// Features: all data used for prediction. At the end of all transforms you need to concatenate +// all columns except the one you want to predict into Features column with +// ColumnConcatenator transform: +pipeline.Add(ColumnConcatenator("Features", + "VendorId", + "RateCode", + "PassengerCount", + "TripDistance", + "PaymentType")) + +//FastTreeRegressor is an algorithm that will be used to train the model. +pipeline.Add(FastTreeRegressor()) +``` + +### 2. Train model +Training the model is a process of running the chosen algorithm on training data (with known fare values) to tune the parameters of the model. It is implemented in the `Train()` API. To perform training we just call the method and provide the types for our data object `TaxiTrip` and prediction object `TaxiTripFarePrediction`. + +```fsharp +let model = pipeline.Train() +``` + +### 3. Evaluate model +We need this step to determine how accurately our model performs on new data. To do so, the model from the previous step is run against another dataset that was not used in training (`taxi-fare-test.csv`). This dataset also contains known fares. 
`RegressionEvaluator` calculates the difference between known fares and values predicted by the model in various metrics. + +```fsharp +let testData = TextLoader(TestDataPath).CreateFrom(separator=',') + +let evaluator = RegressionEvaluator() +let metrics = evaluator.Evaluate(model, testData) +``` + +>*To learn more on how to understand the metrics, check out the Machine Learning glossary from the [ML.NET Guide](https://docs.microsoft.com/en-us/dotnet/machine-learning/) or use any available materials on data science and machine learning*. + +If you are not satisfied with the quality of the model, there are a variety of ways to improve it, which will be covered in the *examples* category. + +>*Keep in mind that for this sample the quality is lower than it could be because the datasets were reduced in size for performance purposes. You can use the original datasets to significantly improve the quality (Original datasets are referenced in datasets [README](../../../../datasets/README.md)).* + +### 4. Consume model +After the model is trained, we can use the `Predict()` API to predict the fare amount for a specified trip. + +```fsharp +let prediction = model.Predict(TestTaxiTrips.Trip1) +Console.WriteLine("Predicted fare: {0:0.####}, actual fare: 29.5", prediction.FareAmount) +``` +Where `TestTaxiTrips.Trip1` stores the information about the trip we'd like to get the prediction for. + +```fsharp +module TestTaxiTrips = + let Trip1 = + TaxiTrip( + VendorId = "VTS", + RateCode = "1", + PassengerCount = 1.0, + TripDistance = 10.33, + PaymentType = "CSH", + FareAmount = 0.0 // predict it. 
actual = 29.5 + ) +``` diff --git a/samples/fsharp/getting-started/Regression_TaxiFarePrediction/Regression_TaxiFarePrediction.fsproj b/samples/fsharp/getting-started/Regression_TaxiFarePrediction/Regression_TaxiFarePrediction.fsproj new file mode 100644 index 000000000..404aa9be1 --- /dev/null +++ b/samples/fsharp/getting-started/Regression_TaxiFarePrediction/Regression_TaxiFarePrediction.fsproj @@ -0,0 +1,20 @@ + + + + Exe + netcoreapp2.0 + + + + + + + + + + + + + + + diff --git a/samples/fsharp/getting-started/Regression_TaxiFarePrediction/Sample-Regression-Chart.png b/samples/fsharp/getting-started/Regression_TaxiFarePrediction/Sample-Regression-Chart.png new file mode 100644 index 000000000..af1c59565 Binary files /dev/null and b/samples/fsharp/getting-started/Regression_TaxiFarePrediction/Sample-Regression-Chart.png differ diff --git a/samples/fsharp/getting-started/shared_content/modelpipeline.png b/samples/fsharp/getting-started/shared_content/modelpipeline.png new file mode 100644 index 000000000..e6da7f3df Binary files /dev/null and b/samples/fsharp/getting-started/shared_content/modelpipeline.png differ