-
Notifications
You must be signed in to change notification settings - Fork 1.9k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Checking in the samples generated during bug bash for MissingNa, Repl… #2960
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,85 @@ | ||
using System; | ||
using System.Collections.Generic; | ||
using System.Linq; | ||
using Microsoft.ML.Data; | ||
using static Microsoft.ML.Transforms.OneHotEncodingEstimator; | ||
|
||
namespace Microsoft.ML.Samples.Dynamic | ||
{ | ||
public static class OneHotEncoding | ||
{ | ||
public static void Example() | ||
{ | ||
// Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging, | ||
// as well as the source of randomness. | ||
var mlContext = new MLContext(); | ||
|
||
// Get a small dataset as an IEnumerable. | ||
var samples = new List<DataPoint>() | ||
{ | ||
new DataPoint(){ Label = 0, Education = "0-5yrs" }, | ||
new DataPoint(){ Label = 1, Education = "0-5yrs" }, | ||
new DataPoint(){ Label = 45, Education = "6-11yrs" }, | ||
new DataPoint(){ Label = 50, Education = "6-11yrs" }, | ||
new DataPoint(){ Label = 50, Education = "11-15yrs" }, | ||
}; | ||
|
||
// Convert training data to IDataView. | ||
var trainData = mlContext.Data.LoadFromEnumerable(samples); | ||
|
||
// A pipeline for one hot encoding the Education column. | ||
var bagPipeline = mlContext.Transforms.Categorical.OneHotEncoding("EducationOneHotEncoded", "Education", OutputKind.Bag); | ||
// Fit to data. | ||
var bagTransformer = bagPipeline.Fit(trainData); | ||
|
||
// Get transformed data | ||
var bagTransformedData = bagTransformer.Transform(trainData); | ||
// Getting the data of the newly created column, so we can preview it. | ||
var bagEncodedColumn = bagTransformedData.GetColumn<float[]>("EducationOneHotEncoded"); | ||
|
||
var keyPipeline = mlContext.Transforms.Categorical.OneHotEncoding("EducationOneHotEncoded", "Education", OutputKind.Key); | ||
// Fit to data. | ||
var keyTransformer = keyPipeline.Fit(trainData); | ||
|
||
// Get transformed data | ||
var keyTransformedData = keyTransformer.Transform(trainData); | ||
// Getting the data of the newly created column, so we can preview it. | ||
var keyEncodedColumn = keyTransformedData.GetColumn<uint>("EducationOneHotEncoded"); | ||
|
||
Console.WriteLine("One Hot Encoding based on the bagging strategy."); | ||
foreach (var row in bagEncodedColumn) | ||
{ | ||
for (var i = 0; i < row.Length; i++) | ||
Console.Write($"{row[i]} "); | ||
} | ||
|
||
// data column obtained post-transformation. | ||
// Since there are three distinct categories in the Education column of the trainData, the output vector | ||
// for one hot will have three slots. | ||
// | ||
// 1 0 0 | ||
// 1 0 0 | ||
// 0 1 0 | ||
// 0 1 0 | ||
// 0 0 1 | ||
|
||
Console.WriteLine("One Hot Encoding with key type output."); | ||
foreach (var element in keyEncodedColumn) | ||
Console.WriteLine(element); | ||
|
||
// 1 | ||
// 1 | ||
// 2 | ||
// 2 | ||
// 3 | ||
|
||
} | ||
|
||
private class DataPoint | ||
{ | ||
public float Label { get; set; } | ||
|
||
public string Education { get; set; } | ||
} | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,74 @@ | ||
using System; | ||
using System.Collections.Generic; | ||
using System.Linq; | ||
using Microsoft.ML.Data; | ||
|
||
namespace Microsoft.ML.Samples.Dynamic | ||
{ | ||
public static class IndicateMissingValues | ||
{ | ||
|
||
public static void Example() | ||
{ | ||
// Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging, | ||
// as well as the source of randomness. | ||
var mlContext = new MLContext(); | ||
|
||
var samples = new List<DataPoint>() | ||
{ | ||
new DataPoint(){ Label = 3, Features = new float[3] {1, 1, 0} }, | ||
new DataPoint(){ Label = 32, Features = new float[3] {0, float.NaN, 1} }, | ||
new DataPoint(){ Label = float.NaN, Features = new float[3] {-1, float.NaN, -3} }, | ||
}; | ||
// Convert training data to IDataView, the general data type used in ML.NET. | ||
var data = mlContext.Data.LoadFromEnumerable(samples); | ||
|
||
// IndicateMissingValues is used to create a boolean vector containing | ||
// 'true' in every slot where the value in the input column is missing (NaN). This indicator can be used | ||
// to decide where missing values should be replaced with other values. | ||
IEstimator<ITransformer> pipeline = mlContext.Transforms.IndicateMissingValues("MissingIndicator", "Features"); | ||
|
||
// Now we can transform the data and look at the output to confirm the behavior of the estimator. | ||
// This operation doesn't actually evaluate data until we read the data below. | ||
var tansformer = pipeline.Fit(data); | ||
var transformedData = tansformer.Transform(data); | ||
|
||
// We can extract the newly created column as an IEnumerable of SampleDataTransformed, the class we define below. | ||
var rowEnumerable = mlContext.Data.CreateEnumerable<SampleDataTransformed>(transformedData, reuseRowObject: false); | ||
|
||
// a small printing utility | ||
Func<object[], string> vectorPrinter = (object[] vector) => | ||
{ | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Break out of main code path and into a helper. #Pending There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I feel like the main logic is above. Breaking it out would only change what the user sees first: the definition of the printing helper or the printing itself. In reply to: 265789218 [](ancestors = 265789218) |
||
string preview = "["; | ||
foreach (var slot in vector) | ||
preview += $"{slot} "; | ||
return preview += "]"; | ||
|
||
}; | ||
|
||
// And finally, we can write out the rows of the dataset, looking at the columns of interest. | ||
foreach (var row in rowEnumerable) | ||
{ | ||
Console.WriteLine($"Label: {row.Label} Features: {vectorPrinter(row.Features.Cast<object>().ToArray())} MissingIndicator: {vectorPrinter(row.MissingIndicator.Cast<object>().ToArray())}"); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It feels self-explanatory, since it casts and then calls ToArray. Adding yet another sample that does the same thing might make the sample look less professional. In reply to: 265789625 [](ancestors = 265789625) |
||
} | ||
|
||
// Expected output: | ||
// | ||
// Label: 3 Features: [1 1 0] MissingIndicator: [False False False] | ||
// Label: 32 Features: [0 NaN 1] MissingIndicator: [False True False] | ||
// Label: NaN Features: [-1 NaN -3] MissingIndicator: [False True False] | ||
} | ||
|
||
private class DataPoint | ||
{ | ||
public float Label { get; set; } | ||
[VectorType(3)] | ||
public float[] Features { get; set; } | ||
} | ||
|
||
private sealed class SampleDataTransformed : DataPoint | ||
{ | ||
public bool[] MissingIndicator { get; set; } | ||
} | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,102 @@ | ||
using System; | ||
using System.Collections.Generic; | ||
using System.Linq; | ||
using Microsoft.ML.Data; | ||
using static Microsoft.ML.Transforms.MissingValueReplacingEstimator.ColumnOptions; | ||
|
||
namespace Microsoft.ML.Samples.Dynamic | ||
{ | ||
class ReplaceMissingValues | ||
{ | ||
public static void Example() | ||
{ | ||
// Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging, | ||
// as well as the source of randomness. | ||
var mlContext = new MLContext(); | ||
|
||
var samples = new List<DataPoint>() | ||
{ | ||
new DataPoint(){ Label = 3, Features = new float[3] {1, 1, 0} }, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If |
||
new DataPoint(){ Label = 32, Features = new float[3] {0, float.NaN, 1} }, | ||
new DataPoint(){ Label = 5, Features = new float[3] {-1, 2, -3} }, | ||
new DataPoint(){ Label = 9, Features = new float[3] {-1, 6, -3} }, | ||
}; | ||
// Convert training data to IDataView, the general data type used in ML.NET. | ||
var data = mlContext.Data.LoadFromEnumerable(samples); | ||
|
||
// ReplaceMissingValues is used to create a column where missing values are replaced according to the ReplacementMode. | ||
var meanPipeline = mlContext.Transforms.ReplaceMissingValues("MissingReplaced", "Features", ReplacementMode.Mean); | ||
|
||
// Now we can transform the data and look at the output to confirm the behavior of the estimator. | ||
// This operation doesn't actually evaluate data until we read the data below. | ||
var meanTransformer = meanPipeline.Fit(data); | ||
var meanTransformedData = meanTransformer.Transform(data); | ||
|
||
// We can extract the newly created column as an IEnumerable of SampleDataTransformed, the class we define below. | ||
var meanRowEnumerable = mlContext.Data.CreateEnumerable<SampleDataTransformed>(meanTransformedData, reuseRowObject: false); | ||
|
||
// A second ReplaceMissingValues pipeline, this time using ReplacementMode.DefaultValue, so missing values are replaced by the default value of the column's type. | ||
var defaultPipeline = mlContext.Transforms.ReplaceMissingValues("MissingReplaced", "Features", ReplacementMode.DefaultValue); | ||
|
||
// Now we can transform the data and look at the output to confirm the behavior of the estimator. | ||
// This operation doesn't actually evaluate data until we read the data below. | ||
var defaultTransformer = defaultPipeline.Fit(data); | ||
var defaultTransformedData = defaultTransformer.Transform(data); | ||
|
||
// We can extract the newly created column as an IEnumerable of SampleDataTransformed, the class we define below. | ||
var defaultRowEnumerable = mlContext.Data.CreateEnumerable<SampleDataTransformed>(defaultTransformedData, reuseRowObject: false); | ||
|
||
// a small printing utility | ||
Func<object[], string> vectorPrinter = (object[] vector) => | ||
{ | ||
string preview = "["; | ||
foreach (var slot in vector) | ||
preview += $"{slot} "; | ||
return preview += "]"; | ||
|
||
}; | ||
|
||
// And finally, we can write out the rows of the dataset, looking at the columns of interest. | ||
foreach (var row in meanRowEnumerable) | ||
{ | ||
Console.WriteLine($"Label: {row.Label} Features: {vectorPrinter(row.Features.Cast<object>().ToArray())} MissingReplaced: {vectorPrinter(row.MissingReplaced.Cast<object>().ToArray())}"); | ||
} | ||
|
||
// Expected output: | ||
// Notice how the NaN of the Features column for the second row is replaced by 3, the mean of (1, 2, 6), the non-missing values found in that same slot in the other rows. | ||
// | ||
//Label: 3 Features: [1 1 0] MissingReplaced: [1 1 0] | ||
//Label: 32 Features: [0 NaN 1] MissingReplaced: [0 3 1] | ||
//Label: 5 Features: [-1 2 -3] MissingReplaced: [-1 2 -3] | ||
//Label: 9 Features: [-1 6 -3] MissingReplaced: [-1 6 -3] | ||
|
||
// And finally, we can write out the rows of the dataset, looking at the columns of interest. | ||
foreach (var row in defaultRowEnumerable) | ||
{ | ||
Console.WriteLine($"Label: {row.Label} Features: {vectorPrinter(row.Features.Cast<object>().ToArray())} MissingReplaced: {vectorPrinter(row.MissingReplaced.Cast<object>().ToArray())}"); | ||
} | ||
|
||
// Expected output: | ||
// Notice how the NaN of the Features column for the second row is replaced by 0, the default value for floats. | ||
// | ||
//Label: 3 Features: [1 1 0] MissingReplaced: [1 1 0] | ||
//Label: 32 Features: [0 NaN 1] MissingReplaced: [0 0 1] | ||
//Label: 5 Features: [-1 2 -3] MissingReplaced: [-1 2 -3] | ||
//Label: 9 Features: [-1 6 -3] MissingReplaced: [-1 6 -3] | ||
} | ||
|
||
private class DataPoint | ||
{ | ||
public float Label { get; set; } | ||
|
||
[VectorType(3)] | ||
public float[] Features { get; set; } | ||
} | ||
|
||
private sealed class SampleDataTransformed : DataPoint | ||
{ | ||
[VectorType(3)] | ||
public float[] MissingReplaced { get; set; } | ||
} | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -6,7 +6,7 @@ internal static class Program | |
{ | ||
static void Main(string[] args) | ||
{ | ||
CustomMapping.Example(); | ||
ReplaceMissingValues.Example(); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
please don't change this file. It creates unnecessary merge conflicts. |
||
} | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Blank line above #Resolved