-
Notifications
You must be signed in to change notification settings - Fork 1.9k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Clean up the SchemaDefinition class #2995
Changes from all commits
1d2d219
f23cdbf
be22be6
b5cf7a4
5ce0537
1860cce
0598c9f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,114 @@ | ||
// Licensed to the .NET Foundation under one or more agreements. | ||
// The .NET Foundation licenses this file to you under the MIT license. | ||
// See the LICENSE file in the project root for more information. | ||
|
||
using System; | ||
using System.Linq; | ||
using Microsoft.ML.Data; | ||
using Microsoft.ML.RunTests; | ||
using Microsoft.ML.TestFramework; | ||
using Xunit; | ||
using Xunit.Abstractions; | ||
|
||
namespace Microsoft.ML.Functional.Tests | ||
{ | ||
/// <summary>
/// Functional tests that exercise <see cref="SchemaDefinition"/> overrides when
/// creating prediction engines and when declaring the input/output shape of a
/// custom mapping.
/// </summary>
public class SchemaDefinitionTests : BaseTestClass
{
    // Created in Initialize() rather than the constructor, so it cannot be readonly.
    private MLContext _ml;

    public SchemaDefinitionTests(ITestOutputHelper output) : base(output)
    {
    }

    /// <summary>
    /// Runs the base initialization and then builds the <see cref="MLContext"/>
    /// shared by the tests in this class.
    /// </summary>
    protected override void Initialize()
    {
        base.Initialize();

        // Constant constructor argument keeps the context identical across runs.
        _ml = new MLContext(42);

        // NOTE(review): a reviewer asked whether this call is actually needed here;
        // confirm before removing it.
        _ml.AddStandardComponents();
    }

    /// <summary>
    /// Fits two pipelines that differ only in <c>maximumNumberOfKeys</c> (3 vs. 4),
    /// overrides the "Features" column type of the output schema definition with the
    /// type each fitted model reports, and checks that each prediction engine returns
    /// a "Features" array whose length matches the engine's output schema.
    /// </summary>
    [Fact]
    public void SchemaDefinitionForPredictionEngine()
    {
        var fileName = GetDataPath(TestDatasets.adult.trainFilename);
        var source = new MultiFileSource(fileName);
        var loader = _ml.Data.CreateTextLoader(new TextLoader.Options(), source);
        var data = loader.Load(source);

        var pipeline1 = _ml.Transforms.Categorical.OneHotEncoding("Cat", "Workclass", maximumNumberOfKeys: 3)
            .Append(_ml.Transforms.Concatenate("Features", "Cat", "NumericFeatures"));
        var model1 = pipeline1.Fit(data);

        var pipeline2 = _ml.Transforms.Categorical.OneHotEncoding("Cat", "Workclass", maximumNumberOfKeys: 4)
            .Append(_ml.Transforms.Concatenate("Features", "Cat", "NumericFeatures"));
        var model2 = pipeline2.Fit(data);

        // Each engine gets its own schema definition whose "Features" type is taken
        // from the corresponding fitted model.
        var outputSchemaDefinition1 = SchemaDefinition.Create(typeof(OutputData));
        outputSchemaDefinition1["Features"].ColumnType = model1.GetOutputSchema(data.Schema)["Features"].Type;
        var engine1 = _ml.Model.CreatePredictionEngine<InputData, OutputData>(model1, outputSchemaDefinition: outputSchemaDefinition1);

        var outputSchemaDefinition2 = SchemaDefinition.Create(typeof(OutputData));
        outputSchemaDefinition2["Features"].ColumnType = model2.GetOutputSchema(data.Schema)["Features"].Type;
        var engine2 = _ml.Model.CreatePredictionEngine<InputData, OutputData>(model2, outputSchemaDefinition: outputSchemaDefinition2);

        // The same example is scored by both engines.
        var example = new InputData() { Workclass = "Self-emp-not-inc", NumericFeatures = new float[6] };

        // Direct casts: a non-vector column type fails with a cast error naming the
        // types instead of a NullReferenceException from a failed `as`.
        var prediction = engine1.Predict(example);
        Assert.Equal(((VectorType)engine1.OutputSchema["Features"].Type).Size, prediction.Features.Length);
        Assert.True(prediction.Features.All(x => x == 0));

        prediction = engine2.Predict(example);
        Assert.Equal(((VectorType)engine2.OutputSchema["Features"].Type).Size, prediction.Features.Length);
        // Every slot must be zero, except that slot 3 is also allowed to be one.
        Assert.True(prediction.Features.Select((x, i) => (i == 3 && x == 1) || x == 0).All(b => b));
    }

    /// <summary>
    /// Appends a custom mapping whose input and output schema definitions override the
    /// "Features" column type, and checks the resulting "Features" vector size in the
    /// combined model's output schema.
    /// </summary>
    [Fact]
    public void SchemaDefinitionForCustomMapping()
    {
        var fileName = GetDataPath(TestDatasets.adult.trainFilename);
        var data = new MultiFileSource(fileName);
        var loader = _ml.Data.CreateTextLoader(new TextLoader.Options(), data);
        var dataView = loader.Load(data);

        var pipeline = _ml.Transforms.Categorical.OneHotEncoding("Categories")
            .Append(_ml.Transforms.Categorical.OneHotEncoding("Workclass"))
            .Append(_ml.Transforms.Concatenate("Features", "NumericFeatures", "Categories", "Workclass"))
            .Append(_ml.Transforms.FeatureSelection.SelectFeaturesBasedOnMutualInformation("Features"));
        var model = pipeline.Fit(dataView);
        var schema = model.GetOutputSchema(loader.GetOutputSchema());

        // Direct cast: fails with a descriptive cast error if "Features" is not a vector.
        var featuresType = (VectorType)schema["Features"].Type;

        // The mapping consumes the pipeline's "Features" type as-is...
        var inputSchemaDefinition = SchemaDefinition.Create(typeof(OutputData));
        inputSchemaDefinition["Features"].ColumnType = featuresType;

        // ...and produces a vector twice as long.
        var outputSchemaDefinition = SchemaDefinition.Create(typeof(OutputData));
        outputSchemaDefinition["Features"].ColumnType = new VectorType(NumberDataViewType.Single, featuresType.Size * 2);

        var custom = _ml.Transforms.CustomMapping(
            (OutputData src, OutputData dst) =>
            {
                // Interleave each input value with its natural logarithm.
                dst.Features = new float[src.Features.Length * 2];
                for (int i = 0; i < src.Features.Length; i++)
                {
                    dst.Features[2 * i] = src.Features[i];
                    dst.Features[2 * i + 1] = (float)Math.Log(src.Features[i]);
                }
            }, null, inputSchemaDefinition, outputSchemaDefinition);

        // Typed as ITransformer so that Append infers the interface as the new last
        // transformer type and the result stays assignable to `model`.
        ITransformer customTransformer = custom.Fit(model.Transform(dataView));
        model = model.Append(customTransformer);
        schema = model.GetOutputSchema(loader.GetOutputSchema());

        // The custom mapping doubles the vector, so 168 is twice the size of the
        // "Features" vector produced by the preceding pipeline.
        Assert.Equal(168, ((VectorType)schema["Features"].Type).Size);
    }

    /// <summary>
    /// Input example type for the prediction engines; the attributes give the
    /// text-file column positions each member is loaded from.
    /// </summary>
    private sealed class InputData
    {
        [LoadColumn(0)]
        public float Label { get; set; }

        [LoadColumn(1)]
        public string Workclass { get; set; }

        [LoadColumn(2, 8)]
        public string[] Categories { get; set; }

        [LoadColumn(9, 14)]
        [VectorType(6)]
        public float[] NumericFeatures { get; set; }
    }

    /// <summary>
    /// Output type for predictions, also used as both the source and destination type
    /// of the custom mapping. The "Features" column type is overridden per test
    /// through a <see cref="SchemaDefinition"/>.
    /// </summary>
    private sealed class OutputData
    {
        public float Label { get; set; }

        public float[] Features { get; set; }
    }
}
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
So, why is this private? I'm thinking about how I'd like to use it. I have my class, I create a new schema definition (but empty), then I populate the mapping. Do I have any other way to create an empty one of these guys?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It appears that not only is there no way to create an empty one, but there is no longer a way to add a new column to it, since the column constructor is now internal.
Are we sure there are no scenarios that need to do this?