Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Remove label requirement for PCA anomaly detector entry point. #221

Merged
merged 2 commits into from
May 24, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 0 additions & 14 deletions ZBaselines/Common/EntryPoints/core_manifest.json
Original file line number Diff line number Diff line change
Expand Up @@ -10585,18 +10585,6 @@
"IsNullable": false,
"Default": "Features"
},
{
"Name": "LabelColumn",
"Type": "String",
"Desc": "Column to use for labels",
"Aliases": [
"lab"
],
"Required": false,
"SortOrder": 3.0,
"IsNullable": false,
"Default": "Label"
},
{
"Name": "WeightColumn",
"Type": "String",
Expand Down Expand Up @@ -10727,8 +10715,6 @@
}
],
"InputKind": [
"ITrainerInputWithWeight",
"ITrainerInputWithLabel",
"ITrainerInput"
],
"OutputKind": [
Expand Down
10 changes: 6 additions & 4 deletions src/Microsoft.ML.PCA/PcaTrainer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ public sealed class RandomizedPcaTrainer : TrainerBase<RoleMappedData, PcaPredic
internal const string Summary = "This algorithm trains an approximate PCA using Randomized SVD algorithm. "
+ "This PCA can be made into Kernel PCA by using Random Fourier Features transform.";

public class Arguments : LearnerInputBaseWithWeight
public class Arguments : LearnerInputBase
{
[Argument(ArgumentType.AtMostOnce, HelpText = "The number of components in the PCA", ShortName = "k", SortOrder = 50)]
[TGUI(SuggestedSweeps = "10,20,40,80")]
Expand All @@ -62,11 +62,14 @@ public class Arguments : LearnerInputBaseWithWeight
public int Oversampling = 20;

[Argument(ArgumentType.AtMostOnce, HelpText = "If enabled, data is centered to be zero mean", ShortName = "center")]
[TlcModule.SweepableDiscreteParam("Center", null, isBool:true)]
[TlcModule.SweepableDiscreteParam("Center", null, isBool: true)]
public bool Center = true;

[Argument(ArgumentType.AtMostOnce, HelpText = "The seed for random number generation", ShortName = "seed")]
public int? Seed;

[Argument(ArgumentType.AtMostOnce, HelpText = "Column to use for example weight", ShortName = "weight", SortOrder = 4, Visibility = ArgumentAttribute.VisibilityType.EntryPointsOnly)]
public Optional<string> WeightColumn = Optional<string>.Implicit(DefaultColumnNames.Weight);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmmm. Actually this is interesting. I see you're adding this here, but K-means definitely supports weighted training, descends from this same abstract class, LearnerInputBase, and did not specify a weight column input. :( I'll file a separate issue for that.

}

private int _dimension;
Expand Down Expand Up @@ -294,8 +297,7 @@ public static CommonOutputs.AnomalyDetectionOutput TrainPcaAnomaly(IHostEnvironm

return LearnerEntryPointsUtils.Train<Arguments, CommonOutputs.AnomalyDetectionOutput>(host, input,
() => new RandomizedPcaTrainer(host, input),
() => LearnerEntryPointsUtils.FindColumn(host, input.TrainingData.Schema, input.LabelColumn),
() => LearnerEntryPointsUtils.FindColumn(host, input.TrainingData.Schema, input.WeightColumn));
getWeight: () => LearnerEntryPointsUtils.FindColumn(host, input.TrainingData.Schema, input.WeightColumn));
}
}

Expand Down
279 changes: 279 additions & 0 deletions src/Microsoft.ML/CSharpApi.cs
Original file line number Diff line number Diff line change
Expand Up @@ -550,6 +550,18 @@ public void Add(Microsoft.ML.Trainers.OrdinaryLeastSquaresRegressor input, Micro
_jsonNodes.Add(Serialize("Trainers.OrdinaryLeastSquaresRegressor", input, output));
}

/// <summary>
/// Convenience overload: creates a new <see cref="Microsoft.ML.Trainers.PcaAnomalyDetector.Output"/>,
/// forwards it together with <paramref name="input"/> to the two-argument <c>Add</c> overload,
/// and hands the created output back to the caller.
/// </summary>
/// <param name="input">The PCA anomaly detector input to add.</param>
/// <returns>The output object that was created and passed along with <paramref name="input"/>.</returns>
public Microsoft.ML.Trainers.PcaAnomalyDetector.Output Add(Microsoft.ML.Trainers.PcaAnomalyDetector input)
{
    var result = new Microsoft.ML.Trainers.PcaAnomalyDetector.Output();
    Add(input, result);
    return result;
}

/// <summary>
/// Serializes the given input/output pair under the name "Trainers.PcaAnomalyDetector"
/// and appends the serialized node to <c>_jsonNodes</c>.
/// </summary>
/// <param name="input">The PCA anomaly detector input.</param>
/// <param name="output">The output object paired with <paramref name="input"/>.</param>
public void Add(Microsoft.ML.Trainers.PcaAnomalyDetector input, Microsoft.ML.Trainers.PcaAnomalyDetector.Output output)
{
    var node = Serialize("Trainers.PcaAnomalyDetector", input, output);
    _jsonNodes.Add(node);
}

public Microsoft.ML.Trainers.PoissonRegressor.Output Add(Microsoft.ML.Trainers.PoissonRegressor input)
{
var output = new Microsoft.ML.Trainers.PoissonRegressor.Output();
Expand Down Expand Up @@ -1090,6 +1102,18 @@ public void Add(Microsoft.ML.Transforms.OptionalColumnCreator input, Microsoft.M
_jsonNodes.Add(Serialize("Transforms.OptionalColumnCreator", input, output));
}

/// <summary>
/// Convenience overload: creates a new <see cref="Microsoft.ML.Transforms.PcaCalculator.Output"/>,
/// forwards it together with <paramref name="input"/> to the two-argument <c>Add</c> overload,
/// and hands the created output back to the caller.
/// </summary>
/// <param name="input">The PCA calculator input to add.</param>
/// <returns>The output object that was created and passed along with <paramref name="input"/>.</returns>
public Microsoft.ML.Transforms.PcaCalculator.Output Add(Microsoft.ML.Transforms.PcaCalculator input)
{
    var result = new Microsoft.ML.Transforms.PcaCalculator.Output();
    Add(input, result);
    return result;
}

/// <summary>
/// Serializes the given input/output pair under the name "Transforms.PcaCalculator"
/// and appends the serialized node to <c>_jsonNodes</c>.
/// </summary>
/// <param name="input">The PCA calculator input.</param>
/// <param name="output">The output object paired with <paramref name="input"/>.</param>
public void Add(Microsoft.ML.Transforms.PcaCalculator input, Microsoft.ML.Transforms.PcaCalculator.Output output)
{
    var node = Serialize("Transforms.PcaCalculator", input, output);
    _jsonNodes.Add(node);
}

public Microsoft.ML.Transforms.PredictedLabelColumnOriginalValueConverter.Output Add(Microsoft.ML.Transforms.PredictedLabelColumnOriginalValueConverter input)
{
var output = new Microsoft.ML.Transforms.PredictedLabelColumnOriginalValueConverter.Output();
Expand Down Expand Up @@ -6739,6 +6763,97 @@ public OrdinaryLeastSquaresRegressorPipelineStep(Output output)
}
}

namespace Trainers
{

/// <summary>
/// Train a PCA Anomaly model.
/// </summary>
/// <remarks>
/// Holds the settings of the trainer and can take part in a learning pipeline:
/// <see cref="ApplyStep"/> takes the data of the preceding data step as <see cref="TrainingData"/>
/// and registers this instance with the supplied experiment.
/// </remarks>
public sealed partial class PcaAnomalyDetector : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem
{
    /// <summary>
    /// Number of PCA components. Defaults to 20.
    /// </summary>
    [TlcModule.SweepableDiscreteParamAttribute("Rank", new object[]{10, 20, 40, 80})]
    public int Rank { get; set; } = 20;

    /// <summary>
    /// Oversampling parameter used by the randomized PCA training. Defaults to 20.
    /// </summary>
    [TlcModule.SweepableDiscreteParamAttribute("Oversampling", new object[]{10, 20, 40})]
    public int Oversampling { get; set; } = 20;

    /// <summary>
    /// When enabled, the data is centered to zero mean. Enabled by default.
    /// </summary>
    [TlcModule.SweepableDiscreteParamAttribute("Center", new object[]{false, true})]
    public bool Center { get; set; } = true;

    /// <summary>
    /// Seed for random number generation; not set by default.
    /// </summary>
    public int? Seed { get; set; }

    /// <summary>
    /// Name of the column that carries the example weight.
    /// </summary>
    public Microsoft.ML.Runtime.EntryPoints.Optional<string> WeightColumn { get; set; }

    /// <summary>
    /// Data the trainer is trained on.
    /// </summary>
    public Var<Microsoft.ML.Runtime.Data.IDataView> TrainingData { get; set; } = new Var<Microsoft.ML.Runtime.Data.IDataView>();

    /// <summary>
    /// Name of the features column. Defaults to "Features".
    /// </summary>
    public string FeatureColumn { get; set; } = "Features";

    /// <summary>
    /// Normalization option applied to the feature column. Defaults to <c>Auto</c>.
    /// </summary>
    public Models.NormalizeOption NormalizeFeatures { get; set; } = Models.NormalizeOption.Auto;

    /// <summary>
    /// Controls whether the learner caches its input training data. Defaults to <c>Auto</c>.
    /// </summary>
    public Models.CachingOptions Caching { get; set; } = Models.CachingOptions.Auto;

    /// <summary>
    /// Output of the trainer: exposes the trained predictor model.
    /// </summary>
    public sealed class Output : Microsoft.ML.Runtime.EntryPoints.CommonOutputs.IAnomalyDetectionOutput, Microsoft.ML.Runtime.EntryPoints.CommonOutputs.ITrainerOutput
    {
        /// <summary>
        /// The trained model
        /// </summary>
        public Var<Microsoft.ML.Runtime.EntryPoints.IPredictorModel> PredictorModel { get; set; } = new Var<Microsoft.ML.Runtime.EntryPoints.IPredictorModel>();
    }

    /// <summary>
    /// Hooks this trainer up behind <paramref name="previousStep"/> and adds it to <paramref name="experiment"/>.
    /// </summary>
    /// <param name="previousStep">Preceding pipeline step; it must be an <see cref="ILearningPipelineDataStep"/>.</param>
    /// <param name="experiment">Experiment this trainer is added to.</param>
    /// <returns>A predictor step exposing the model variable of the newly created output.</returns>
    /// <exception cref="InvalidOperationException">
    /// <paramref name="previousStep"/> is not an <see cref="ILearningPipelineDataStep"/>.
    /// </exception>
    public ILearningPipelineStep ApplyStep(ILearningPipelineStep previousStep, Experiment experiment)
    {
        if (previousStep is ILearningPipelineDataStep dataStep)
        {
            // Train on whatever the preceding data step produced.
            TrainingData = dataStep.Data;
            Output trainerOutput = experiment.Add(this);
            return new PcaAnomalyDetectorPipelineStep(trainerOutput);
        }

        throw new InvalidOperationException($"{ nameof(PcaAnomalyDetector)} only supports an { nameof(ILearningPipelineDataStep)} as an input.");
    }

    /// <summary>
    /// Pipeline step returned by <see cref="ApplyStep"/>; carries the predictor model variable.
    /// </summary>
    private class PcaAnomalyDetectorPipelineStep : ILearningPipelinePredictorStep
    {
        public PcaAnomalyDetectorPipelineStep(Output output)
        {
            Model = output.PredictorModel;
        }

        /// <summary>
        /// Predictor model variable taken from the trainer output.
        /// </summary>
        public Var<IPredictorModel> Model { get; }
    }
}
}

namespace Trainers
{

Expand Down Expand Up @@ -11417,6 +11532,170 @@ public OptionalColumnCreatorPipelineStep(Output output)
}
}

namespace Transforms
{

/// <summary>
/// Per-column settings for the PCA transform: a source/destination column pair plus
/// optional (nullable) per-column values for the PCA parameters.
/// </summary>
public sealed partial class PcaTransformColumn : OneToOneColumn<PcaTransformColumn>, IOneToOneColumn
{
    /// <summary>
    /// Name of the weight column.
    /// </summary>
    public string WeightColumn { get; set; }

    /// <summary>
    /// Number of PCA components.
    /// </summary>
    public int? Rank { get; set; }

    /// <summary>
    /// Oversampling parameter used by the randomized PCA training.
    /// </summary>
    public int? Oversampling { get; set; }

    /// <summary>
    /// When enabled, the data is centered to zero mean.
    /// </summary>
    public bool? Center { get; set; }

    /// <summary>
    /// Seed for random number generation.
    /// </summary>
    public int? Seed { get; set; }

    /// <summary>
    /// Name of the new (output) column.
    /// </summary>
    public string Name { get; set; }

    /// <summary>
    /// Name of the source (input) column.
    /// </summary>
    public string Source { get; set; }
}

/// <summary>
/// PCA transform configured over one or more columns; produces a transformed dataset and a transform model.
/// </summary>
/// <remarks>
/// NOTE(review): the generated summary for this type previously read "Train an PCA Anomaly model.",
/// which looks copied from the anomaly-detector trainer — confirm the intended description upstream.
/// </remarks>
public sealed partial class PcaCalculator : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem
{
    /// <summary>
    /// Creates an instance without any column definitions.
    /// </summary>
    public PcaCalculator()
    {
    }

    /// <summary>
    /// Creates an instance with one column definition per entry of <paramref name="inputColumns"/>.
    /// </summary>
    /// <param name="inputColumns">Source column names; a null array adds nothing.</param>
    public PcaCalculator(params string[] inputColumns)
    {
        if (inputColumns == null)
        {
            return;
        }

        foreach (string columnName in inputColumns)
        {
            AddColumn(columnName);
        }
    }

    /// <summary>
    /// Creates an instance with one column definition per (input, output) pair.
    /// </summary>
    /// <param name="inputOutputColumns">
    /// Pairs whose first item is the source column and whose second item is the new column name;
    /// a null array adds nothing.
    /// </param>
    public PcaCalculator(params ValueTuple<string, string>[] inputOutputColumns)
    {
        if (inputOutputColumns == null)
        {
            return;
        }

        foreach (ValueTuple<string, string> pair in inputOutputColumns)
        {
            // AddColumn takes (name, source), i.e. the output name first.
            AddColumn(pair.Item2, pair.Item1);
        }
    }

    /// <summary>
    /// Appends a column definition created from <paramref name="source"/> alone to <see cref="Column"/>.
    /// </summary>
    /// <param name="source">Name of the source column.</param>
    public void AddColumn(string source)
    {
        var columns = new List<Transforms.PcaTransformColumn>();
        if (Column != null)
        {
            columns.AddRange(Column);
        }

        columns.Add(OneToOneColumn<Transforms.PcaTransformColumn>.Create(source));
        Column = columns.ToArray();
    }

    /// <summary>
    /// Appends a column definition created from <paramref name="name"/> and <paramref name="source"/> to <see cref="Column"/>.
    /// </summary>
    /// <param name="name">Name of the new column.</param>
    /// <param name="source">Name of the source column.</param>
    public void AddColumn(string name, string source)
    {
        var columns = new List<Transforms.PcaTransformColumn>();
        if (Column != null)
        {
            columns.AddRange(Column);
        }

        columns.Add(OneToOneColumn<Transforms.PcaTransformColumn>.Create(name, source));
        Column = columns.ToArray();
    }

    /// <summary>
    /// New column definition(s) (optional form: name:src)
    /// </summary>
    public Transforms.PcaTransformColumn[] Column { get; set; }

    /// <summary>
    /// Name of the weight column.
    /// </summary>
    public string WeightColumn { get; set; }

    /// <summary>
    /// Number of PCA components. Defaults to 20.
    /// </summary>
    public int Rank { get; set; } = 20;

    /// <summary>
    /// Oversampling parameter used by the randomized PCA training. Defaults to 20.
    /// </summary>
    public int Oversampling { get; set; } = 20;

    /// <summary>
    /// When enabled, the data is centered to zero mean. Enabled by default.
    /// </summary>
    public bool Center { get; set; } = true;

    /// <summary>
    /// Seed for random number generation.
    /// </summary>
    public int Seed { get; set; }

    /// <summary>
    /// Input dataset.
    /// </summary>
    public Var<Microsoft.ML.Runtime.Data.IDataView> Data { get; set; } = new Var<Microsoft.ML.Runtime.Data.IDataView>();

    /// <summary>
    /// Output of the transform: the transformed dataset and the transform model.
    /// </summary>
    public sealed class Output : Microsoft.ML.Runtime.EntryPoints.CommonOutputs.ITransformOutput
    {
        /// <summary>
        /// Transformed dataset
        /// </summary>
        public Var<Microsoft.ML.Runtime.Data.IDataView> OutputData { get; set; } = new Var<Microsoft.ML.Runtime.Data.IDataView>();

        /// <summary>
        /// Transform model
        /// </summary>
        public Var<Microsoft.ML.Runtime.EntryPoints.ITransformModel> Model { get; set; } = new Var<Microsoft.ML.Runtime.EntryPoints.ITransformModel>();
    }

    /// <summary>
    /// Hooks this transform up behind <paramref name="previousStep"/> and adds it to <paramref name="experiment"/>.
    /// </summary>
    /// <param name="previousStep">Preceding pipeline step; it must be an <see cref="ILearningPipelineDataStep"/>.</param>
    /// <param name="experiment">Experiment this transform is added to.</param>
    /// <returns>A data step exposing the output data and model variables of the newly created output.</returns>
    /// <exception cref="InvalidOperationException">
    /// <paramref name="previousStep"/> is not an <see cref="ILearningPipelineDataStep"/>.
    /// </exception>
    public ILearningPipelineStep ApplyStep(ILearningPipelineStep previousStep, Experiment experiment)
    {
        if (previousStep is ILearningPipelineDataStep dataStep)
        {
            // Consume whatever the preceding data step produced.
            Data = dataStep.Data;
            Output transformOutput = experiment.Add(this);
            return new PcaCalculatorPipelineStep(transformOutput);
        }

        throw new InvalidOperationException($"{ nameof(PcaCalculator)} only supports an { nameof(ILearningPipelineDataStep)} as an input.");
    }

    /// <summary>
    /// Pipeline step returned by <see cref="ApplyStep"/>; carries the output data and model variables.
    /// </summary>
    private class PcaCalculatorPipelineStep : ILearningPipelineDataStep
    {
        public PcaCalculatorPipelineStep(Output output)
        {
            Data = output.OutputData;
            Model = output.Model;
        }

        /// <summary>
        /// Transformed data variable taken from the transform output.
        /// </summary>
        public Var<IDataView> Data { get; }

        /// <summary>
        /// Transform model variable taken from the transform output.
        /// </summary>
        public Var<ITransformModel> Model { get; }
    }
}
}

namespace Transforms
{

Expand Down
2 changes: 1 addition & 1 deletion test/Microsoft.ML.Core.Tests/UnitTests/TestEntryPoints.cs
Original file line number Diff line number Diff line change
Expand Up @@ -1083,7 +1083,7 @@ public void EntryPointLogisticRegressionMultiClass()
// Runs the shared entry-point test routine against "Trainers.PcaAnomalyDetector"
// using the "MNIST.Train.0-class.tiny.txt" data file.
// NOTE(review): this view is a diff, so the two calls below appear to be the pre-change and
// post-change versions of the same line rather than two intended invocations — confirm.
[Fact]
public void EntryPointPcaAnomaly()
{
TestEntryPointRoutine("MNIST.Train.0-class.tiny.txt", "Trainers.PcaAnomalyDetector");
// NOTE(review): the third argument looks like a loader column spec (a "Features" column of
// type R4 over columns 1-784) — verify against TestEntryPointRoutine's signature.
TestEntryPointRoutine("MNIST.Train.0-class.tiny.txt", "Trainers.PcaAnomalyDetector", "col=Features:R4:1-784");
}

[Fact]
Expand Down
1 change: 1 addition & 0 deletions test/Microsoft.ML.Tests/Microsoft.ML.Tests.csproj
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
<Project Sdk="Microsoft.NET.Sdk">
<ItemGroup>
<ProjectReference Include="..\..\src\Microsoft.ML.PCA\Microsoft.ML.PCA.csproj" />
<ProjectReference Include="..\..\src\Microsoft.ML.PipelineInference\Microsoft.ML.PipelineInference.csproj" />
<ProjectReference Include="..\..\src\Microsoft.ML.StandardLearners\Microsoft.ML.StandardLearners.csproj" />
<ProjectReference Include="..\..\src\Microsoft.ML\Microsoft.ML.csproj" />
Expand Down