From 8dd53e13d24868e739a201a7f35939a0f82a02c5 Mon Sep 17 00:00:00 2001 From: Scott Inglis Date: Thu, 28 Feb 2019 17:42:14 -0800 Subject: [PATCH 1/4] API cleanup for LightGBM. The cleanup includes: - Changing all abbreviated parameters to full names (e.g. numThreads->NumberOfThreads) - Updating column parameters to carry the Name suffix when that is what they represent (LabelColumn->LabelColumnName). - Updating baseline files to reflect these changes, which are naming-only and should not cause any computational difference. This fixes #2618 --- .../Dynamic/Trainers/Ranking/LightGbm.cs | 6 +- .../Trainers/Ranking/LightGbmWithOptions.cs | 6 +- .../Dynamic/Trainers/Regression/LightGbm.cs | 4 +- .../Regression/LightGbmWithOptions.cs | 4 +- .../Static/LightGBMRegression.cs | 4 +- .../LightGbmStaticExtensions.cs | 65 ++++---- .../LightGbmArguments.cs | 82 +++++----- .../LightGbmBinaryTrainer.cs | 24 +-- src/Microsoft.ML.LightGBM/LightGbmCatalog.cs | 56 +++---- .../LightGbmMulticlassTrainer.cs | 36 ++--- .../LightGbmRankingTrainer.cs | 41 ++--- .../LightGbmRegressionTrainer.cs | 28 ++-- .../LightGbmTrainerBase.cs | 53 +++---- src/Microsoft.ML.LightGBM/doc.xml | 24 +-- .../Standard/Online/AveragedPerceptron.cs | 12 +- .../Common/EntryPoints/core_manifest.json | 146 +++++++++--------- ...LightGBMDart-CV-breast-cancer.dart-out.txt | 4 +- ...MDart-TrainTest-breast-cancer.dart-out.txt | 2 +- ...LightGBMGoss-CV-breast-cancer.goss-out.txt | 4 +- ...MGoss-TrainTest-breast-cancer.goss-out.txt | 2 +- .../LightGBM-TrainTest-breast-cancer-out.txt | 2 +- .../LightGBMMC/LightGBMMC-CV-iris.key-out.txt | 8 +- .../LightGBMMC-CV-iris.keyU404-out.txt | 8 +- .../LightGBMMC-TrainTest-iris.key-out.txt | 4 +- .../LightGBMMC-TrainTest-iris.keyU404-out.txt | 4 +- ...MReg-CV-generatedRegressionDataset-out.txt | 4 +- ...ainTest-generatedRegressionDataset-out.txt | 2 +- ...-CV-generatedRegressionDataset.MAE-out.txt | 4 +- ...est-generatedRegressionDataset.MAE-out.txt | 2 +- ...CV-generatedRegressionDataset.RMSE-out.txt | 4 +- ...st-generatedRegressionDataset.RMSE-out.txt | 2 +- .../TestPredictors.cs | 4 +- .../Training.cs | 4 +- test/Microsoft.ML.Tests/OnnxConversionTest.cs | 2 +- .../TensorflowTests.cs | 4 +- .../TrainerEstimators/TreeEstimators.cs | 18 +-- 36 files changed, 342 insertions(+), 337 deletions(-) diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Ranking/LightGbm.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Ranking/LightGbm.cs index c3bd9d604e..576b98cbee 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Ranking/LightGbm.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Ranking/LightGbm.cs @@ -20,10 +20,10 @@ public static void Example() // Create the Estimator pipeline. For simplicity, we will train a small tree with 4 leaves and 2 boosting iterations. var pipeline = mlContext.Ranking.Trainers.LightGbm( - numLeaves: 4, - minDataPerLeaf: 10, + leafCount: 4, + minimumDataPerLeaf: 10, learningRate: 0.1, - numBoostRound: 2); + numberOfIterations: 2); // Fit this Pipeline to the Training Data. 
var model = pipeline.Fit(split.TrainSet); diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Ranking/LightGbmWithOptions.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Ranking/LightGbmWithOptions.cs index dc898fb4d3..235d30e078 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Ranking/LightGbmWithOptions.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Ranking/LightGbmWithOptions.cs @@ -23,10 +23,10 @@ public static void Example() var pipeline = mlContext.Ranking.Trainers.LightGbm( new Options { - NumLeaves = 4, - MinDataPerLeaf = 10, + NumberOfLeaves = 4, + MinimumDataPerLeaf = 10, LearningRate = 0.1, - NumBoostRound = 2, + NumberOfIterations = 2, Booster = new TreeBooster.Options { FeatureFraction = 0.9 diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/LightGbm.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/LightGbm.cs index ce9e27a0fc..5f43a78c25 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/LightGbm.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/LightGbm.cs @@ -35,8 +35,8 @@ public static void Example() var pipeline = mlContext.Transforms.Concatenate("Features", featureNames) .Append(mlContext.Regression.Trainers.LightGbm( labelColumnName: labelName, - numLeaves: 4, - minDataPerLeaf: 6, + leafCount: 4, + minimumDataPerLeaf: 6, learningRate: 0.001)); // Fit this pipeline to the training data. diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/LightGbmWithOptions.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/LightGbmWithOptions.cs index e93eeb3f96..c2255554fb 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/LightGbmWithOptions.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/LightGbmWithOptions.cs @@ -39,8 +39,8 @@ public static void Example() .Append(mlContext.Regression.Trainers.LightGbm(new Options { LabelColumnName = labelName, - NumLeaves = 4, - MinDataPerLeaf = 6, + NumberOfLeaves = 4, + MinimumDataPerLeaf = 6, LearningRate = 0.001, Booster = new GossBooster.Options { diff --git a/docs/samples/Microsoft.ML.Samples/Static/LightGBMRegression.cs b/docs/samples/Microsoft.ML.Samples/Static/LightGBMRegression.cs index 61225fe1e9..aa2f2e65fc 100644 --- a/docs/samples/Microsoft.ML.Samples/Static/LightGBMRegression.cs +++ b/docs/samples/Microsoft.ML.Samples/Static/LightGBMRegression.cs @@ -38,8 +38,8 @@ public static void LightGbmRegression() .Append(r => (r.label, score: mlContext.Regression.Trainers.LightGbm( r.label, r.features, - numLeaves: 4, - minDataPerLeaf: 6, + numberOfLeaves: 4, + minimumDataPerLeaf: 6, learningRate: 0.001, onFit: p => pred = p) ) diff --git a/src/Microsoft.ML.LightGBM.StaticPipe/LightGbmStaticExtensions.cs b/src/Microsoft.ML.LightGBM.StaticPipe/LightGbmStaticExtensions.cs index 581bbc0b59..9fc5c33c95 100644 --- a/src/Microsoft.ML.LightGBM.StaticPipe/LightGbmStaticExtensions.cs +++ b/src/Microsoft.ML.LightGBM.StaticPipe/LightGbmStaticExtensions.cs @@ -21,10 +21,10 @@ public static class LightGbmStaticExtensions /// The label column. /// The features column. /// The weights column. - /// The number of leaves to use. - /// Number of iterations. - /// The minimal number of documents allowed in a leaf of the tree, out of the subsampled data. + /// The number of leaves to use. + /// The minimal number of documents allowed in a leaf of the tree, out of the subsampled data. /// The learning rate. + /// Number of iterations. 
/// A delegate that is called every time the /// method is called on the /// instance created out of this. This delegate will receive @@ -39,19 +39,19 @@ public static class LightGbmStaticExtensions /// public static Scalar LightGbm(this RegressionCatalog.RegressionTrainers catalog, Scalar label, Vector features, Scalar weights = null, - int? numLeaves = null, - int? minDataPerLeaf = null, + int? numberOfLeaves = null, + int? minimumDataPerLeaf = null, double? learningRate = null, - int numBoostRound = Options.Defaults.NumBoostRound, + int numberOfIterations = Options.Defaults.NumberOfIterations, Action onFit = null) { - CheckUserValues(label, features, weights, numLeaves, minDataPerLeaf, learningRate, numBoostRound, onFit); + CheckUserValues(label, features, weights, numberOfLeaves, minimumDataPerLeaf, learningRate, numberOfIterations, onFit); var rec = new TrainerEstimatorReconciler.Regression( (env, labelName, featuresName, weightsName) => { - var trainer = new LightGbmRegressorTrainer(env, labelName, featuresName, weightsName, numLeaves, - minDataPerLeaf, learningRate, numBoostRound); + var trainer = new LightGbmRegressorTrainer(env, labelName, featuresName, weightsName, numberOfLeaves, + minimumDataPerLeaf, learningRate, numberOfIterations); if (onFit != null) return trainer.WithOnFitDelegate(trans => onFit(trans.Model)); return trainer; @@ -122,11 +122,13 @@ public static Scalar LightGbm(this RegressionCatalog.RegressionTrainers c /// ]]> /// public static (Scalar score, Scalar probability, Scalar predictedLabel) LightGbm(this BinaryClassificationCatalog.BinaryClassificationTrainers catalog, - Scalar label, Vector features, Scalar weights = null, + Scalar label, + Vector features, + Scalar weights = null, int? numLeaves = null, int? minDataPerLeaf = null, double? learningRate = null, - int numBoostRound = Options.Defaults.NumBoostRound, + int numBoostRound = Options.Defaults.NumberOfIterations, Action> onFit = null) { CheckUserValues(label, features, weights, numLeaves, minDataPerLeaf, learningRate, numBoostRound, onFit); @@ -194,9 +196,9 @@ public static (Scalar score, Scalar probability, Scalar pred /// The features column. /// The groupId column. /// The weights column. - /// The number of leaves to use. - /// Number of iterations. - /// The minimal number of documents allowed in a leaf of the tree, out of the subsampled data. + /// The number of leaves to use. + /// Number of iterations. + /// The minimal number of documents allowed in a leaf of the tree, out of the subsampled data. /// The learning rate. /// A delegate that is called every time the /// method is called on the @@ -206,21 +208,24 @@ public static (Scalar score, Scalar probability, Scalar pred /// The set of output columns including in order the predicted binary classification score (which will range /// from negative to positive infinity), the calibrated prediction (from 0 to 1), and the predicted label. public static Scalar LightGbm(this RankingCatalog.RankingTrainers catalog, - Scalar label, Vector features, Key groupId, Scalar weights = null, - int? numLeaves = null, - int? minDataPerLeaf = null, + Scalar label, + Vector features, + Key groupId, + Scalar weights = null, + int? numberOfLeaves = null, + int? minimumDataPerLeaf = null, double? 
learningRate = null, - int numBoostRound = Options.Defaults.NumBoostRound, + int numberOfIterations = Options.Defaults.NumberOfIterations, Action onFit = null) { - CheckUserValues(label, features, weights, numLeaves, minDataPerLeaf, learningRate, numBoostRound, onFit); + CheckUserValues(label, features, weights, numberOfLeaves, minimumDataPerLeaf, learningRate, numberOfIterations, onFit); Contracts.CheckValue(groupId, nameof(groupId)); var rec = new TrainerEstimatorReconciler.Ranker( (env, labelName, featuresName, groupIdName, weightsName) => { - var trainer = new LightGbmRankingTrainer(env, labelName, featuresName, groupIdName, weightsName, numLeaves, - minDataPerLeaf, learningRate, numBoostRound); + var trainer = new LightGbmRankingTrainer(env, labelName, featuresName, groupIdName, weightsName, numberOfLeaves, + minimumDataPerLeaf, learningRate, numberOfIterations); if (onFit != null) return trainer.WithOnFitDelegate(trans => onFit(trans.Model)); @@ -279,10 +284,10 @@ public static Scalar LightGbm(this RankingCatalog.RankingTrainers c /// The label, or dependent variable. /// The features, or independent variables. /// The weights column. - /// The number of leaves to use. - /// Number of iterations. - /// The minimal number of documents allowed in a leaf of the tree, out of the subsampled data. + /// The number of leaves to use. + /// The minimal number of documents allowed in a leaf of the tree, out of the subsampled data. /// The learning rate. + /// Number of iterations. /// A delegate that is called every time the /// method is called on the /// instance created out of this. This delegate will receive @@ -301,19 +306,19 @@ public static (Vector score, Key predictedLabel) Key label, Vector features, Scalar weights = null, - int? numLeaves = null, - int? minDataPerLeaf = null, + int? numberOfLeaves = null, + int? minimumDataPerLeaf = null, double? 
learningRate = null, - int numBoostRound = Options.Defaults.NumBoostRound, + int numberOfIterations = Options.Defaults.NumberOfIterations, Action onFit = null) { - CheckUserValues(label, features, weights, numLeaves, minDataPerLeaf, learningRate, numBoostRound, onFit); + CheckUserValues(label, features, weights, numberOfLeaves, minimumDataPerLeaf, learningRate, numberOfIterations, onFit); var rec = new TrainerEstimatorReconciler.MulticlassClassifier( (env, labelName, featuresName, weightsName) => { - var trainer = new LightGbmMulticlassTrainer(env, labelName, featuresName, weightsName, numLeaves, - minDataPerLeaf, learningRate, numBoostRound); + var trainer = new LightGbmMulticlassTrainer(env, labelName, featuresName, weightsName, numberOfLeaves, + minimumDataPerLeaf, learningRate, numberOfIterations); if (onFit != null) return trainer.WithOnFitDelegate(trans => onFit(trans.Model)); diff --git a/src/Microsoft.ML.LightGBM/LightGbmArguments.cs b/src/Microsoft.ML.LightGBM/LightGbmArguments.cs index 0b635bfaed..047c19b88d 100644 --- a/src/Microsoft.ML.LightGBM/LightGbmArguments.cs +++ b/src/Microsoft.ML.LightGBM/LightGbmArguments.cs @@ -66,14 +66,14 @@ internal virtual void UpdateParameters(Dictionary res) if (attribute == null) continue; - res[GetArgName(field.Name)] = field.GetValue(BoosterParameterOptions); + res[GetOptionName(field.Name)] = field.GetValue(BoosterParameterOptions); } } void IBoosterParameter.UpdateParameters(Dictionary res) => UpdateParameters(res); } - private static string GetArgName(string name) + private static string GetOptionName(string name) { StringBuilder strBuf = new StringBuilder(); bool first = true; @@ -96,7 +96,7 @@ private static string GetArgName(string name) [BestFriend] internal static class Defaults { - public const int NumBoostRound = 100; + public const int NumberOfIterations = 100; } public sealed class TreeBooster : BoosterParameter @@ -107,7 +107,7 @@ public sealed class TreeBooster : BoosterParameter [TlcModule.Component(Name = Name, FriendlyName = FriendlyName, Desc = "Traditional Gradient Boosting Decision Tree.")] public class Options : ISupportBoosterParameterFactory { - [Argument(ArgumentType.AtMostOnce, HelpText = "Use for binary classification when classes are not balanced.", ShortName = "us")] + [Argument(ArgumentType.AtMostOnce, HelpText = "Use for binary classification when training data is not balanced.", ShortName = "us")] public bool UnbalancedSets = false; [Argument(ArgumentType.AtMostOnce, @@ -129,7 +129,7 @@ public class Options : ISupportBoosterParameterFactory public double MinChildWeight = 0.1; [Argument(ArgumentType.AtMostOnce, - HelpText = "Subsample frequency. 0 means no subsample. " + HelpText = "Subsample frequency for bagging. 0 means no subsample. " + "If subsampleFreq > 0, it will use a subset(ratio=subsample) to train. 
And the subset will be updated on every Subsample iterations.")] [TlcModule.Range(Min = 0, Max = int.MaxValue)] public int SubsampleFreq = 0; @@ -179,7 +179,9 @@ internal TreeBooster(Options options) Contracts.CheckUserArg(BoosterParameterOptions.MinChildWeight >= 0, nameof(BoosterParameterOptions.MinChildWeight), "must be >= 0."); Contracts.CheckUserArg(BoosterParameterOptions.Subsample > 0 && BoosterParameterOptions.Subsample <= 1, nameof(BoosterParameterOptions.Subsample), "must be in (0,1]."); Contracts.CheckUserArg(BoosterParameterOptions.FeatureFraction > 0 && BoosterParameterOptions.FeatureFraction <= 1, nameof(BoosterParameterOptions.FeatureFraction), "must be in (0,1]."); - Contracts.CheckUserArg(BoosterParameterOptions.ScalePosWeight > 0 && BoosterParameterOptions.ScalePosWeight <= 1, nameof(BoosterParameterOptions.ScalePosWeight), "must be in (0,1]."); + Contracts.CheckUserArg(BoosterParameterOptions.RegLambda >= 0, nameof(BoosterParameterOptions.RegLambda), "must be >= 0."); + Contracts.CheckUserArg(BoosterParameterOptions.RegAlpha >= 0, nameof(BoosterParameterOptions.RegAlpha), "must be >= 0."); + Contracts.CheckUserArg(BoosterParameterOptions.ScalePosWeight > 0, nameof(BoosterParameterOptions.ScalePosWeight), "must be > 0."); } internal override void UpdateParameters(Dictionary res) @@ -197,15 +199,15 @@ public sealed class DartBooster : BoosterParameter [TlcModule.Component(Name = Name, FriendlyName = FriendlyName, Desc = "Dropouts meet Multiple Additive Regression Trees. See https://arxiv.org/abs/1505.01866")] public sealed class Options : TreeBooster.Options { - [Argument(ArgumentType.AtMostOnce, HelpText = "Drop ratio for trees. Range:(0,1).")] + [Argument(ArgumentType.AtMostOnce, HelpText = "The drop ratio for trees. Range:(0,1).")] [TlcModule.Range(Inf = 0.0, Max = 1.0)] public double DropRate = 0.1; - [Argument(ArgumentType.AtMostOnce, HelpText = "Max number of dropped tree in a boosting round.")] + [Argument(ArgumentType.AtMostOnce, HelpText = "Maximum number of dropped trees in a boosting round.")] [TlcModule.Range(Inf = 0, Max = int.MaxValue)] public int MaxDrop = 1; - [Argument(ArgumentType.AtMostOnce, HelpText = "Probability for not perform dropping in a boosting round.")] + [Argument(ArgumentType.AtMostOnce, HelpText = "Probability for not dropping in a boosting round.")] [TlcModule.Range(Inf = 0.0, Max = 1.0)] public double SkipDrop = 0.5; @@ -222,7 +224,6 @@ internal DartBooster(Options options) : base(options) { Contracts.CheckUserArg(BoosterParameterOptions.DropRate > 0 && BoosterParameterOptions.DropRate < 1, nameof(BoosterParameterOptions.DropRate), "must be in (0,1)."); - Contracts.CheckUserArg(BoosterParameterOptions.MaxDrop > 0, nameof(BoosterParameterOptions.MaxDrop), "must be > 0."); Contracts.CheckUserArg(BoosterParameterOptions.SkipDrop >= 0 && BoosterParameterOptions.SkipDrop < 1, nameof(BoosterParameterOptions.SkipDrop), "must be in [0,1)."); } @@ -241,14 +242,11 @@ public sealed class GossBooster : BoosterParameter [TlcModule.Component(Name = Name, FriendlyName = FriendlyName, Desc = "Gradient-based One-Side Sampling.")] public sealed class Options : TreeBooster.Options { - [Argument(ArgumentType.AtMostOnce, - HelpText = "Retain ratio for large gradient instances.")] + [Argument(ArgumentType.AtMostOnce, HelpText = "Retain ratio for large gradient instances.")] [TlcModule.Range(Inf = 0.0, Max = 1.0)] public double TopRate = 0.2; - [Argument(ArgumentType.AtMostOnce, - HelpText = - "Retain ratio for small gradient instances.")] + 
[Argument(ArgumentType.AtMostOnce, HelpText = "Retain ratio for small gradient instances.")] [TlcModule.Range(Inf = 0.0, Max = 1.0)] public double OtherRate = 0.1; @@ -287,7 +285,7 @@ public enum EvalMetricType [Argument(ArgumentType.AtMostOnce, HelpText = "Number of iterations.", SortOrder = 1, ShortName = "iter")] [TGUI(Label = "Number of boosting iterations", SuggestedSweeps = "10,20,50,100,150,200")] [TlcModule.SweepableDiscreteParam("NumBoostRound", new object[] { 10, 20, 50, 100, 150, 200 })] - public int NumBoostRound = Defaults.NumBoostRound; + public int NumberOfIterations = Defaults.NumberOfIterations; [Argument(ArgumentType.AtMostOnce, HelpText = "Shrinkage rate for trees, used to prevent over-fitting. Range: (0,1].", @@ -300,37 +298,37 @@ public enum EvalMetricType SortOrder = 2, ShortName = "nl", NullName = "")] [TGUI(Description = "The maximum number of leaves per tree", SuggestedSweeps = "2-128;log;inc:4")] [TlcModule.SweepableLongParamAttribute("NumLeaves", 2, 128, isLogScale: true, stepSize: 4)] - public int? NumLeaves; + public int? NumberOfLeaves; [Argument(ArgumentType.AtMostOnce, HelpText = "Minimum number of instances needed in a child.", SortOrder = 2, ShortName = "mil", NullName = "")] [TGUI(Label = "Min Documents In Leaves", SuggestedSweeps = "1,10,20,50 ")] [TlcModule.SweepableDiscreteParamAttribute("MinDataPerLeaf", new object[] { 1, 10, 20, 50 })] - public int? MinDataPerLeaf; + public int? MinimumDataPerLeaf; - [Argument(ArgumentType.AtMostOnce, HelpText = "Max number of bucket bin for features.", ShortName = "mb")] - public int MaxBin = 255; + [Argument(ArgumentType.AtMostOnce, HelpText = "Maximum number of bucket bins for features.", ShortName = "mb")] + public int MaximumBin = 255; [Argument(ArgumentType.Multiple, HelpText = "Which booster to use, can be gbtree, gblinear or dart. gbtree and dart use tree based model while gblinear uses linear function.", SortOrder = 3)] public ISupportBoosterParameterFactory Booster = new TreeBooster.Options(); [Argument(ArgumentType.AtMostOnce, HelpText = "Verbose", ShortName = "v")] - public bool VerboseEval = false; + public bool Verbose = false; [Argument(ArgumentType.AtMostOnce, HelpText = "Printing running messages.")] public bool Silent = true; [Argument(ArgumentType.AtMostOnce, HelpText = "Number of parallel threads used to run LightGBM.", ShortName = "nt")] - public int? NThread; + public int? NumberOfThreads; [Argument(ArgumentType.AtMostOnce, HelpText = "Evaluation metrics.", ShortName = "em")] - public EvalMetricType EvalMetric = EvalMetricType.DefaultMetric; + public EvalMetricType EvaluationMetric = EvalMetricType.DefaultMetric; [Argument(ArgumentType.AtMostOnce, HelpText = "Use softmax loss for the multi classification.")] [TlcModule.SweepableDiscreteParam("UseSoftmax", new object[] { true, false })] - public bool? UseSoftmax; + public bool? UseSoftMaximum; [Argument(ArgumentType.AtMostOnce, HelpText = "Rounds of early stopping, 0 will disable it.", ShortName = "es")] @@ -350,31 +348,31 @@ public enum EvalMetricType [Argument(ArgumentType.AtMostOnce, HelpText = "Enable categorical split or not.", ShortName = "cat")] [TlcModule.SweepableDiscreteParam("UseCat", new object[] { true, false })] - public bool? UseCat; + public bool? 
UseCategoricalSplit; - [Argument(ArgumentType.AtMostOnce, HelpText = "Enable missing value auto infer or not.")] + [Argument(ArgumentType.AtMostOnce, HelpText = "Enable special handling of missing values or not.")] [TlcModule.SweepableDiscreteParam("UseMissing", new object[] { true, false })] public bool UseMissing = false; - [Argument(ArgumentType.AtMostOnce, HelpText = "Min number of instances per categorical group.", ShortName = "mdpg")] + [Argument(ArgumentType.AtMostOnce, HelpText = "Minimum number of instances per categorical group.", ShortName = "mdpg")] [TlcModule.Range(Inf = 0, Max = int.MaxValue)] [TlcModule.SweepableDiscreteParam("MinDataPerGroup", new object[] { 10, 50, 100, 200 })] - public int MinDataPerGroup = 100; + public int MinimumDataPerGroup = 100; [Argument(ArgumentType.AtMostOnce, HelpText = "Max number of categorical thresholds.", ShortName = "maxcat")] [TlcModule.Range(Inf = 0, Max = int.MaxValue)] [TlcModule.SweepableDiscreteParam("MaxCatThreshold", new object[] { 8, 16, 32, 64 })] - public int MaxCatThreshold = 32; + public int MaximumCategoricalThreshold = 32; [Argument(ArgumentType.AtMostOnce, HelpText = "Laplace smooth term in categorical feature split. Avoid the bias of small categories.")] [TlcModule.Range(Min = 0.0)] [TlcModule.SweepableDiscreteParam("CatSmooth", new object[] { 1, 10, 20 })] - public double CatSmooth = 10; + public double CategoricalSmoothing = 10; [Argument(ArgumentType.AtMostOnce, HelpText = "L2 Regularization for categorical split.")] [TlcModule.Range(Min = 0.0)] [TlcModule.SweepableDiscreteParam("CatL2", new object[] { 0.1, 0.5, 1, 5, 10 })] - public double CatL2 = 10; + public double L2Categorical = 10; [Argument(ArgumentType.AtMostOnce, HelpText = "Sets the random seed for LightGBM to use.")] public int? Seed; @@ -385,23 +383,23 @@ public enum EvalMetricType internal Dictionary ToDictionary(IHost host) { Contracts.CheckValue(host, nameof(host)); - Contracts.CheckUserArg(MaxBin > 0, nameof(MaxBin), "must be > 0."); + Contracts.CheckUserArg(MaximumBin > 0, nameof(MaximumBin), "must be > 0."); Contracts.CheckUserArg(Sigmoid > 0, nameof(Sigmoid), "must be > 0."); Dictionary res = new Dictionary(); var boosterParams = Booster.CreateComponent(host); boosterParams.UpdateParameters(res); - res[GetArgName(nameof(MaxBin))] = MaxBin; + res["max_bin"] = MaximumBin; res["verbose"] = Silent ? "-1" : "1"; - if (NThread.HasValue) - res["nthread"] = NThread.Value; + if (NumberOfThreads.HasValue) + res["nthread"] = NumberOfThreads.Value; res["seed"] = (Seed.HasValue) ? 
Seed : host.Rand.Next(); string metric = null; - switch (EvalMetric) + switch (EvaluationMetric) { case EvalMetricType.DefaultMetric: break; @@ -424,18 +422,18 @@ internal Dictionary ToDictionary(IHost host) case EvalMetricType.Auc: case EvalMetricType.Ndcg: case EvalMetricType.Map: - metric = EvalMetric.ToString().ToLower(); + metric = EvaluationMetric.ToString().ToLower(); break; } if (!string.IsNullOrEmpty(metric)) res["metric"] = metric; res["sigmoid"] = Sigmoid; res["label_gain"] = CustomGains; - res[GetArgName(nameof(UseMissing))] = UseMissing; - res[GetArgName(nameof(MinDataPerGroup))] = MinDataPerGroup; - res[GetArgName(nameof(MaxCatThreshold))] = MaxCatThreshold; - res[GetArgName(nameof(CatSmooth))] = CatSmooth; - res[GetArgName(nameof(CatL2))] = CatL2; + res["use_missing"] = UseMissing; + res["min_data_per_group"] = MinimumDataPerGroup; + res["max_cat_threshold"] = MaximumCategoricalThreshold; + res["cat_smooth"] = CategoricalSmoothing; + res["cat_l2"] = L2Categorical; return res; } } diff --git a/src/Microsoft.ML.LightGBM/LightGbmBinaryTrainer.cs b/src/Microsoft.ML.LightGBM/LightGbmBinaryTrainer.cs index 5df67201f8..21693a6882 100644 --- a/src/Microsoft.ML.LightGBM/LightGbmBinaryTrainer.cs +++ b/src/Microsoft.ML.LightGBM/LightGbmBinaryTrainer.cs @@ -102,22 +102,22 @@ internal LightGbmBinaryTrainer(IHostEnvironment env, Options options) /// Initializes a new instance of /// /// The private instance of . - /// The name of The label column. - /// The name of the feature column. + /// The name of the label column. + /// The name of the feature column. /// The name for the column containing the initial weight. - /// The number of leaves to use. - /// Number of iterations. - /// The minimal number of documents allowed in a leaf of the tree, out of the subsampled data. + /// The number of leaves to use. + /// The minimal number of documents allowed in a leaf of the tree, out of the subsampled data. /// The learning rate. + /// Number of iterations. internal LightGbmBinaryTrainer(IHostEnvironment env, - string labelColumn = DefaultColumnNames.Label, - string featureColumn = DefaultColumnNames.Features, + string labelColumnName = DefaultColumnNames.Label, + string featureColumnName = DefaultColumnNames.Features, string weights = null, - int? numLeaves = null, - int? minDataPerLeaf = null, + int? leafCount = null, + int? minimumDataPerLeaf = null, double? 
learningRate = null, - int numBoostRound = LightGBM.Options.Defaults.NumBoostRound) - : base(env, LoadNameValue, TrainerUtils.MakeBoolScalarLabel(labelColumn), featureColumn, weights, null, numLeaves, minDataPerLeaf, learningRate, numBoostRound) + int numberOfIterations = LightGBM.Options.Defaults.NumberOfIterations) + : base(env, LoadNameValue, TrainerUtils.MakeBoolScalarLabel(labelColumnName), featureColumnName, weights, null, leafCount, minimumDataPerLeaf, learningRate, numberOfIterations) { } @@ -138,7 +138,7 @@ private protected override void CheckDataValid(IChannel ch, RoleMappedData data) if (!(labelType is BooleanDataViewType || labelType is KeyType || labelType == NumberDataViewType.Single)) { throw ch.ExceptParam(nameof(data), - $"Label column '{data.Schema.Label.Value.Name}' is of type '{labelType}', but must be key, boolean or R4."); + $"Label column '{data.Schema.Label.Value.Name}' is of type '{labelType.RawType}', but must be unsigned int, boolean or float."); } } diff --git a/src/Microsoft.ML.LightGBM/LightGbmCatalog.cs b/src/Microsoft.ML.LightGBM/LightGbmCatalog.cs index 3e38fa248a..03988ad45f 100644 --- a/src/Microsoft.ML.LightGBM/LightGbmCatalog.cs +++ b/src/Microsoft.ML.LightGBM/LightGbmCatalog.cs @@ -20,10 +20,10 @@ public static class LightGbmExtensions /// The name of the label column. /// The name of the feature column. /// The name of the example weight column (optional). - /// The number of leaves to use. - /// Number of iterations. - /// The minimal number of documents allowed in a leaf of the tree, out of the subsampled data. + /// The number of leaves to use. + /// The minimal number of documents allowed in a leaf of the tree, out of the subsampled data. /// The learning rate. + /// The number of iterations to use. /// /// /// @@ -72,10 +72,10 @@ public static LightGbmRegressorTrainer LightGbm(this RegressionCatalog.Regressio /// The name of the label column. /// The name of the feature column. /// The name of the example weight column (optional). - /// The number of leaves to use. - /// Number of iterations. - /// The minimal number of documents allowed in a leaf of the tree, out of the subsampled data. + /// The number of leaves to use. + /// The minimal number of documents allowed in a leaf of the tree, out of the subsampled data. /// The learning rate. + /// The number of iterations to use. /// /// /// @@ -125,23 +125,23 @@ public static LightGbmBinaryTrainer LightGbm(this BinaryClassificationCatalog.Bi /// The name of the feature column. /// The name of the group column. /// The name of the example weight column (optional). - /// The number of leaves to use. - /// Number of iterations. - /// The minimal number of documents allowed in a leaf of the tree, out of the subsampled data. + /// The number of leaves to use. + /// The minimal number of documents allowed in a leaf of the tree, out of the subsampled data. /// The learning rate. + /// The number of iterations to use. public static LightGbmRankingTrainer LightGbm(this RankingCatalog.RankingTrainers catalog, string labelColumnName = DefaultColumnNames.Label, string featureColumnName = DefaultColumnNames.Features, string rowGroupColumnName = DefaultColumnNames.GroupId, string exampleWeightColumnName = null, - int? numLeaves = null, - int? minDataPerLeaf = null, + int? leafCount = null, + int? minimumDataPerLeaf = null, double? 
learningRate = null, - int numBoostRound = Options.Defaults.NumBoostRound) + int numberOfIterations = Options.Defaults.NumberOfIterations) { Contracts.CheckValue(catalog, nameof(catalog)); var env = CatalogUtils.GetEnvironment(catalog); - return new LightGbmRankingTrainer(env, labelColumnName, featureColumnName, rowGroupColumnName, exampleWeightColumnName, numLeaves, minDataPerLeaf, learningRate, numBoostRound); + return new LightGbmRankingTrainer(env, labelColumnName, featureColumnName, rowGroupColumnName, exampleWeightColumnName, leafCount, minimumDataPerLeaf, learningRate, numberOfIterations); } /// @@ -164,10 +164,10 @@ public static LightGbmRankingTrainer LightGbm(this RankingCatalog.RankingTrainer /// The name of the label column. /// The name of the feature column. /// The name of the example weight column (optional). - /// The number of leaves to use. - /// Number of iterations. - /// The minimal number of documents allowed in a leaf of the tree, out of the subsampled data. + /// The number of leaves to use. + /// The minimal number of documents allowed in a leaf of the tree, out of the subsampled data. /// The learning rate. + /// The number of iterations to use. /// /// /// diff --git a/src/Microsoft.ML.LightGBM/LightGbmMulticlassTrainer.cs b/src/Microsoft.ML.LightGBM/LightGbmMulticlassTrainer.cs index 67e709c5c6..4ddfd845e4 100644 --- a/src/Microsoft.ML.LightGBM/LightGbmMulticlassTrainer.cs +++ b/src/Microsoft.ML.LightGBM/LightGbmMulticlassTrainer.cs @@ -43,22 +43,22 @@ internal LightGbmMulticlassTrainer(IHostEnvironment env, Options options) /// Initializes a new instance of /// /// The private instance of . - /// The name of The label column. - /// The name of the feature column. + /// The name of the label column. + /// The name of the feature column. /// The name for the column containing the initial weight. /// The number of leaves to use. - /// Number of iterations. /// The minimal number of documents allowed in a leaf of the tree, out of the subsampled data. /// The learning rate. + /// The number of iterations to use. internal LightGbmMulticlassTrainer(IHostEnvironment env, - string labelColumn = DefaultColumnNames.Label, - string featureColumn = DefaultColumnNames.Features, + string labelColumnName = DefaultColumnNames.Label, + string featureColumnName = DefaultColumnNames.Features, string weights = null, int? numLeaves = null, int? minDataPerLeaf = null, double? 
learningRate = null, - int numBoostRound = LightGBM.Options.Defaults.NumBoostRound) - : base(env, LoadNameValue, TrainerUtils.MakeU4ScalarColumn(labelColumn), featureColumn, weights, null, numLeaves, minDataPerLeaf, learningRate, numBoostRound) + int numberOfIterations = LightGBM.Options.Defaults.NumberOfIterations) + : base(env, LoadNameValue, TrainerUtils.MakeU4ScalarColumn(labelColumnName), featureColumnName, weights, null, numLeaves, minDataPerLeaf, learningRate, numberOfIterations) { _numClass = -1; } @@ -110,7 +110,7 @@ private protected override void CheckDataValid(IChannel ch, RoleMappedData data) if (!(labelType is BooleanDataViewType || labelType is KeyType || labelType == NumberDataViewType.Single)) { throw ch.ExceptParam(nameof(data), - $"Label column '{data.Schema.Label.Value.Name}' is of type '{labelType}', but must be key, boolean or R4."); + $"Label column '{data.Schema.Label.Value.Name}' is of type '{labelType.RawType}', but must be unsigned int, boolean or float."); } } @@ -132,9 +132,9 @@ private protected override void ConvertNaNLabels(IChannel ch, RoleMappedData dat maxLabel = Math.Max(maxLabel, labelColumn); } } - ch.CheckParam(minLabel >= 0, nameof(data), "min labelColumn cannot be negative"); + ch.CheckParam(minLabel >= 0, nameof(data), "Minimum value in label column cannot be negative"); if (maxLabel >= _maxNumClass) - throw ch.ExceptParam(nameof(data), $"max labelColumn cannot exceed {_maxNumClass}"); + throw ch.ExceptParam(nameof(data), $"Maximum value {maxLabel} in label column exceeds {_maxNumClass}"); if (data.Schema.Label.Value.Type is KeyType keyType) { @@ -163,16 +163,16 @@ protected override void GetDefaultParameters(IChannel ch, int numRow, bool hasCa { base.GetDefaultParameters(ch, numRow, hasCategorical, totalCats, true); int numLeaves = (int)Options["num_leaves"]; - int minDataPerLeaf = LightGbmTrainerOptions.MinDataPerLeaf ?? DefaultMinDataPerLeaf(numRow, numLeaves, _numClass); + int minDataPerLeaf = LightGbmTrainerOptions.MinimumDataPerLeaf ?? 
DefaultMinDataPerLeaf(numRow, numLeaves, _numClass); Options["min_data_per_leaf"] = minDataPerLeaf; if (!hiddenMsg) { if (!LightGbmTrainerOptions.LearningRate.HasValue) ch.Info("Auto-tuning parameters: " + nameof(LightGbmTrainerOptions.LearningRate) + " = " + Options["learning_rate"]); - if (!LightGbmTrainerOptions.NumLeaves.HasValue) - ch.Info("Auto-tuning parameters: " + nameof(LightGbmTrainerOptions.NumLeaves) + " = " + numLeaves); - if (!LightGbmTrainerOptions.MinDataPerLeaf.HasValue) - ch.Info("Auto-tuning parameters: " + nameof(LightGbmTrainerOptions.MinDataPerLeaf) + " = " + minDataPerLeaf); + if (!LightGbmTrainerOptions.NumberOfLeaves.HasValue) + ch.Info("Auto-tuning parameters: " + nameof(LightGbmTrainerOptions.NumberOfLeaves) + " = " + numLeaves); + if (!LightGbmTrainerOptions.MinimumDataPerLeaf.HasValue) + ch.Info("Auto-tuning parameters: " + nameof(LightGbmTrainerOptions.MinimumDataPerLeaf) + " = " + minDataPerLeaf); } } @@ -184,14 +184,14 @@ private protected override void CheckAndUpdateParametersBeforeTraining(IChannel Options["num_class"] = _numClass; bool useSoftmax = false; - if (LightGbmTrainerOptions.UseSoftmax.HasValue) - useSoftmax = LightGbmTrainerOptions.UseSoftmax.Value; + if (LightGbmTrainerOptions.UseSoftMaximum.HasValue) + useSoftmax = LightGbmTrainerOptions.UseSoftMaximum.Value; else { if (labels.Length >= _minDataToUseSoftmax) useSoftmax = true; - ch.Info("Auto-tuning parameters: " + nameof(LightGbmTrainerOptions.UseSoftmax) + " = " + useSoftmax); + ch.Info("Auto-tuning parameters: " + nameof(LightGbmTrainerOptions.UseSoftMaximum) + " = " + useSoftmax); } if (useSoftmax) diff --git a/src/Microsoft.ML.LightGBM/LightGbmRankingTrainer.cs b/src/Microsoft.ML.LightGBM/LightGbmRankingTrainer.cs index 482a04f0d9..d9870205a3 100644 --- a/src/Microsoft.ML.LightGBM/LightGbmRankingTrainer.cs +++ b/src/Microsoft.ML.LightGBM/LightGbmRankingTrainer.cs @@ -22,7 +22,6 @@ namespace Microsoft.ML.LightGBM { - public sealed class LightGbmRankingModelParameters : TreeEnsembleModelParametersBasedOnRegressionTree { internal const string LoaderSignature = "LightGBMRankerExec"; @@ -89,26 +88,28 @@ internal LightGbmRankingTrainer(IHostEnvironment env, Options options) /// Initializes a new instance of /// /// The private instance of . - /// The name of the label column. - /// The name of the feature column. - /// The name of the column containing the group ID. - /// The name of the optional column containing the initial weights. - /// The number of leaves to use. - /// Number of iterations. - /// The minimal number of documents allowed in a leaf of the tree, out of the subsampled data. + /// The name of the label column. + /// The name of the feature column. + /// The name of the column containing the group ID. + /// The name of the optional column containing the initial weights. + /// The number of leaves to use. /// The learning rate. + /// The minimal number of documents allowed in a leaf of the tree, out of the subsampled data. + /// The number of iterations to use. internal LightGbmRankingTrainer(IHostEnvironment env, - string labelColumn = DefaultColumnNames.Label, - string featureColumn = DefaultColumnNames.Features, - string groupId = DefaultColumnNames.GroupId, - string weights = null, - int? numLeaves = null, - int? minDataPerLeaf = null, + string labelColumnName = DefaultColumnNames.Label, + string featureColumnName = DefaultColumnNames.Features, + string groupIdColumnName = DefaultColumnNames.GroupId, + string weightsColumnName = null, + int? leafCount = null, + int? 
minimumDataPerLeaf = null, double? learningRate = null, - int numBoostRound = LightGBM.Options.Defaults.NumBoostRound) - : base(env, LoadNameValue, TrainerUtils.MakeR4ScalarColumn(labelColumn), featureColumn, weights, groupId, numLeaves, minDataPerLeaf, learningRate, numBoostRound) + int numberOfIterations = LightGBM.Options.Defaults.NumberOfIterations) + : base(env, LoadNameValue, TrainerUtils.MakeR4ScalarColumn(labelColumnName), + featureColumnName, weightsColumnName, groupIdColumnName, leafCount, + minimumDataPerLeaf, learningRate, numberOfIterations) { - Host.CheckNonEmpty(groupId, nameof(groupId)); + Host.CheckNonEmpty(groupIdColumnName, nameof(groupIdColumnName)); } private protected override void CheckDataValid(IChannel ch, RoleMappedData data) @@ -121,7 +122,7 @@ private protected override void CheckDataValid(IChannel ch, RoleMappedData data) if (!(labelType is KeyType || labelType == NumberDataViewType.Single)) { throw ch.ExceptParam(nameof(data), - $"Label column '{labelCol.Name}' is of type '{labelType}', but must be key or R4."); + $"Label column '{labelCol.Name}' is of type '{labelType.RawType}', but must be unsigned int or float."); } // Check group types. ch.CheckParam(data.Schema.Group.HasValue, nameof(data), "Need a group column."); @@ -130,7 +131,7 @@ private protected override void CheckDataValid(IChannel ch, RoleMappedData data) if (!(groupType == NumberDataViewType.UInt32 || groupType is KeyType)) { throw ch.ExceptParam(nameof(data), - $"Group column '{groupCol.Name}' is of type '{groupType}', but must be U4 or a Key."); + $"Group column '{groupCol.Name}' is of type '{groupType.RawType}', but must be unsigned int."); } } @@ -139,7 +140,7 @@ private protected override void CheckLabelCompatible(SchemaShape.Column labelCol Contracts.Assert(labelCol.IsValid); Action error = - () => throw Host.ExceptSchemaMismatch(nameof(labelCol), "label", labelCol.Name, "float or KeyType", labelCol.GetTypeString()); + () => throw Host.ExceptSchemaMismatch(nameof(labelCol), "label", labelCol.Name, "float or unsigned int", labelCol.GetTypeString()); if (labelCol.Kind != SchemaShape.Column.VectorKind.Scalar) error(); diff --git a/src/Microsoft.ML.LightGBM/LightGbmRegressionTrainer.cs b/src/Microsoft.ML.LightGBM/LightGbmRegressionTrainer.cs index 3729154cbd..f93e2126c6 100644 --- a/src/Microsoft.ML.LightGBM/LightGbmRegressionTrainer.cs +++ b/src/Microsoft.ML.LightGBM/LightGbmRegressionTrainer.cs @@ -87,22 +87,22 @@ public sealed class LightGbmRegressorTrainer : LightGbmTrainerBase /// /// The private instance of . - /// The name of the label column. - /// The name of the feature column. - /// The name for the column containing the initial weight. - /// The number of leaves to use. - /// Number of iterations. - /// The minimal number of documents allowed in a leaf of the tree, out of the subsampled data. + /// The name of the label column. + /// The name of the feature column. + /// The name for the column containing the initial weight. + /// The number of leaves to use. + /// The minimal number of documents allowed in a leaf of the tree, out of the subsampled data. /// The learning rate. + /// Number of iterations. internal LightGbmRegressorTrainer(IHostEnvironment env, - string labelColumn = DefaultColumnNames.Label, - string featureColumn = DefaultColumnNames.Features, - string weights = null, - int? numLeaves = null, - int? 
minDataPerLeaf = null, + string labelColumnName = DefaultColumnNames.Label, + string featureColumnName = DefaultColumnNames.Features, + string weightsColumnName = null, + int? numberOfLeaves = null, + int? minimumDataPerLeaf = null, double? learningRate = null, - int numBoostRound = LightGBM.Options.Defaults.NumBoostRound) - : base(env, LoadNameValue, TrainerUtils.MakeR4ScalarColumn(labelColumn), featureColumn, weights, null, numLeaves, minDataPerLeaf, learningRate, numBoostRound) + int numberOfIterations = LightGBM.Options.Defaults.NumberOfIterations) + : base(env, LoadNameValue, TrainerUtils.MakeR4ScalarColumn(labelColumnName), featureColumnName, weightsColumnName, null, numberOfLeaves, minimumDataPerLeaf, learningRate, numberOfIterations) { } @@ -127,7 +127,7 @@ private protected override void CheckDataValid(IChannel ch, RoleMappedData data) if (!(labelType is BooleanDataViewType || labelType is KeyType || labelType == NumberDataViewType.Single)) { throw ch.ExceptParam(nameof(data), - $"Label column '{data.Schema.Label.Value.Name}' is of type '{labelType}', but must be key, boolean or R4."); + $"Label column '{data.Schema.Label.Value.Name}' is of type '{labelType.RawType}', but must be an unsigned int, boolean or float."); } } diff --git a/src/Microsoft.ML.LightGBM/LightGbmTrainerBase.cs b/src/Microsoft.ML.LightGBM/LightGbmTrainerBase.cs index 614251c117..7ae79c09fc 100644 --- a/src/Microsoft.ML.LightGBM/LightGbmTrainerBase.cs +++ b/src/Microsoft.ML.LightGBM/LightGbmTrainerBase.cs @@ -58,27 +58,28 @@ private sealed class CategoricalMetaData private protected LightGbmTrainerBase(IHostEnvironment env, string name, - SchemaShape.Column label, - string featureColumn, - string weightColumn, - string groupIdColumn, - int? numLeaves, - int? minDataPerLeaf, + SchemaShape.Column labelColumn, + string featureColumnName, + string weightColumnName, + string groupIdColumnName, + int? leafCount, + int? minimumDataPerLeaf, double? 
learningRate, - int numBoostRound) - : base(Contracts.CheckRef(env, nameof(env)).Register(name), TrainerUtils.MakeR4VecFeature(featureColumn), label, TrainerUtils.MakeR4ScalarWeightColumn(weightColumn), TrainerUtils.MakeU4ScalarColumn(groupIdColumn)) + int numberOfIterations) + : base(Contracts.CheckRef(env, nameof(env)).Register(name), TrainerUtils.MakeR4VecFeature(featureColumnName), + labelColumn, TrainerUtils.MakeR4ScalarWeightColumn(weightColumnName), TrainerUtils.MakeU4ScalarColumn(groupIdColumnName)) { LightGbmTrainerOptions = new Options(); - LightGbmTrainerOptions.NumLeaves = numLeaves; - LightGbmTrainerOptions.MinDataPerLeaf = minDataPerLeaf; + LightGbmTrainerOptions.NumberOfLeaves = leafCount; + LightGbmTrainerOptions.MinimumDataPerLeaf = minimumDataPerLeaf; LightGbmTrainerOptions.LearningRate = learningRate; - LightGbmTrainerOptions.NumBoostRound = numBoostRound; + LightGbmTrainerOptions.NumberOfIterations = numberOfIterations; - LightGbmTrainerOptions.LabelColumnName = label.Name; - LightGbmTrainerOptions.FeatureColumnName = featureColumn; - LightGbmTrainerOptions.ExampleWeightColumnName = weightColumn; - LightGbmTrainerOptions.RowGroupColumnName = groupIdColumn; + LightGbmTrainerOptions.LabelColumnName = labelColumn.Name; + LightGbmTrainerOptions.FeatureColumnName = featureColumnName; + LightGbmTrainerOptions.ExampleWeightColumnName = weightColumnName; + LightGbmTrainerOptions.RowGroupColumnName = groupIdColumnName; InitParallelTraining(); } @@ -167,8 +168,8 @@ private protected virtual void CheckDataValid(IChannel ch, RoleMappedData data) protected virtual void GetDefaultParameters(IChannel ch, int numRow, bool hasCategarical, int totalCats, bool hiddenMsg = false) { double learningRate = LightGbmTrainerOptions.LearningRate ?? DefaultLearningRate(numRow, hasCategarical, totalCats); - int numLeaves = LightGbmTrainerOptions.NumLeaves ?? DefaultNumLeaves(numRow, hasCategarical, totalCats); - int minDataPerLeaf = LightGbmTrainerOptions.MinDataPerLeaf ?? DefaultMinDataPerLeaf(numRow, numLeaves, 1); + int numLeaves = LightGbmTrainerOptions.NumberOfLeaves ?? DefaultNumLeaves(numRow, hasCategarical, totalCats); + int minDataPerLeaf = LightGbmTrainerOptions.MinimumDataPerLeaf ?? DefaultMinDataPerLeaf(numRow, numLeaves, 1); Options["learning_rate"] = learningRate; Options["num_leaves"] = numLeaves; Options["min_data_per_leaf"] = minDataPerLeaf; @@ -176,10 +177,10 @@ protected virtual void GetDefaultParameters(IChannel ch, int numRow, bool hasCat { if (!LightGbmTrainerOptions.LearningRate.HasValue) ch.Info("Auto-tuning parameters: " + nameof(LightGbmTrainerOptions.LearningRate) + " = " + learningRate); - if (!LightGbmTrainerOptions.NumLeaves.HasValue) - ch.Info("Auto-tuning parameters: " + nameof(LightGbmTrainerOptions.NumLeaves) + " = " + numLeaves); - if (!LightGbmTrainerOptions.MinDataPerLeaf.HasValue) - ch.Info("Auto-tuning parameters: " + nameof(LightGbmTrainerOptions.MinDataPerLeaf) + " = " + minDataPerLeaf); + if (!LightGbmTrainerOptions.NumberOfLeaves.HasValue) + ch.Info("Auto-tuning parameters: " + nameof(LightGbmTrainerOptions.NumberOfLeaves) + " = " + numLeaves); + if (!LightGbmTrainerOptions.MinimumDataPerLeaf.HasValue) + ch.Info("Auto-tuning parameters: " + nameof(LightGbmTrainerOptions.MinimumDataPerLeaf) + " = " + minDataPerLeaf); } } @@ -274,9 +275,9 @@ private CategoricalMetaData GetCategoricalMetaData(IChannel ch, RoleMappedData t int[] categoricalFeatures = null; const int useCatThreshold = 50000; // Disable cat when data is too small, reduce the overfitting. 
- bool useCat = LightGbmTrainerOptions.UseCat ?? numRow > useCatThreshold; - if (!LightGbmTrainerOptions.UseCat.HasValue) - ch.Info("Auto-tuning parameters: " + nameof(LightGbmTrainerOptions.UseCat) + " = " + useCat); + bool useCat = LightGbmTrainerOptions.UseCategoricalSplit ?? numRow > useCatThreshold; + if (!LightGbmTrainerOptions.UseCategoricalSplit.HasValue) + ch.Info("Auto-tuning parameters: " + nameof(LightGbmTrainerOptions.UseCategoricalSplit) + " = " + useCat); if (useCat) { var featureCol = trainData.Schema.Schema[DefaultColumnNames.Features]; @@ -369,8 +370,8 @@ private void TrainCore(IChannel ch, IProgressChannel pch, Dataset dtrain, Catego { ch.Info("LightGBM objective={0}", Options["objective"]); using (Booster bst = WrappedLightGbmTraining.Train(ch, pch, Options, dtrain, - dvalid: dvalid, numIteration: LightGbmTrainerOptions.NumBoostRound, - verboseEval: LightGbmTrainerOptions.VerboseEval, earlyStoppingRound: LightGbmTrainerOptions.EarlyStoppingRound)) + dvalid: dvalid, numIteration: LightGbmTrainerOptions.NumberOfIterations, + verboseEval: LightGbmTrainerOptions.Verbose, earlyStoppingRound: LightGbmTrainerOptions.EarlyStoppingRound)) { TrainedEnsemble = bst.GetModel(catMetaData.CategoricalBoudaries); } diff --git a/src/Microsoft.ML.LightGBM/doc.xml b/src/Microsoft.ML.LightGBM/doc.xml index 1fcd38dd7a..dfa4ccfad7 100644 --- a/src/Microsoft.ML.LightGBM/doc.xml +++ b/src/Microsoft.ML.LightGBM/doc.xml @@ -16,10 +16,10 @@ new LightGbmBinaryClassifier { - NumBoostRound = 200, + NumberOfIterations = 200, LearningRate = 0.5f, - NumLeaves = 32, - MinDataPerLeaf = 20 + NumberOfLeaves = 32, + MinimumDataPerLeaf = 20 } @@ -29,10 +29,10 @@ new LightGbmClassifier { - NumBoostRound = 200, + NumberOfIterations = 200, LearningRate = 0.5f, - NumLeaves = 32, - MinDataPerLeaf = 20 + NumberOfLeaves = 32, + MinimumDataPerLeaf = 20 } @@ -42,10 +42,10 @@ new LightGbmRegressor { - NumBoostRound = 100, + NumberOfIterations = 100, LearningRate = 0.5f, - NumLeaves = 32, - MinDataPerLeaf = 20, + NumberOfLeaves = 32, + MinimumDataPerLeaf = 20, Booster = new DartBoosterParameterFunction { XgboostDartMode = true, @@ -60,10 +60,10 @@ new LightGbmRanker { - NumBoostRound = 100, + NumberOfIterations = 100, LearningRate = 0.5f, - NumLeaves = 32, - MinDataPerLeaf = 20, + NumberOfLeaves = 32, + MinimumDataPerLeaf = 20, Booster = new GbdtBoosterParameterFunction { MinSplitGain = 3, diff --git a/src/Microsoft.ML.StandardLearners/Standard/Online/AveragedPerceptron.cs b/src/Microsoft.ML.StandardLearners/Standard/Online/AveragedPerceptron.cs index c91fae0244..3fe119d98a 100644 --- a/src/Microsoft.ML.StandardLearners/Standard/Online/AveragedPerceptron.cs +++ b/src/Microsoft.ML.StandardLearners/Standard/Online/AveragedPerceptron.cs @@ -121,15 +121,15 @@ internal AveragedPerceptronTrainer(IHostEnvironment env, Options options) /// /// The local instance of the /// The classification loss function. - /// The name of the label column. - /// The name of the feature column. + /// The name of the label column. + /// The name of the feature column. /// The learning rate. /// Whether to decrease learning rate as iterations progress. /// L2 Regularization Weight. /// The number of training iterations. 
internal AveragedPerceptronTrainer(IHostEnvironment env, - string labelColumn = DefaultColumnNames.Label, - string featureColumn = DefaultColumnNames.Features, + string labelColumnName = DefaultColumnNames.Label, + string featureColumnName = DefaultColumnNames.Features, IClassificationLoss lossFunction = null, float learningRate = Options.AveragedDefault.LearningRate, bool decreaseLearningRate = Options.AveragedDefault.DecreaseLearningRate, @@ -137,8 +137,8 @@ internal AveragedPerceptronTrainer(IHostEnvironment env, int numIterations = Options.AveragedDefault.NumIterations) : this(env, new Options { - LabelColumnName = labelColumn, - FeatureColumnName = featureColumn, + LabelColumnName = labelColumnName, + FeatureColumnName = featureColumnName, LearningRate = learningRate, DecreaseLearningRate = decreaseLearningRate, L2RegularizerWeight = l2RegularizerWeight, diff --git a/test/BaselineOutput/Common/EntryPoints/core_manifest.json b/test/BaselineOutput/Common/EntryPoints/core_manifest.json index d3e0180dbd..8654c4c032 100644 --- a/test/BaselineOutput/Common/EntryPoints/core_manifest.json +++ b/test/BaselineOutput/Common/EntryPoints/core_manifest.json @@ -11129,7 +11129,7 @@ "ShortName": "LightGBM", "Inputs": [ { - "Name": "NumBoostRound", + "Name": "NumberOfIterations", "Type": "Int", "Desc": "Number of iterations.", "Aliases": [ @@ -11181,7 +11181,7 @@ } }, { - "Name": "NumLeaves", + "Name": "NumberOfLeaves", "Type": "Int", "Desc": "Maximum leaves for trees.", "Aliases": [ @@ -11200,7 +11200,7 @@ } }, { - "Name": "MinDataPerLeaf", + "Name": "MinimumDataPerLeaf", "Type": "Int", "Desc": "Minimum number of instances needed in a child.", "Aliases": [ @@ -11322,9 +11322,9 @@ "Default": "Auto" }, { - "Name": "MaxBin", + "Name": "MaximumBin", "Type": "Int", - "Desc": "Max number of bucket bin for features.", + "Desc": "Maximum number of bucket bins for features.", "Aliases": [ "mb" ], @@ -11334,7 +11334,7 @@ "Default": 255 }, { - "Name": "VerboseEval", + "Name": "Verbose", "Type": "Bool", "Desc": "Verbose", "Aliases": [ @@ -11355,7 +11355,7 @@ "Default": true }, { - "Name": "NThread", + "Name": "NumberOfThreads", "Type": "Int", "Desc": "Number of parallel threads used to run LightGBM.", "Aliases": [ @@ -11367,7 +11367,7 @@ "Default": null }, { - "Name": "EvalMetric", + "Name": "EvaluationMetric", "Type": { "Kind": "Enum", "Values": [ @@ -11393,7 +11393,7 @@ "Default": "DefaultMetric" }, { - "Name": "UseSoftmax", + "Name": "UseSoftMaximum", "Type": "Bool", "Desc": "Use softmax loss for the multi classification.", "Required": false, @@ -11454,7 +11454,7 @@ "Default": 1048576 }, { - "Name": "UseCat", + "Name": "UseCategoricalSplit", "Type": "Bool", "Desc": "Enable categorical split or not.", "Aliases": [ @@ -11475,7 +11475,7 @@ { "Name": "UseMissing", "Type": "Bool", - "Desc": "Enable missing value auto infer or not.", + "Desc": "Enable special handling of missing values or not.", "Required": false, "SortOrder": 150.0, "IsNullable": false, @@ -11489,9 +11489,9 @@ } }, { - "Name": "MinDataPerGroup", + "Name": "MinimumDataPerGroup", "Type": "Int", - "Desc": "Min number of instances per categorical group.", + "Desc": "Minimum number of instances per categorical group.", "Aliases": [ "mdpg" ], @@ -11514,7 +11514,7 @@ } }, { - "Name": "MaxCatThreshold", + "Name": "MaximumCategoricalThreshold", "Type": "Int", "Desc": "Max number of categorical thresholds.", "Aliases": [ @@ -11539,7 +11539,7 @@ } }, { - "Name": "CatSmooth", + "Name": "CategoricalSmoothing", "Type": "Float", "Desc": "Laplace smooth term in 
categorical feature split. Avoid the bias of small categories.", "Required": false, @@ -11559,7 +11559,7 @@ } }, { - "Name": "CatL2", + "Name": "L2Categorical", "Type": "Float", "Desc": "L2 Regularization for categorical split.", "Required": false, @@ -11632,7 +11632,7 @@ "ShortName": "LightGBMMC", "Inputs": [ { - "Name": "NumBoostRound", + "Name": "NumberOfIterations", "Type": "Int", "Desc": "Number of iterations.", "Aliases": [ @@ -11684,7 +11684,7 @@ } }, { - "Name": "NumLeaves", + "Name": "NumberOfLeaves", "Type": "Int", "Desc": "Maximum leaves for trees.", "Aliases": [ @@ -11703,7 +11703,7 @@ } }, { - "Name": "MinDataPerLeaf", + "Name": "MinimumDataPerLeaf", "Type": "Int", "Desc": "Minimum number of instances needed in a child.", "Aliases": [ @@ -11825,9 +11825,9 @@ "Default": "Auto" }, { - "Name": "MaxBin", + "Name": "MaximumBin", "Type": "Int", - "Desc": "Max number of bucket bin for features.", + "Desc": "Maximum number of bucket bins for features.", "Aliases": [ "mb" ], @@ -11837,7 +11837,7 @@ "Default": 255 }, { - "Name": "VerboseEval", + "Name": "Verbose", "Type": "Bool", "Desc": "Verbose", "Aliases": [ @@ -11858,7 +11858,7 @@ "Default": true }, { - "Name": "NThread", + "Name": "NumberOfThreads", "Type": "Int", "Desc": "Number of parallel threads used to run LightGBM.", "Aliases": [ @@ -11870,7 +11870,7 @@ "Default": null }, { - "Name": "EvalMetric", + "Name": "EvaluationMetric", "Type": { "Kind": "Enum", "Values": [ @@ -11896,7 +11896,7 @@ "Default": "DefaultMetric" }, { - "Name": "UseSoftmax", + "Name": "UseSoftMaximum", "Type": "Bool", "Desc": "Use softmax loss for the multi classification.", "Required": false, @@ -11957,7 +11957,7 @@ "Default": 1048576 }, { - "Name": "UseCat", + "Name": "UseCategoricalSplit", "Type": "Bool", "Desc": "Enable categorical split or not.", "Aliases": [ @@ -11978,7 +11978,7 @@ { "Name": "UseMissing", "Type": "Bool", - "Desc": "Enable missing value auto infer or not.", + "Desc": "Enable special handling of missing values or not.", "Required": false, "SortOrder": 150.0, "IsNullable": false, @@ -11992,9 +11992,9 @@ } }, { - "Name": "MinDataPerGroup", + "Name": "MinimumDataPerGroup", "Type": "Int", - "Desc": "Min number of instances per categorical group.", + "Desc": "Minimum number of instances per categorical group.", "Aliases": [ "mdpg" ], @@ -12017,7 +12017,7 @@ } }, { - "Name": "MaxCatThreshold", + "Name": "MaximumCategoricalThreshold", "Type": "Int", "Desc": "Max number of categorical thresholds.", "Aliases": [ @@ -12042,7 +12042,7 @@ } }, { - "Name": "CatSmooth", + "Name": "CategoricalSmoothing", "Type": "Float", "Desc": "Laplace smooth term in categorical feature split. 
Avoid the bias of small categories.", "Required": false, @@ -12062,7 +12062,7 @@ } }, { - "Name": "CatL2", + "Name": "L2Categorical", "Type": "Float", "Desc": "L2 Regularization for categorical split.", "Required": false, @@ -12135,7 +12135,7 @@ "ShortName": "LightGBMRank", "Inputs": [ { - "Name": "NumBoostRound", + "Name": "NumberOfIterations", "Type": "Int", "Desc": "Number of iterations.", "Aliases": [ @@ -12187,7 +12187,7 @@ } }, { - "Name": "NumLeaves", + "Name": "NumberOfLeaves", "Type": "Int", "Desc": "Maximum leaves for trees.", "Aliases": [ @@ -12206,7 +12206,7 @@ } }, { - "Name": "MinDataPerLeaf", + "Name": "MinimumDataPerLeaf", "Type": "Int", "Desc": "Minimum number of instances needed in a child.", "Aliases": [ @@ -12328,9 +12328,9 @@ "Default": "Auto" }, { - "Name": "MaxBin", + "Name": "MaximumBin", "Type": "Int", - "Desc": "Max number of bucket bin for features.", + "Desc": "Maximum number of bucket bins for features.", "Aliases": [ "mb" ], @@ -12340,7 +12340,7 @@ "Default": 255 }, { - "Name": "VerboseEval", + "Name": "Verbose", "Type": "Bool", "Desc": "Verbose", "Aliases": [ @@ -12361,7 +12361,7 @@ "Default": true }, { - "Name": "NThread", + "Name": "NumberOfThreads", "Type": "Int", "Desc": "Number of parallel threads used to run LightGBM.", "Aliases": [ @@ -12373,7 +12373,7 @@ "Default": null }, { - "Name": "EvalMetric", + "Name": "EvaluationMetric", "Type": { "Kind": "Enum", "Values": [ @@ -12399,7 +12399,7 @@ "Default": "DefaultMetric" }, { - "Name": "UseSoftmax", + "Name": "UseSoftMaximum", "Type": "Bool", "Desc": "Use softmax loss for the multi classification.", "Required": false, @@ -12460,7 +12460,7 @@ "Default": 1048576 }, { - "Name": "UseCat", + "Name": "UseCategoricalSplit", "Type": "Bool", "Desc": "Enable categorical split or not.", "Aliases": [ @@ -12481,7 +12481,7 @@ { "Name": "UseMissing", "Type": "Bool", - "Desc": "Enable missing value auto infer or not.", + "Desc": "Enable special handling of missing values or not.", "Required": false, "SortOrder": 150.0, "IsNullable": false, @@ -12495,9 +12495,9 @@ } }, { - "Name": "MinDataPerGroup", + "Name": "MinimumDataPerGroup", "Type": "Int", - "Desc": "Min number of instances per categorical group.", + "Desc": "Minimum number of instances per categorical group.", "Aliases": [ "mdpg" ], @@ -12520,7 +12520,7 @@ } }, { - "Name": "MaxCatThreshold", + "Name": "MaximumCategoricalThreshold", "Type": "Int", "Desc": "Max number of categorical thresholds.", "Aliases": [ @@ -12545,7 +12545,7 @@ } }, { - "Name": "CatSmooth", + "Name": "CategoricalSmoothing", "Type": "Float", "Desc": "Laplace smooth term in categorical feature split. 
Avoid the bias of small categories.", "Required": false, @@ -12565,7 +12565,7 @@ } }, { - "Name": "CatL2", + "Name": "L2Categorical", "Type": "Float", "Desc": "L2 Regularization for categorical split.", "Required": false, @@ -12638,7 +12638,7 @@ "ShortName": "LightGBMR", "Inputs": [ { - "Name": "NumBoostRound", + "Name": "NumberOfIterations", "Type": "Int", "Desc": "Number of iterations.", "Aliases": [ @@ -12690,7 +12690,7 @@ } }, { - "Name": "NumLeaves", + "Name": "NumberOfLeaves", "Type": "Int", "Desc": "Maximum leaves for trees.", "Aliases": [ @@ -12709,7 +12709,7 @@ } }, { - "Name": "MinDataPerLeaf", + "Name": "MinimumDataPerLeaf", "Type": "Int", "Desc": "Minimum number of instances needed in a child.", "Aliases": [ @@ -12831,9 +12831,9 @@ "Default": "Auto" }, { - "Name": "MaxBin", + "Name": "MaximumBin", "Type": "Int", - "Desc": "Max number of bucket bin for features.", + "Desc": "Maximum number of bucket bin for features.", "Aliases": [ "mb" ], @@ -12843,7 +12843,7 @@ "Default": 255 }, { - "Name": "VerboseEval", + "Name": "Verbose", "Type": "Bool", "Desc": "Verbose", "Aliases": [ @@ -12864,7 +12864,7 @@ "Default": true }, { - "Name": "NThread", + "Name": "NumberOfThreads", "Type": "Int", "Desc": "Number of parallel threads used to run LightGBM.", "Aliases": [ @@ -12876,7 +12876,7 @@ "Default": null }, { - "Name": "EvalMetric", + "Name": "EvaluationMetric", "Type": { "Kind": "Enum", "Values": [ @@ -12902,7 +12902,7 @@ "Default": "DefaultMetric" }, { - "Name": "UseSoftmax", + "Name": "UseSoftMaximum", "Type": "Bool", "Desc": "Use softmax loss for the multi classification.", "Required": false, @@ -12963,7 +12963,7 @@ "Default": 1048576 }, { - "Name": "UseCat", + "Name": "UseCategoricalSplit", "Type": "Bool", "Desc": "Enable categorical split or not.", "Aliases": [ @@ -12984,7 +12984,7 @@ { "Name": "UseMissing", "Type": "Bool", - "Desc": "Enable missing value auto infer or not.", + "Desc": "Enable special handling of missing value or not.", "Required": false, "SortOrder": 150.0, "IsNullable": false, @@ -12998,9 +12998,9 @@ } }, { - "Name": "MinDataPerGroup", + "Name": "MinimumDataPerGroup", "Type": "Int", - "Desc": "Min number of instances per categorical group.", + "Desc": "Minimum number of instances per categorical group.", "Aliases": [ "mdpg" ], @@ -13023,7 +13023,7 @@ } }, { - "Name": "MaxCatThreshold", + "Name": "MaximumCategoricalThreshold", "Type": "Int", "Desc": "Max number of categorical thresholds.", "Aliases": [ @@ -13048,7 +13048,7 @@ } }, { - "Name": "CatSmooth", + "Name": "CategoricalSmoothing", "Type": "Float", "Desc": "Lapalace smooth term in categorical feature spilt. Avoid the bias of small categories.", "Required": false, @@ -13068,7 +13068,7 @@ } }, { - "Name": "CatL2", + "Name": "L2Categorical", "Type": "Float", "Desc": "L2 Regularization for categorical split.", "Required": false, @@ -23538,7 +23538,7 @@ { "Name": "DropRate", "Type": "Float", - "Desc": "Drop ratio for trees. Range:(0,1).", + "Desc": "The drop ratio for trees. 
Range:(0,1).",
      "Required": false,
      "SortOrder": 150.0,
      "IsNullable": false,
@@ -23551,7 +23551,7 @@
    {
      "Name": "MaxDrop",
      "Type": "Int",
-      "Desc": "Max number of dropped tree in a boosting round.",
+      "Desc": "Maximum number of dropped trees in a boosting round.",
      "Required": false,
      "SortOrder": 150.0,
      "IsNullable": false,
@@ -23564,7 +23564,7 @@
    {
      "Name": "SkipDrop",
      "Type": "Float",
-      "Desc": "Probability for not perform dropping in a boosting round.",
+      "Desc": "Probability for not dropping in a boosting round.",
      "Required": false,
      "SortOrder": 150.0,
      "IsNullable": false,
@@ -23595,7 +23595,7 @@
    {
      "Name": "UnbalancedSets",
      "Type": "Bool",
-      "Desc": "Use for binary classification when classes are not balanced.",
+      "Desc": "Use for binary classification when training data is not balanced.",
      "Aliases": [
        "us"
      ],
@@ -23644,7 +23644,7 @@
    {
      "Name": "SubsampleFreq",
      "Type": "Int",
-      "Desc": "Subsample frequency. 0 means no subsample. If subsampleFreq > 0, it will use a subset(ratio=subsample) to train. And the subset will be updated on every Subsample iteratinos.",
+      "Desc": "Subsample frequency for bagging. 0 means no subsample. If subsampleFreq > 0, it will use a subset (ratio=subsample) to train, and the subset will be updated every Subsample iterations.",
      "Required": false,
      "SortOrder": 150.0,
      "IsNullable": false,
@@ -23748,7 +23748,7 @@
    {
      "Name": "UnbalancedSets",
      "Type": "Bool",
-      "Desc": "Use for binary classification when classes are not balanced.",
+      "Desc": "Use for binary classification when training data is not balanced.",
      "Aliases": [
        "us"
      ],
@@ -23797,7 +23797,7 @@
    {
      "Name": "SubsampleFreq",
      "Type": "Int",
-      "Desc": "Subsample frequency. 0 means no subsample. If subsampleFreq > 0, it will use a subset(ratio=subsample) to train. And the subset will be updated on every Subsample iteratinos.",
+      "Desc": "Subsample frequency for bagging. 0 means no subsample. If subsampleFreq > 0, it will use a subset (ratio=subsample) to train, and the subset will be updated every Subsample iterations.",
      "Required": false,
      "SortOrder": 150.0,
      "IsNullable": false,
@@ -23927,7 +23927,7 @@
    {
      "Name": "UnbalancedSets",
      "Type": "Bool",
-      "Desc": "Use for binary classification when classes are not balanced.",
+      "Desc": "Use for binary classification when training data is not balanced.",
      "Aliases": [
        "us"
      ],
@@ -23976,7 +23976,7 @@
    {
      "Name": "SubsampleFreq",
      "Type": "Int",
-      "Desc": "Subsample frequency. 0 means no subsample. If subsampleFreq > 0, it will use a subset(ratio=subsample) to train. And the subset will be updated on every Subsample iteratinos.",
+      "Desc": "Subsample frequency for bagging. 0 means no subsample. If subsampleFreq > 0, it will use a subset (ratio=subsample) to train, and the subset will be updated every Subsample iterations.",
      "Required": false,
      "SortOrder": 150.0,
      "IsNullable": false,
diff --git a/test/BaselineOutput/Common/LightGBM/LightGBMDart-CV-breast-cancer.dart-out.txt b/test/BaselineOutput/Common/LightGBM/LightGBMDart-CV-breast-cancer.dart-out.txt
index 44635aa70a..e79b7a61e7 100644
--- a/test/BaselineOutput/Common/LightGBM/LightGBMDart-CV-breast-cancer.dart-out.txt
+++ b/test/BaselineOutput/Common/LightGBM/LightGBMDart-CV-breast-cancer.dart-out.txt
@@ -1,10 +1,10 @@
 maml.exe CV tr=LightGBM{nt=1 iter=10 booster=dart lr=0.2 mil=10 nl=20} threads=- cache=- dout=%Output% loader=Text{sparse- col=Attr:TX:6 col=Label:0 col=Features:1-5,6,7-9} data=%Data% seed=1
 Not adding a normalizer.
-Auto-tuning parameters: UseCat = False +Auto-tuning parameters: UseCategoricalSplit = False LightGBM objective=binary Not training a calibrator because it is not needed. Not adding a normalizer. -Auto-tuning parameters: UseCat = False +Auto-tuning parameters: UseCategoricalSplit = False LightGBM objective=binary Not training a calibrator because it is not needed. TEST POSITIVE RATIO: 0.3702 (134.0/(134.0+228.0)) diff --git a/test/BaselineOutput/Common/LightGBM/LightGBMDart-TrainTest-breast-cancer.dart-out.txt b/test/BaselineOutput/Common/LightGBM/LightGBMDart-TrainTest-breast-cancer.dart-out.txt index 232f6326d4..bfa0bf3f97 100644 --- a/test/BaselineOutput/Common/LightGBM/LightGBMDart-TrainTest-breast-cancer.dart-out.txt +++ b/test/BaselineOutput/Common/LightGBM/LightGBMDart-TrainTest-breast-cancer.dart-out.txt @@ -1,6 +1,6 @@ maml.exe TrainTest test=%Data% tr=LightGBM{nt=1 iter=10 booster=dart lr=0.2 mil=10 nl=20} cache=- dout=%Output% loader=Text{sparse- col=Attr:TX:6 col=Label:0 col=Features:1-5,6,7-9} data=%Data% out=%Output% seed=1 Not adding a normalizer. -Auto-tuning parameters: UseCat = False +Auto-tuning parameters: UseCategoricalSplit = False LightGBM objective=binary Not training a calibrator because it is not needed. TEST POSITIVE RATIO: 0.3448 (241.0/(241.0+458.0)) diff --git a/test/BaselineOutput/Common/LightGBM/LightGBMGoss-CV-breast-cancer.goss-out.txt b/test/BaselineOutput/Common/LightGBM/LightGBMGoss-CV-breast-cancer.goss-out.txt index 8bd89002c1..a331a81b7e 100644 --- a/test/BaselineOutput/Common/LightGBM/LightGBMGoss-CV-breast-cancer.goss-out.txt +++ b/test/BaselineOutput/Common/LightGBM/LightGBMGoss-CV-breast-cancer.goss-out.txt @@ -1,10 +1,10 @@ maml.exe CV tr=LightGBM{nt=1 iter=10 v=+ booster=goss lr=0.2 mil=10 nl=20} threads=- cache=- dout=%Output% loader=Text{sparse- col=Attr:TX:6 col=Label:0 col=Features:1-5,6,7-9} data=%Data% seed=1 Not adding a normalizer. -Auto-tuning parameters: UseCat = False +Auto-tuning parameters: UseCategoricalSplit = False LightGBM objective=binary Not training a calibrator because it is not needed. Not adding a normalizer. -Auto-tuning parameters: UseCat = False +Auto-tuning parameters: UseCategoricalSplit = False LightGBM objective=binary Not training a calibrator because it is not needed. TEST POSITIVE RATIO: 0.3702 (134.0/(134.0+228.0)) diff --git a/test/BaselineOutput/Common/LightGBM/LightGBMGoss-TrainTest-breast-cancer.goss-out.txt b/test/BaselineOutput/Common/LightGBM/LightGBMGoss-TrainTest-breast-cancer.goss-out.txt index ba50420a1d..d249f34e1b 100644 --- a/test/BaselineOutput/Common/LightGBM/LightGBMGoss-TrainTest-breast-cancer.goss-out.txt +++ b/test/BaselineOutput/Common/LightGBM/LightGBMGoss-TrainTest-breast-cancer.goss-out.txt @@ -1,6 +1,6 @@ maml.exe TrainTest test=%Data% tr=LightGBM{nt=1 iter=10 v=+ booster=goss lr=0.2 mil=10 nl=20} cache=- dout=%Output% loader=Text{sparse- col=Attr:TX:6 col=Label:0 col=Features:1-5,6,7-9} data=%Data% out=%Output% seed=1 Not adding a normalizer. -Auto-tuning parameters: UseCat = False +Auto-tuning parameters: UseCategoricalSplit = False LightGBM objective=binary Not training a calibrator because it is not needed. 
TEST POSITIVE RATIO: 0.3448 (241.0/(241.0+458.0)) diff --git a/test/BaselineOutput/Common/LightGBMBinary/LightGBM-TrainTest-breast-cancer-out.txt b/test/BaselineOutput/Common/LightGBMBinary/LightGBM-TrainTest-breast-cancer-out.txt index 2496917e04..391a8665ce 100644 --- a/test/BaselineOutput/Common/LightGBMBinary/LightGBM-TrainTest-breast-cancer-out.txt +++ b/test/BaselineOutput/Common/LightGBMBinary/LightGBM-TrainTest-breast-cancer-out.txt @@ -1,6 +1,6 @@ maml.exe TrainTest test=%Data% tr=LightGBMBinary{nt=1 nl=5 mil=5 lr=0.25 iter=20 mb=255} cache=- dout=%Output% loader=Text{sparse- col=Attr:TX:6 col=Label:0 col=Features:1-5,6,7-9} data=%Data% out=%Output% seed=1 Not adding a normalizer. -Auto-tuning parameters: UseCat = False +Auto-tuning parameters: UseCategoricalSplit = False LightGBM objective=binary Not training a calibrator because it is not needed. TEST POSITIVE RATIO: 0.3448 (241.0/(241.0+458.0)) diff --git a/test/BaselineOutput/Common/LightGBMMC/LightGBMMC-CV-iris.key-out.txt b/test/BaselineOutput/Common/LightGBMMC/LightGBMMC-CV-iris.key-out.txt index dcd0b08107..a9a5af5543 100644 --- a/test/BaselineOutput/Common/LightGBMMC/LightGBMMC-CV-iris.key-out.txt +++ b/test/BaselineOutput/Common/LightGBMMC/LightGBMMC-CV-iris.key-out.txt @@ -1,12 +1,12 @@ maml.exe CV tr=LightGBMMC{nt=1 iter=10 v=- lr=0.2 mil=10 nl=20} threads=- dout=%Output% loader=Text{col=Label:TX:0 col=Features:1-*} data=%Data% seed=1 xf=Term{col=Label} Not adding a normalizer. -Auto-tuning parameters: UseCat = False -Auto-tuning parameters: UseSoftmax = False +Auto-tuning parameters: UseCategoricalSplit = False +Auto-tuning parameters: UseSoftMaximum = False LightGBM objective=multiclassova Not training a calibrator because it is not needed. Not adding a normalizer. -Auto-tuning parameters: UseCat = False -Auto-tuning parameters: UseSoftmax = False +Auto-tuning parameters: UseCategoricalSplit = False +Auto-tuning parameters: UseSoftMaximum = False LightGBM objective=multiclassova Not training a calibrator because it is not needed. diff --git a/test/BaselineOutput/Common/LightGBMMC/LightGBMMC-CV-iris.keyU404-out.txt b/test/BaselineOutput/Common/LightGBMMC/LightGBMMC-CV-iris.keyU404-out.txt index db69b4b0d8..9958fa93b8 100644 --- a/test/BaselineOutput/Common/LightGBMMC/LightGBMMC-CV-iris.keyU404-out.txt +++ b/test/BaselineOutput/Common/LightGBMMC/LightGBMMC-CV-iris.keyU404-out.txt @@ -1,12 +1,12 @@ maml.exe CV tr=LightGBMMC{nt=1 iter=10 v=- lr=0.2 mil=10 nl=20} threads=- dout=%Output% loader=Text{col=Label:U4[0-2]:0 col=Features:1-4} data=%Data% seed=1 Not adding a normalizer. -Auto-tuning parameters: UseCat = False -Auto-tuning parameters: UseSoftmax = False +Auto-tuning parameters: UseCategoricalSplit = False +Auto-tuning parameters: UseSoftMaximum = False LightGBM objective=multiclassova Not training a calibrator because it is not needed. Not adding a normalizer. -Auto-tuning parameters: UseCat = False -Auto-tuning parameters: UseSoftmax = False +Auto-tuning parameters: UseCategoricalSplit = False +Auto-tuning parameters: UseSoftMaximum = False LightGBM objective=multiclassova Not training a calibrator because it is not needed. 
diff --git a/test/BaselineOutput/Common/LightGBMMC/LightGBMMC-TrainTest-iris.key-out.txt b/test/BaselineOutput/Common/LightGBMMC/LightGBMMC-TrainTest-iris.key-out.txt index 1c4cb95912..a92727951e 100644 --- a/test/BaselineOutput/Common/LightGBMMC/LightGBMMC-TrainTest-iris.key-out.txt +++ b/test/BaselineOutput/Common/LightGBMMC/LightGBMMC-TrainTest-iris.key-out.txt @@ -1,7 +1,7 @@ maml.exe TrainTest test=%Data% tr=LightGBMMC{nt=1 iter=10 v=- lr=0.2 mil=10 nl=20} dout=%Output% loader=Text{col=Label:TX:0 col=Features:1-*} data=%Data% out=%Output% seed=1 xf=Term{col=Label} Not adding a normalizer. -Auto-tuning parameters: UseCat = False -Auto-tuning parameters: UseSoftmax = False +Auto-tuning parameters: UseCategoricalSplit = False +Auto-tuning parameters: UseSoftMaximum = False LightGBM objective=multiclassova Not training a calibrator because it is not needed. diff --git a/test/BaselineOutput/Common/LightGBMMC/LightGBMMC-TrainTest-iris.keyU404-out.txt b/test/BaselineOutput/Common/LightGBMMC/LightGBMMC-TrainTest-iris.keyU404-out.txt index 1de8c3d919..e0001f3d38 100644 --- a/test/BaselineOutput/Common/LightGBMMC/LightGBMMC-TrainTest-iris.keyU404-out.txt +++ b/test/BaselineOutput/Common/LightGBMMC/LightGBMMC-TrainTest-iris.keyU404-out.txt @@ -1,7 +1,7 @@ maml.exe TrainTest test=%Data% tr=LightGBMMC{nt=1 iter=10 v=- lr=0.2 mil=10 nl=20} dout=%Output% loader=Text{col=Label:U4[0-2]:0 col=Features:1-4} data=%Data% out=%Output% seed=1 Not adding a normalizer. -Auto-tuning parameters: UseCat = False -Auto-tuning parameters: UseSoftmax = False +Auto-tuning parameters: UseCategoricalSplit = False +Auto-tuning parameters: UseSoftMaximum = False LightGBM objective=multiclassova Not training a calibrator because it is not needed. diff --git a/test/BaselineOutput/Common/LightGBMR/LightGBMReg-CV-generatedRegressionDataset-out.txt b/test/BaselineOutput/Common/LightGBMR/LightGBMReg-CV-generatedRegressionDataset-out.txt index afa867d488..1fc6084997 100644 --- a/test/BaselineOutput/Common/LightGBMR/LightGBMReg-CV-generatedRegressionDataset-out.txt +++ b/test/BaselineOutput/Common/LightGBMR/LightGBMReg-CV-generatedRegressionDataset-out.txt @@ -1,10 +1,10 @@ maml.exe CV tr=LightGBMR{nt=1 iter=50 v=+ booster=gbdt{l1=0.2 l2=0.2} lr=0.2 mil=10 nl=20} threads=- dout=%Output% loader=Text{col=Label:R4:11 col=Features:R4:0-10 sep=; header+} data=%Data% seed=1 Not adding a normalizer. -Auto-tuning parameters: UseCat = False +Auto-tuning parameters: UseCategoricalSplit = False LightGBM objective=regression Not training a calibrator because it is not needed. Not adding a normalizer. -Auto-tuning parameters: UseCat = False +Auto-tuning parameters: UseCategoricalSplit = False LightGBM objective=regression Not training a calibrator because it is not needed. L1(avg): 27.477977 diff --git a/test/BaselineOutput/Common/LightGBMR/LightGBMReg-TrainTest-generatedRegressionDataset-out.txt b/test/BaselineOutput/Common/LightGBMR/LightGBMReg-TrainTest-generatedRegressionDataset-out.txt index f15a4bb020..909d9f0012 100644 --- a/test/BaselineOutput/Common/LightGBMR/LightGBMReg-TrainTest-generatedRegressionDataset-out.txt +++ b/test/BaselineOutput/Common/LightGBMR/LightGBMReg-TrainTest-generatedRegressionDataset-out.txt @@ -1,6 +1,6 @@ maml.exe TrainTest test=%Data% tr=LightGBMR{nt=1 iter=50 v=+ booster=gbdt{l1=0.2 l2=0.2} lr=0.2 mil=10 nl=20} dout=%Output% loader=Text{col=Label:R4:11 col=Features:R4:0-10 sep=; header+} data=%Data% out=%Output% seed=1 Not adding a normalizer. 
-Auto-tuning parameters: UseCat = False +Auto-tuning parameters: UseCategoricalSplit = False LightGBM objective=regression Not training a calibrator because it is not needed. L1(avg): 3.472291 diff --git a/test/BaselineOutput/Common/LightGBMR/LightGBMRegMae-CV-generatedRegressionDataset.MAE-out.txt b/test/BaselineOutput/Common/LightGBMR/LightGBMRegMae-CV-generatedRegressionDataset.MAE-out.txt index c2530555e1..4550a80d3c 100644 --- a/test/BaselineOutput/Common/LightGBMR/LightGBMRegMae-CV-generatedRegressionDataset.MAE-out.txt +++ b/test/BaselineOutput/Common/LightGBMR/LightGBMRegMae-CV-generatedRegressionDataset.MAE-out.txt @@ -1,10 +1,10 @@ maml.exe CV tr=LightGBMR{nt=1 iter=50 em=mae v=+ lr=0.2 mil=10 nl=20} threads=- dout=%Output% loader=Text{col=Label:R4:11 col=Features:R4:0-10 sep=; header+} data=%Data% seed=1 Not adding a normalizer. -Auto-tuning parameters: UseCat = False +Auto-tuning parameters: UseCategoricalSplit = False LightGBM objective=regression Not training a calibrator because it is not needed. Not adding a normalizer. -Auto-tuning parameters: UseCat = False +Auto-tuning parameters: UseCategoricalSplit = False LightGBM objective=regression Not training a calibrator because it is not needed. L1(avg): 27.482854 diff --git a/test/BaselineOutput/Common/LightGBMR/LightGBMRegMae-TrainTest-generatedRegressionDataset.MAE-out.txt b/test/BaselineOutput/Common/LightGBMR/LightGBMRegMae-TrainTest-generatedRegressionDataset.MAE-out.txt index aaad5d20e5..59d2ceaa05 100644 --- a/test/BaselineOutput/Common/LightGBMR/LightGBMRegMae-TrainTest-generatedRegressionDataset.MAE-out.txt +++ b/test/BaselineOutput/Common/LightGBMR/LightGBMRegMae-TrainTest-generatedRegressionDataset.MAE-out.txt @@ -1,6 +1,6 @@ maml.exe TrainTest test=%Data% tr=LightGBMR{nt=1 iter=50 em=mae v=+ lr=0.2 mil=10 nl=20} dout=%Output% loader=Text{col=Label:R4:11 col=Features:R4:0-10 sep=; header+} data=%Data% out=%Output% seed=1 Not adding a normalizer. -Auto-tuning parameters: UseCat = False +Auto-tuning parameters: UseCategoricalSplit = False LightGBM objective=regression Not training a calibrator because it is not needed. L1(avg): 3.428896 diff --git a/test/BaselineOutput/Common/LightGBMR/LightGBMRegRmse-CV-generatedRegressionDataset.RMSE-out.txt b/test/BaselineOutput/Common/LightGBMR/LightGBMRegRmse-CV-generatedRegressionDataset.RMSE-out.txt index 483c724038..71d131bb5a 100644 --- a/test/BaselineOutput/Common/LightGBMR/LightGBMRegRmse-CV-generatedRegressionDataset.RMSE-out.txt +++ b/test/BaselineOutput/Common/LightGBMR/LightGBMRegRmse-CV-generatedRegressionDataset.RMSE-out.txt @@ -1,10 +1,10 @@ maml.exe CV tr=LightGBMR{nt=1 iter=50 em=rmse v=+ lr=0.2 mil=10 nl=20} threads=- dout=%Output% loader=Text{col=Label:R4:11 col=Features:R4:0-10 sep=; header+} data=%Data% seed=1 Not adding a normalizer. -Auto-tuning parameters: UseCat = False +Auto-tuning parameters: UseCategoricalSplit = False LightGBM objective=regression Not training a calibrator because it is not needed. Not adding a normalizer. -Auto-tuning parameters: UseCat = False +Auto-tuning parameters: UseCategoricalSplit = False LightGBM objective=regression Not training a calibrator because it is not needed. 
L1(avg): 27.482854 diff --git a/test/BaselineOutput/Common/LightGBMR/LightGBMRegRmse-TrainTest-generatedRegressionDataset.RMSE-out.txt b/test/BaselineOutput/Common/LightGBMR/LightGBMRegRmse-TrainTest-generatedRegressionDataset.RMSE-out.txt index 1ed592dd87..c919475347 100644 --- a/test/BaselineOutput/Common/LightGBMR/LightGBMRegRmse-TrainTest-generatedRegressionDataset.RMSE-out.txt +++ b/test/BaselineOutput/Common/LightGBMR/LightGBMRegRmse-TrainTest-generatedRegressionDataset.RMSE-out.txt @@ -1,6 +1,6 @@ maml.exe TrainTest test=%Data% tr=LightGBMR{nt=1 iter=50 em=rmse v=+ lr=0.2 mil=10 nl=20} dout=%Output% loader=Text{col=Label:R4:11 col=Features:R4:0-10 sep=; header+} data=%Data% out=%Output% seed=1 Not adding a normalizer. -Auto-tuning parameters: UseCat = False +Auto-tuning parameters: UseCategoricalSplit = False LightGBM objective=regression Not training a calibrator because it is not needed. L1(avg): 3.428896 diff --git a/test/Microsoft.ML.Predictor.Tests/TestPredictors.cs b/test/Microsoft.ML.Predictor.Tests/TestPredictors.cs index 731b013189..cc2257fe71 100644 --- a/test/Microsoft.ML.Predictor.Tests/TestPredictors.cs +++ b/test/Microsoft.ML.Predictor.Tests/TestPredictors.cs @@ -777,8 +777,8 @@ public void TestMultiClassEnsembleCombiner() LightGbm.TrainMultiClass(Env, new Options { FeatureColumnName = "Features", - NumBoostRound = 5, - NumLeaves = 4, + NumberOfIterations = 5, + NumberOfLeaves = 4, LabelColumnName = DefaultColumnNames.Label, TrainingData = dataView }).PredictorModel, diff --git a/test/Microsoft.ML.StaticPipelineTesting/Training.cs b/test/Microsoft.ML.StaticPipelineTesting/Training.cs index 3daaa1f3c2..cfdd9851d6 100644 --- a/test/Microsoft.ML.StaticPipelineTesting/Training.cs +++ b/test/Microsoft.ML.StaticPipelineTesting/Training.cs @@ -576,8 +576,8 @@ public void LightGbmRegression() var est = reader.MakeNewEstimator() .Append(r => (r.label, score: catalog.Trainers.LightGbm(r.label, r.features, - numBoostRound: 10, - numLeaves: 5, + numberOfIterations: 10, + numberOfLeaves: 5, onFit: (p) => { pred = p; }))); var pipe = reader.Append(est); diff --git a/test/Microsoft.ML.Tests/OnnxConversionTest.cs b/test/Microsoft.ML.Tests/OnnxConversionTest.cs index 492a71a501..0983dbef3d 100644 --- a/test/Microsoft.ML.Tests/OnnxConversionTest.cs +++ b/test/Microsoft.ML.Tests/OnnxConversionTest.cs @@ -345,7 +345,7 @@ public void LightGbmBinaryClassificationOnnxConversionTest() var dynamicPipeline = mlContext.Transforms.Normalize("FeatureVector") .AppendCacheCheckpoint(mlContext) - .Append(mlContext.Regression.Trainers.LightGbm(labelColumnName: "Target", featureColumnName: "FeatureVector", numBoostRound: 3, numLeaves: 16, minDataPerLeaf: 100)); + .Append(mlContext.Regression.Trainers.LightGbm(labelColumnName: "Target", featureColumnName: "FeatureVector", numberOfIterations: 3, leafCount: 16, minimumDataPerLeaf: 100)); var model = dynamicPipeline.Fit(data); // Step 2: Convert ML.NET model to ONNX format and save it as a file. diff --git a/test/Microsoft.ML.Tests/ScenariosWithDirectInstantiation/TensorflowTests.cs b/test/Microsoft.ML.Tests/ScenariosWithDirectInstantiation/TensorflowTests.cs index 8e6823739c..c495ae0428 100644 --- a/test/Microsoft.ML.Tests/ScenariosWithDirectInstantiation/TensorflowTests.cs +++ b/test/Microsoft.ML.Tests/ScenariosWithDirectInstantiation/TensorflowTests.cs @@ -685,8 +685,8 @@ private void ExecuteTFTransformMNISTConvTrainingTest(bool shuffle, int? 
shuffleS LabelColumnName = "Label", FeatureColumnName = "Features", Seed = 1, - NThread = 1, - NumBoostRound = 1 + NumberOfThreads = 1, + NumberOfIterations = 1 })); var trainedModel = pipe.Fit(preprocessedTrainData); diff --git a/test/Microsoft.ML.Tests/TrainerEstimators/TreeEstimators.cs b/test/Microsoft.ML.Tests/TrainerEstimators/TreeEstimators.cs index 444db573da..db58372e6e 100644 --- a/test/Microsoft.ML.Tests/TrainerEstimators/TreeEstimators.cs +++ b/test/Microsoft.ML.Tests/TrainerEstimators/TreeEstimators.cs @@ -50,9 +50,9 @@ public void LightGBMBinaryEstimator() var trainer = ML.BinaryClassification.Trainers.LightGbm(new Options { - NumLeaves = 10, - NThread = 1, - MinDataPerLeaf = 2, + NumberOfLeaves = 10, + NumberOfThreads = 1, + MinimumDataPerLeaf = 2, }); var pipeWithTrainer = pipe.Append(trainer); @@ -169,9 +169,9 @@ public void LightGBMRegressorEstimator() var dataView = GetRegressionPipeline(); var trainer = ML.Regression.Trainers.LightGbm(new Options { - NThread = 1, + NumberOfThreads = 1, NormalizeFeatures = NormalizeOption.Warn, - CatL2 = 5, + L2Categorical = 5, }); TestEstimatorCore(trainer, dataView); @@ -295,10 +295,10 @@ private void LightGbmHelper(bool useSoftmax, out string modelString, out List Date: Fri, 1 Mar 2019 16:58:27 -0800 Subject: [PATCH 2/4] - Updating from feedback --- .../LightGbmWithOptions.cs | 2 +- .../Dynamic/Trainers/Ranking/LightGbm.cs | 4 +- .../Trainers/Ranking/LightGbmWithOptions.cs | 2 +- .../Dynamic/Trainers/Regression/LightGbm.cs | 4 +- .../Regression/LightGbmWithOptions.cs | 2 +- .../Static/LightGBMBinaryClassification.cs | 4 +- .../LightGbmStaticExtensions.cs | 34 ++-- .../LightGbmArguments.cs | 104 +++++++---- .../LightGbmBinaryTrainer.cs | 19 +- src/Microsoft.ML.LightGBM/LightGbmCatalog.cs | 40 ++--- .../LightGbmMulticlassTrainer.cs | 30 ++-- .../LightGbmRankingTrainer.cs | 19 +- .../LightGbmRegressionTrainer.cs | 8 +- .../LightGbmTrainerBase.cs | 36 ++-- .../WrappedLightGbmBooster.cs | 14 +- src/Microsoft.ML.LightGBM/doc.xml | 79 --------- .../Common/EntryPoints/core_manifest.json | 165 ++++++++++++------ .../LightGBMMC/LightGBMMC-CV-iris.key-out.txt | 4 +- .../LightGBMMC-CV-iris.keyU404-out.txt | 4 +- .../LightGBMMC-TrainTest-iris.key-out.txt | 2 +- .../LightGBMMC-TrainTest-iris.keyU404-out.txt | 2 +- .../UnitTests/TestEntryPoints.cs | 3 +- .../Training.cs | 4 +- test/Microsoft.ML.Tests/OnnxConversionTest.cs | 2 +- .../TrainerEstimators/TreeEstimators.cs | 10 +- 25 files changed, 300 insertions(+), 297 deletions(-) delete mode 100644 src/Microsoft.ML.LightGBM/doc.xml diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/MulticlassClassification/LightGbmWithOptions.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/MulticlassClassification/LightGbmWithOptions.cs index 4971fe6180..3b3487bbb6 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/MulticlassClassification/LightGbmWithOptions.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/MulticlassClassification/LightGbmWithOptions.cs @@ -39,7 +39,7 @@ public static void Example() FeatureColumnName = "Features", Booster = new DartBooster.Options { - DropRate = 0.15, + TreeDropFraction = 0.15, XgboostDartMode = false } })) diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Ranking/LightGbm.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Ranking/LightGbm.cs index 576b98cbee..2e616dfee1 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Ranking/LightGbm.cs +++ 
b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Ranking/LightGbm.cs @@ -20,8 +20,8 @@ public static void Example() // Create the Estimator pipeline. For simplicity, we will train a small tree with 4 leaves and 2 boosting iterations. var pipeline = mlContext.Ranking.Trainers.LightGbm( - leafCount: 4, - minimumDataPerLeaf: 10, + numberOfLeaves: 4, + minimumExampleCountPerLeaf: 10, learningRate: 0.1, numberOfIterations: 2); diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Ranking/LightGbmWithOptions.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Ranking/LightGbmWithOptions.cs index 235d30e078..101d08ec13 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Ranking/LightGbmWithOptions.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Ranking/LightGbmWithOptions.cs @@ -24,7 +24,7 @@ public static void Example() new Options { NumberOfLeaves = 4, - MinimumDataPerLeaf = 10, + MinimumExampleCountPerGroup = 10, LearningRate = 0.1, NumberOfIterations = 2, Booster = new TreeBooster.Options diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/LightGbm.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/LightGbm.cs index 5f43a78c25..d23aebf141 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/LightGbm.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/LightGbm.cs @@ -35,8 +35,8 @@ public static void Example() var pipeline = mlContext.Transforms.Concatenate("Features", featureNames) .Append(mlContext.Regression.Trainers.LightGbm( labelColumnName: labelName, - leafCount: 4, - minimumDataPerLeaf: 6, + numberOfLeaves: 4, + minimumExampleCountPerLeaf: 6, learningRate: 0.001)); // Fit this pipeline to the training data. diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/LightGbmWithOptions.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/LightGbmWithOptions.cs index c2255554fb..260c546e7f 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/LightGbmWithOptions.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/LightGbmWithOptions.cs @@ -40,7 +40,7 @@ public static void Example() { LabelColumnName = labelName, NumberOfLeaves = 4, - MinimumDataPerLeaf = 6, + MinimumExampleCountPerLeaf = 6, LearningRate = 0.001, Booster = new GossBooster.Options { diff --git a/docs/samples/Microsoft.ML.Samples/Static/LightGBMBinaryClassification.cs b/docs/samples/Microsoft.ML.Samples/Static/LightGBMBinaryClassification.cs index 97a30add47..5228c356dc 100644 --- a/docs/samples/Microsoft.ML.Samples/Static/LightGBMBinaryClassification.cs +++ b/docs/samples/Microsoft.ML.Samples/Static/LightGBMBinaryClassification.cs @@ -78,8 +78,8 @@ public static void LightGbmBinaryClassification() Score: mlContext.BinaryClassification.Trainers.LightGbm( row.Label, row.Features, - numLeaves: 4, - minDataPerLeaf: 6, + numberOfLeaves: 4, + minimumExampleCountPerLeaf: 6, learningRate: 0.001))) .Append(row => ( Label: row.Label, diff --git a/src/Microsoft.ML.LightGBM.StaticPipe/LightGbmStaticExtensions.cs b/src/Microsoft.ML.LightGBM.StaticPipe/LightGbmStaticExtensions.cs index 8dd25b2408..cfa2126526 100644 --- a/src/Microsoft.ML.LightGBM.StaticPipe/LightGbmStaticExtensions.cs +++ b/src/Microsoft.ML.LightGBM.StaticPipe/LightGbmStaticExtensions.cs @@ -22,7 +22,7 @@ public static class LightGbmStaticExtensions /// The features column. /// The weights column. /// The number of leaves to use. 
- /// The minimal number of documents allowed in a leaf of the tree, out of the subsampled data. + /// The minimal number of data points allowed in a leaf of the tree, out of the subsampled data. /// The learning rate. /// Number of iterations. /// A delegate that is called every time the @@ -104,10 +104,10 @@ public static Scalar LightGbm(this RegressionCatalog.RegressionTrainers c /// The label column. /// The features column. /// The weights column. - /// The number of leaves to use. - /// Number of iterations. - /// The minimal number of documents allowed in a leaf of the tree, out of the subsampled data. + /// The number of leaves to use. + /// The minimal number of data points allowed in a leaf of the tree, out of the subsampled data. /// The learning rate. + /// Number of iterations. /// A delegate that is called every time the /// method is called on the /// instance created out of this. This delegate will receive @@ -125,19 +125,19 @@ public static (Scalar score, Scalar probability, Scalar pred Scalar label, Vector features, Scalar weights = null, - int? numLeaves = null, - int? minDataPerLeaf = null, + int? numberOfLeaves = null, + int? minimumExampleCountPerLeaf = null, double? learningRate = null, - int numBoostRound = Options.Defaults.NumberOfIterations, + int numberOfIterations = Options.Defaults.NumberOfIterations, Action> onFit = null) { - CheckUserValues(label, features, weights, numLeaves, minDataPerLeaf, learningRate, numBoostRound, onFit); + CheckUserValues(label, features, weights, numberOfLeaves, minimumExampleCountPerLeaf, learningRate, numberOfIterations, onFit); var rec = new TrainerEstimatorReconciler.BinaryClassifier( (env, labelName, featuresName, weightsName) => { - var trainer = new LightGbmBinaryTrainer(env, labelName, featuresName, weightsName, numLeaves, - minDataPerLeaf, learningRate, numBoostRound); + var trainer = new LightGbmBinaryTrainer(env, labelName, featuresName, weightsName, numberOfLeaves, + minimumExampleCountPerLeaf, learningRate, numberOfIterations); if (onFit != null) return trainer.WithOnFitDelegate(trans => onFit(trans.Model)); @@ -197,9 +197,9 @@ public static (Scalar score, Scalar probability, Scalar pred /// The groupId column. /// The weights column. /// The number of leaves to use. - /// Number of iterations. - /// The minimal number of documents allowed in a leaf of the tree, out of the subsampled data. + /// The minimal number of data points allowed in a leaf of the tree, out of the subsampled data. /// The learning rate. + /// Number of iterations. /// A delegate that is called every time the /// method is called on the /// instance created out of this. This delegate will receive @@ -285,7 +285,7 @@ public static Scalar LightGbm(this RankingCatalog.RankingTrainers c /// The features, or independent variables. /// The weights column. /// The number of leaves to use. - /// The minimal number of documents allowed in a leaf of the tree, out of the subsampled data. + /// The minimal number of data points allowed in a leaf of the tree, out of the subsampled data. /// The learning rate. /// Number of iterations. /// A delegate that is called every time the @@ -370,8 +370,8 @@ public static (Vector score, Key predictedLabel) } private static void CheckUserValues(PipelineColumn label, Vector features, Scalar weights, - int? numLeaves, - int? minDataPerLeaf, + int? numberOfLeaves, + int? minimumExampleCountPerLeaf, double? 
learningRate,
            int numBoostRound,
            Delegate onFit)
@@ -379,8 +379,8 @@ private static void CheckUserValues(PipelineColumn label, Vector<float> features
            Contracts.CheckValue(label, nameof(label));
            Contracts.CheckValue(features, nameof(features));
            Contracts.CheckValueOrNull(weights);
-            Contracts.CheckParam(!(numLeaves < 2), nameof(numLeaves), "Must be at least 2.");
-            Contracts.CheckParam(!(minDataPerLeaf <= 0), nameof(minDataPerLeaf), "Must be positive");
+            Contracts.CheckParam(!(numberOfLeaves < 2), nameof(numberOfLeaves), "Must be at least 2.");
+            Contracts.CheckParam(!(minimumExampleCountPerLeaf <= 0), nameof(minimumExampleCountPerLeaf), "Must be positive");
            Contracts.CheckParam(!(learningRate <= 0), nameof(learningRate), "Must be positive");
            Contracts.CheckParam(numBoostRound > 0, nameof(numBoostRound), "Must be positive");
            Contracts.CheckValueOrNull(onFit);
diff --git a/src/Microsoft.ML.LightGBM/LightGbmArguments.cs b/src/Microsoft.ML.LightGBM/LightGbmArguments.cs
index 047c19b88d..eb0e70ac02 100644
--- a/src/Microsoft.ML.LightGBM/LightGbmArguments.cs
+++ b/src/Microsoft.ML.LightGBM/LightGbmArguments.cs
@@ -75,6 +75,10 @@ internal virtual void UpdateParameters(Dictionary<string, object> res)
        private static string GetOptionName(string name)
        {
+            if (_nameMapping.ContainsKey(name))
+                return _nameMapping[name];
+
+            // Otherwise, convert the name to the LightGBM argument.
            StringBuilder strBuf = new StringBuilder();
            bool first = true;
            foreach (char c in name)
@@ -93,6 +97,31 @@ private static string GetOptionName(string name)
            return strBuf.ToString();
        }

+        // Static name map that maps friendly names to LightGBM arguments.
+        // Any name not listed here is converted mechanically
+        // (by lowercasing the words and joining them with underscores).
+        private static Dictionary<string, string> _nameMapping = new Dictionary<string, string>()
+        {
+            {nameof(TreeBooster.Options.MinimumSplitGain), "min_split_gain" },
+            {nameof(TreeBooster.Options.MaximumTreeDepth), "max_depth"},
+            {nameof(TreeBooster.Options.MinimumChildWeight), "min_child_weight"},
+            {nameof(TreeBooster.Options.SubsampleFrequency), "subsample_freq"},
+            {nameof(TreeBooster.Options.L1Regularization), "reg_alpha"},
+            {nameof(TreeBooster.Options.L2Regularization), "reg_lambda"},
+            {nameof(TreeBooster.Options.WeightOfPositiveExamples), "scale_pos_weight"},
+            {nameof(DartBooster.Options.TreeDropFraction), "drop_rate" },
+            {nameof(DartBooster.Options.MaximumDroppedTreesPerRound), "max_drop" },
+            {nameof(DartBooster.Options.SkipDropFraction), "skip_drop" },
+            {nameof(MinimumExampleCountPerLeaf), "min_data_per_leaf"},
+            {nameof(NumberOfLeaves), "num_leaves"},
+            {nameof(MaximumBinCountPerFeature), "max_bin" },
+            {nameof(CustomGains), "label_gain" },
+            {nameof(MinimumExampleCountPerGroup), "min_data_per_group" },
+            {nameof(MaximumCategoricalSplitPointCount), "max_cat_threshold" },
+            {nameof(CategoricalSmoothing), "cat_smooth" },
+            {nameof(L2CategoricalRegularization), "cat_l2" }
+        };
+
        [BestFriend]
        internal static class Defaults
        {
@@ -114,25 +143,25 @@ public class Options : ISupportBoosterParameterFactory
                HelpText = "Minimum loss reduction required to make a further partition on a leaf node of the tree. The larger, " +
                    "the more conservative the algorithm will be.")]
            [TlcModule.Range(Min = 0.0)]
-            public double MinSplitGain = 0;
+            public double MinimumSplitGain = 0;

            [Argument(ArgumentType.AtMostOnce,
                HelpText = "Maximum depth of a tree. 0 means no limit.
However, tree still grows by best-first.")] [TlcModule.Range(Min = 0, Max = int.MaxValue)] - public int MaxDepth = 0; + public int MaximumTreeDepth = 0; [Argument(ArgumentType.AtMostOnce, HelpText = "Minimum sum of instance weight(hessian) needed in a child. If the tree partition step results in a leaf " + "node with the sum of instance weight less than min_child_weight, then the building process will give up further partitioning. In linear regression mode, " + "this simply corresponds to minimum number of instances needed to be in each node. The larger, the more conservative the algorithm will be.")] [TlcModule.Range(Min = 0.0)] - public double MinChildWeight = 0.1; + public double MinimumChildWeight = 0.1; [Argument(ArgumentType.AtMostOnce, HelpText = "Subsample frequency for bagging. 0 means no subsample. " - + "If subsampleFreq > 0, it will use a subset(ratio=subsample) to train. And the subset will be updated on every Subsample iteratinos.")] + + "If subsampleFreq > 0, it will use a subset to train and the subset will be updated on every Subsample iteration.")] [TlcModule.Range(Min = 0, Max = int.MaxValue)] - public int SubsampleFreq = 0; + public int SubsampleFrequency = 0; [Argument(ArgumentType.AtMostOnce, HelpText = "Subsample ratio of the training instance. Setting it to 0.5 means that LightGBM randomly collected " + @@ -148,24 +177,25 @@ public class Options : ISupportBoosterParameterFactory [Argument(ArgumentType.AtMostOnce, HelpText = "L2 regularization term on weights, increasing this value will make model more conservative.", - ShortName = "l2")] + ShortName = "l2,RegLambda")] [TlcModule.Range(Min = 0.0)] [TGUI(Label = "Lambda(L2)", SuggestedSweeps = "0,0.5,1")] [TlcModule.SweepableDiscreteParam("RegLambda", new object[] { 0f, 0.5f, 1f })] - public double RegLambda = 0.01; + public double L2Regularization = 0.01; [Argument(ArgumentType.AtMostOnce, HelpText = "L1 regularization term on weights, increase this value will make model more conservative.", - ShortName = "l1")] + ShortName = "l1,RegAlpha")] [TlcModule.Range(Min = 0.0)] [TGUI(Label = "Alpha(L1)", SuggestedSweeps = "0,0.5,1")] [TlcModule.SweepableDiscreteParam("RegAlpha", new object[] { 0f, 0.5f, 1f })] - public double RegAlpha = 0; + public double L1Regularization = 0; [Argument(ArgumentType.AtMostOnce, HelpText = "Control the balance of positive and negative weights, useful for unbalanced classes." 
+ - " A typical value to consider: sum(negative cases) / sum(positive cases).")] - public double ScalePosWeight = 1; + " A typical value to consider: sum(negative cases) / sum(positive cases).", + ShortName = "ScalePosWeight")] + public double WeightOfPositiveExamples = 1; internal virtual IBoosterParameter CreateComponent(IHostEnvironment env) => new TreeBooster(this); @@ -175,13 +205,13 @@ public class Options : ISupportBoosterParameterFactory internal TreeBooster(Options options) : base(options) { - Contracts.CheckUserArg(BoosterParameterOptions.MinSplitGain >= 0, nameof(BoosterParameterOptions.MinSplitGain), "must be >= 0."); - Contracts.CheckUserArg(BoosterParameterOptions.MinChildWeight >= 0, nameof(BoosterParameterOptions.MinChildWeight), "must be >= 0."); + Contracts.CheckUserArg(BoosterParameterOptions.MinimumSplitGain >= 0, nameof(BoosterParameterOptions.MinimumSplitGain), "must be >= 0."); + Contracts.CheckUserArg(BoosterParameterOptions.MinimumChildWeight >= 0, nameof(BoosterParameterOptions.MinimumChildWeight), "must be >= 0."); Contracts.CheckUserArg(BoosterParameterOptions.Subsample > 0 && BoosterParameterOptions.Subsample <= 1, nameof(BoosterParameterOptions.Subsample), "must be in (0,1]."); Contracts.CheckUserArg(BoosterParameterOptions.FeatureFraction > 0 && BoosterParameterOptions.FeatureFraction <= 1, nameof(BoosterParameterOptions.FeatureFraction), "must be in (0,1]."); - Contracts.CheckUserArg(BoosterParameterOptions.RegLambda >= 0, nameof(BoosterParameterOptions.RegLambda), "must be >= 0."); - Contracts.CheckUserArg(BoosterParameterOptions.RegAlpha >= 0, nameof(BoosterParameterOptions.RegAlpha), "must be >= 0."); - Contracts.CheckUserArg(BoosterParameterOptions.ScalePosWeight > 0, nameof(BoosterParameterOptions.ScalePosWeight), "must be >= 0."); + Contracts.CheckUserArg(BoosterParameterOptions.L2Regularization >= 0, nameof(BoosterParameterOptions.L2Regularization), "must be >= 0."); + Contracts.CheckUserArg(BoosterParameterOptions.L1Regularization >= 0, nameof(BoosterParameterOptions.L1Regularization), "must be >= 0."); + Contracts.CheckUserArg(BoosterParameterOptions.WeightOfPositiveExamples > 0, nameof(BoosterParameterOptions.WeightOfPositiveExamples), "must be >= 0."); } internal override void UpdateParameters(Dictionary res) @@ -201,15 +231,15 @@ public sealed class Options : TreeBooster.Options { [Argument(ArgumentType.AtMostOnce, HelpText = "The drop ratio for trees. 
Range:(0,1).")] [TlcModule.Range(Inf = 0.0, Max = 1.0)] - public double DropRate = 0.1; + public double TreeDropFraction = 0.1; [Argument(ArgumentType.AtMostOnce, HelpText = "Maximum number of dropped tree in a boosting round.")] [TlcModule.Range(Inf = 0, Max = int.MaxValue)] - public int MaxDrop = 1; + public int MaximumDroppedTreesPerRound = 1; [Argument(ArgumentType.AtMostOnce, HelpText = "Probability for not dropping in a boosting round.")] [TlcModule.Range(Inf = 0.0, Max = 1.0)] - public double SkipDrop = 0.5; + public double SkipDropFraction = 0.5; [Argument(ArgumentType.AtMostOnce, HelpText = "True will enable xgboost dart mode.")] public bool XgboostDartMode = false; @@ -223,8 +253,8 @@ public sealed class Options : TreeBooster.Options internal DartBooster(Options options) : base(options) { - Contracts.CheckUserArg(BoosterParameterOptions.DropRate > 0 && BoosterParameterOptions.DropRate < 1, nameof(BoosterParameterOptions.DropRate), "must be in (0,1)."); - Contracts.CheckUserArg(BoosterParameterOptions.SkipDrop >= 0 && BoosterParameterOptions.SkipDrop < 1, nameof(BoosterParameterOptions.SkipDrop), "must be in [0,1)."); + Contracts.CheckUserArg(BoosterParameterOptions.TreeDropFraction > 0 && BoosterParameterOptions.TreeDropFraction < 1, nameof(BoosterParameterOptions.TreeDropFraction), "must be in (0,1)."); + Contracts.CheckUserArg(BoosterParameterOptions.SkipDropFraction >= 0 && BoosterParameterOptions.SkipDropFraction < 1, nameof(BoosterParameterOptions.SkipDropFraction), "must be in [0,1)."); } internal override void UpdateParameters(Dictionary res) @@ -304,10 +334,10 @@ public enum EvalMetricType SortOrder = 2, ShortName = "mil", NullName = "")] [TGUI(Label = "Min Documents In Leaves", SuggestedSweeps = "1,10,20,50 ")] [TlcModule.SweepableDiscreteParamAttribute("MinDataPerLeaf", new object[] { 1, 10, 20, 50 })] - public int? MinimumDataPerLeaf; + public int? MinimumExampleCountPerLeaf; [Argument(ArgumentType.AtMostOnce, HelpText = "Maximum number of bucket bin for features.", ShortName = "mb")] - public int MaximumBin = 255; + public int MaximumBinCountPerFeature = 255; [Argument(ArgumentType.Multiple, HelpText = "Which booster to use, can be gbtree, gblinear or dart. gbtree and dart use tree based model while gblinear uses linear function.", SortOrder = 3)] public ISupportBoosterParameterFactory Booster = new TreeBooster.Options(); @@ -328,7 +358,7 @@ public enum EvalMetricType [Argument(ArgumentType.AtMostOnce, HelpText = "Use softmax loss for the multi classification.")] [TlcModule.SweepableDiscreteParam("UseSoftmax", new object[] { true, false })] - public bool? UseSoftMaximum; + public bool? 
UseSoftMax;

            [Argument(ArgumentType.AtMostOnce, HelpText = "Rounds of early stopping, 0 will disable it.",
                ShortName = "es")]
@@ -357,12 +387,12 @@ public enum EvalMetricType
            [Argument(ArgumentType.AtMostOnce, HelpText = "Minimum number of instances per categorical group.", ShortName = "mdpg")]
            [TlcModule.Range(Inf = 0, Max = int.MaxValue)]
            [TlcModule.SweepableDiscreteParam("MinDataPerGroup", new object[] { 10, 50, 100, 200 })]
-            public int MinimumDataPerGroup = 100;
+            public int MinimumExampleCountPerGroup = 100;

            [Argument(ArgumentType.AtMostOnce, HelpText = "Max number of categorical thresholds.", ShortName = "maxcat")]
            [TlcModule.Range(Inf = 0, Max = int.MaxValue)]
            [TlcModule.SweepableDiscreteParam("MaxCatThreshold", new object[] { 8, 16, 32, 64 })]
-            public int MaximumCategoricalThreshold = 32;
+            public int MaximumCategoricalSplitPointCount = 32;

            [Argument(ArgumentType.AtMostOnce, HelpText = "Laplace smooth term in categorical feature split. Avoid the bias of small categories.")]
            [TlcModule.Range(Min = 0.0)]
@@ -372,7 +402,7 @@ public enum EvalMetricType
            [Argument(ArgumentType.AtMostOnce, HelpText = "L2 Regularization for categorical split.")]
            [TlcModule.Range(Min = 0.0)]
            [TlcModule.SweepableDiscreteParam("CatL2", new object[] { 0.1, 0.5, 1, 5, 10 })]
-            public double L2Categorical = 10;
+            public double L2CategoricalRegularization = 10;

            [Argument(ArgumentType.AtMostOnce, HelpText = "Sets the random seed for LightGBM to use.")]
            public int? Seed;
@@ -383,14 +413,14 @@ public enum EvalMetricType
            internal Dictionary<string, object> ToDictionary(IHost host)
            {
                Contracts.CheckValue(host, nameof(host));
-                Contracts.CheckUserArg(MaximumBin > 0, nameof(MaximumBin), "must be > 0.");
+                Contracts.CheckUserArg(MaximumBinCountPerFeature > 0, nameof(MaximumBinCountPerFeature), "must be > 0.");
                Contracts.CheckUserArg(Sigmoid > 0, nameof(Sigmoid), "must be > 0.");
                Dictionary<string, object> res = new Dictionary<string, object>();

                var boosterParams = Booster.CreateComponent(host);
                boosterParams.UpdateParameters(res);

-                res["max_bin"] = MaximumBin;
+                res[GetOptionName(nameof(MaximumBinCountPerFeature))] = MaximumBinCountPerFeature;
                res["verbose"] = Silent ? "-1" : "1";

                if (NumberOfThreads.HasValue)
@@ -426,14 +456,14 @@ internal Dictionary<string, object> ToDictionary(IHost host)
                        break;
                }
                if (!string.IsNullOrEmpty(metric))
-                    res["metric"] = metric;
-                res["sigmoid"] = Sigmoid;
-                res["label_gain"] = CustomGains;
-                res["use_missing"] = UseMissing;
-                res["min_data_per_group"] = MinimumDataPerGroup;
-                res["max_cat_threshold"] = MaximumCategoricalThreshold;
-                res["cat_smooth"] = CategoricalSmoothing;
-                res["cat_l2"] = L2Categorical;
+                    res[GetOptionName(nameof(metric))] = metric;
+                res[GetOptionName(nameof(Sigmoid))] = Sigmoid;
+                res[GetOptionName(nameof(CustomGains))] = CustomGains;
+                res[GetOptionName(nameof(UseMissing))] = UseMissing;
+                res[GetOptionName(nameof(MinimumExampleCountPerGroup))] = MinimumExampleCountPerGroup;
+                res[GetOptionName(nameof(MaximumCategoricalSplitPointCount))] = MaximumCategoricalSplitPointCount;
+                res[GetOptionName(nameof(CategoricalSmoothing))] = CategoricalSmoothing;
+                res[GetOptionName(nameof(L2CategoricalRegularization))] = L2CategoricalRegularization;
                return res;
            }
        }
diff --git a/src/Microsoft.ML.LightGBM/LightGbmBinaryTrainer.cs b/src/Microsoft.ML.LightGBM/LightGbmBinaryTrainer.cs
index 21693a6882..33bb6a57e8 100644
--- a/src/Microsoft.ML.LightGBM/LightGbmBinaryTrainer.cs
+++ b/src/Microsoft.ML.LightGBM/LightGbmBinaryTrainer.cs
@@ -24,7 +24,6 @@ namespace Microsoft.ML.LightGBM
 {
-    ///
    public sealed class LightGbmBinaryModelParameters : TreeEnsembleModelParametersBasedOnRegressionTree
    {
        internal const string LoaderSignature = "LightGBMBinaryExec";
@@ -81,7 +80,13 @@ private static IPredictorProducing<float> Create(IHostEnvironment env, ModelLoadContext ctx)
        }
    }

-    ///
+    /// <summary>
+    /// Trains a Light GBM Model.
+    /// </summary>
+    /// <remarks>
+    /// Light GBM is an open source implementation of boosted trees.
+    /// GitHub: LightGBM
+    /// </remarks>
    public sealed class LightGbmBinaryTrainer : LightGbmTrainerBase<float,
        BinaryPredictionTransformer<CalibratedModelParametersBase<LightGbmBinaryModelParameters, PlattCalibrator>>,
        CalibratedModelParametersBase<LightGbmBinaryModelParameters, PlattCalibrator>>
@@ -105,19 +110,19 @@ internal LightGbmBinaryTrainer(IHostEnvironment env, Options options)
        /// The name of the label column.
        /// The name of the feature column.
        /// The name for the column containing the initial weight.
-        /// The number of leaves to use.
-        /// The minimal number of documents allowed in a leaf of the tree, out of the subsampled data.
+        /// The number of leaves to use.
+        /// The minimal number of data points allowed in a leaf of the tree, out of the subsampled data.
        /// The learning rate.
        /// Number of iterations.
        internal LightGbmBinaryTrainer(IHostEnvironment env,
            string labelColumnName = DefaultColumnNames.Label,
            string featureColumnName = DefaultColumnNames.Features,
            string weights = null,
-            int? leafCount = null,
-            int? minimumDataPerLeaf = null,
+            int? numberOfLeaves = null,
+            int? minimumExampleCountPerLeaf = null,
            double? learningRate = null,
            int numberOfIterations = LightGBM.Options.Defaults.NumberOfIterations)
-            : base(env, LoadNameValue, TrainerUtils.MakeBoolScalarLabel(labelColumnName), featureColumnName, weights, null, leafCount, minimumDataPerLeaf, learningRate, numberOfIterations)
+            : base(env, LoadNameValue, TrainerUtils.MakeBoolScalarLabel(labelColumnName), featureColumnName, weights, null, numberOfLeaves, minimumExampleCountPerLeaf, learningRate, numberOfIterations)
        {
        }
diff --git a/src/Microsoft.ML.LightGBM/LightGbmCatalog.cs b/src/Microsoft.ML.LightGBM/LightGbmCatalog.cs
index 03988ad45f..ce610af9b2 100644
--- a/src/Microsoft.ML.LightGBM/LightGbmCatalog.cs
+++ b/src/Microsoft.ML.LightGBM/LightGbmCatalog.cs
@@ -20,8 +20,8 @@ public static class LightGbmExtensions
        /// The name of the label column.
/// The name of the feature column. /// The name of the example weight column (optional). - /// The number of leaves to use. - /// The minimal number of documents allowed in a leaf of the tree, out of the subsampled data. + /// The number of leaves to use. + /// The minimal number of data points allowed in a leaf of the tree, out of the subsampled data. /// The learning rate. /// The number of iterations to use. /// @@ -35,14 +35,14 @@ public static LightGbmRegressorTrainer LightGbm(this RegressionCatalog.Regressio string labelColumnName = DefaultColumnNames.Label, string featureColumnName = DefaultColumnNames.Features, string exampleWeightColumnName = null, - int? leafCount = null, - int? minimumDataPerLeaf = null, + int? numberOfLeaves = null, + int? minimumExampleCountPerLeaf = null, double? learningRate = null, int numberOfIterations = Options.Defaults.NumberOfIterations) { Contracts.CheckValue(catalog, nameof(catalog)); var env = CatalogUtils.GetEnvironment(catalog); - return new LightGbmRegressorTrainer(env, labelColumnName, featureColumnName, exampleWeightColumnName, leafCount, minimumDataPerLeaf, learningRate, numberOfIterations); + return new LightGbmRegressorTrainer(env, labelColumnName, featureColumnName, exampleWeightColumnName, numberOfLeaves, minimumExampleCountPerLeaf, learningRate, numberOfIterations); } /// @@ -72,8 +72,8 @@ public static LightGbmRegressorTrainer LightGbm(this RegressionCatalog.Regressio /// The name of the label column. /// The name of the feature column. /// The name of the example weight column (optional). - /// The number of leaves to use. - /// The minimal number of documents allowed in a leaf of the tree, out of the subsampled data. + /// The number of leaves to use. + /// The minimal number of data points allowed in a leaf of the tree, out of the subsampled data. /// The learning rate. /// The number of iterations to use. /// @@ -87,14 +87,14 @@ public static LightGbmBinaryTrainer LightGbm(this BinaryClassificationCatalog.Bi string labelColumnName = DefaultColumnNames.Label, string featureColumnName = DefaultColumnNames.Features, string exampleWeightColumnName = null, - int? leafCount = null, - int? minimumDataPerLeaf = null, + int? numberOfLeaves = null, + int? minimumExampleCountPerLeaf = null, double? learningRate = null, int numberOfIterations = Options.Defaults.NumberOfIterations) { Contracts.CheckValue(catalog, nameof(catalog)); var env = CatalogUtils.GetEnvironment(catalog); - return new LightGbmBinaryTrainer(env, labelColumnName, featureColumnName, exampleWeightColumnName, leafCount, minimumDataPerLeaf, learningRate, numberOfIterations); + return new LightGbmBinaryTrainer(env, labelColumnName, featureColumnName, exampleWeightColumnName, numberOfLeaves, minimumExampleCountPerLeaf, learningRate, numberOfIterations); } /// @@ -125,8 +125,8 @@ public static LightGbmBinaryTrainer LightGbm(this BinaryClassificationCatalog.Bi /// The name of the feature column. /// The name of the group column. /// The name of the example weight column (optional). - /// The number of leaves to use. - /// The minimal number of documents allowed in a leaf of the tree, out of the subsampled data. + /// The number of leaves to use. + /// The minimal number of data points allowed in a leaf of the tree, out of the subsampled data. /// The learning rate. /// The number of iterations to use. 
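For orientation, the renamed catalog surface above reads as follows at a call site. This is a minimal C# sketch, not part of the patch: the MLContext construction and column names are illustrative, and only the LightGbm parameter names are taken from the signatures in this diff.

    using Microsoft.ML;

    public static class LightGbmRenameSketch
    {
        public static void Main()
        {
            var mlContext = new MLContext(seed: 1);

            // Formerly numLeaves, minDataPerLeaf and numBoostRound; the trainer
            // and its defaults are unchanged, only the parameter names differ.
            var trainer = mlContext.BinaryClassification.Trainers.LightGbm(
                labelColumnName: "Label",
                featureColumnName: "Features",
                numberOfLeaves: 4,
                minimumExampleCountPerLeaf: 6,
                learningRate: 0.001,
                numberOfIterations: 10);

            // Fit the estimator on an IDataView exactly as before the rename.
        }
    }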
public static LightGbmRankingTrainer LightGbm(this RankingCatalog.RankingTrainers catalog, @@ -134,14 +134,14 @@ public static LightGbmRankingTrainer LightGbm(this RankingCatalog.RankingTrainer string featureColumnName = DefaultColumnNames.Features, string rowGroupColumnName = DefaultColumnNames.GroupId, string exampleWeightColumnName = null, - int? leafCount = null, - int? minimumDataPerLeaf = null, + int? numberOfLeaves = null, + int? minimumExampleCountPerLeaf = null, double? learningRate = null, int numberOfIterations = Options.Defaults.NumberOfIterations) { Contracts.CheckValue(catalog, nameof(catalog)); var env = CatalogUtils.GetEnvironment(catalog); - return new LightGbmRankingTrainer(env, labelColumnName, featureColumnName, rowGroupColumnName, exampleWeightColumnName, leafCount, minimumDataPerLeaf, learningRate, numberOfIterations); + return new LightGbmRankingTrainer(env, labelColumnName, featureColumnName, rowGroupColumnName, exampleWeightColumnName, numberOfLeaves, minimumExampleCountPerLeaf, learningRate, numberOfIterations); } /// @@ -164,8 +164,8 @@ public static LightGbmRankingTrainer LightGbm(this RankingCatalog.RankingTrainer /// The name of the label column. /// The name of the feature column. /// The name of the example weight column (optional). - /// The number of leaves to use. - /// The minimal number of documents allowed in a leaf of the tree, out of the subsampled data. + /// The number of leaves to use. + /// The minimal number of data points allowed in a leaf of the tree, out of the subsampled data. /// The learning rate. /// The number of iterations to use. /// @@ -179,14 +179,14 @@ public static LightGbmMulticlassTrainer LightGbm(this MulticlassClassificationCa string labelColumnName = DefaultColumnNames.Label, string featureColumnName = DefaultColumnNames.Features, string exampleWeightColumnName = null, - int? leafCount = null, - int? minimumDataPerLeaf = null, + int? numberOfLeaves = null, + int? minimumExampleCountPerLeaf = null, double? learningRate = null, int numberOfIterations = Options.Defaults.NumberOfIterations) { Contracts.CheckValue(catalog, nameof(catalog)); var env = CatalogUtils.GetEnvironment(catalog); - return new LightGbmMulticlassTrainer(env, labelColumnName, featureColumnName, exampleWeightColumnName, leafCount, minimumDataPerLeaf, learningRate, numberOfIterations); + return new LightGbmMulticlassTrainer(env, labelColumnName, featureColumnName, exampleWeightColumnName, numberOfLeaves, minimumExampleCountPerLeaf, learningRate, numberOfIterations); } /// diff --git a/src/Microsoft.ML.LightGBM/LightGbmMulticlassTrainer.cs b/src/Microsoft.ML.LightGBM/LightGbmMulticlassTrainer.cs index 4298c05996..c6029123cf 100644 --- a/src/Microsoft.ML.LightGBM/LightGbmMulticlassTrainer.cs +++ b/src/Microsoft.ML.LightGBM/LightGbmMulticlassTrainer.cs @@ -19,8 +19,6 @@ namespace Microsoft.ML.LightGBM { - - /// public sealed class LightGbmMulticlassTrainer : LightGbmTrainerBase, MulticlassPredictionTransformer, OneVersusAllModelParameters> { internal const string Summary = "LightGBM Multi Class Classifier"; @@ -46,19 +44,19 @@ internal LightGbmMulticlassTrainer(IHostEnvironment env, Options options) /// The name of The label column. /// The name of the feature column. /// The name for the column containing the initial weight. - /// The number of leaves to use. - /// The minimal number of documents allowed in a leaf of the tree, out of the subsampled data. + /// The number of leaves to use. 
+ /// The minimal number of data points allowed in a leaf of the tree, out of the subsampled data. /// The learning rate. /// The number of iterations to use. internal LightGbmMulticlassTrainer(IHostEnvironment env, string labelColumnName = DefaultColumnNames.Label, string featureColumnName = DefaultColumnNames.Features, string weights = null, - int? numLeaves = null, - int? minDataPerLeaf = null, + int? numberOfLeaves = null, + int? minimumExampleCountPerLeaf = null, double? learningRate = null, int numberOfIterations = LightGBM.Options.Defaults.NumberOfIterations) - : base(env, LoadNameValue, TrainerUtils.MakeU4ScalarColumn(labelColumnName), featureColumnName, weights, null, numLeaves, minDataPerLeaf, learningRate, numberOfIterations) + : base(env, LoadNameValue, TrainerUtils.MakeU4ScalarColumn(labelColumnName), featureColumnName, weights, null, numberOfLeaves, minimumExampleCountPerLeaf, learningRate, numberOfIterations) { _numClass = -1; } @@ -162,17 +160,17 @@ private protected override void ConvertNaNLabels(IChannel ch, RoleMappedData dat protected override void GetDefaultParameters(IChannel ch, int numRow, bool hasCategorical, int totalCats, bool hiddenMsg = false) { base.GetDefaultParameters(ch, numRow, hasCategorical, totalCats, true); - int numLeaves = (int)Options["num_leaves"]; - int minDataPerLeaf = LightGbmTrainerOptions.MinimumDataPerLeaf ?? DefaultMinDataPerLeaf(numRow, numLeaves, _numClass); - Options["min_data_per_leaf"] = minDataPerLeaf; + int numberOfLeaves = (int)Options["num_leaves"]; + int minimumExampleCountPerLeaf = LightGbmTrainerOptions.MinimumExampleCountPerLeaf ?? DefaultMinDataPerLeaf(numRow, numberOfLeaves, _numClass); + Options["min_data_per_leaf"] = minimumExampleCountPerLeaf; if (!hiddenMsg) { if (!LightGbmTrainerOptions.LearningRate.HasValue) ch.Info("Auto-tuning parameters: " + nameof(LightGbmTrainerOptions.LearningRate) + " = " + Options["learning_rate"]); if (!LightGbmTrainerOptions.NumberOfLeaves.HasValue) - ch.Info("Auto-tuning parameters: " + nameof(LightGbmTrainerOptions.NumberOfLeaves) + " = " + numLeaves); - if (!LightGbmTrainerOptions.MinimumDataPerLeaf.HasValue) - ch.Info("Auto-tuning parameters: " + nameof(LightGbmTrainerOptions.MinimumDataPerLeaf) + " = " + minDataPerLeaf); + ch.Info("Auto-tuning parameters: " + nameof(LightGbmTrainerOptions.NumberOfLeaves) + " = " + numberOfLeaves); + if (!LightGbmTrainerOptions.MinimumExampleCountPerLeaf.HasValue) + ch.Info("Auto-tuning parameters: " + nameof(LightGbmTrainerOptions.MinimumExampleCountPerLeaf) + " = " + minimumExampleCountPerLeaf); } } @@ -184,14 +182,14 @@ private protected override void CheckAndUpdateParametersBeforeTraining(IChannel Options["num_class"] = _numClass; bool useSoftmax = false; - if (LightGbmTrainerOptions.UseSoftMaximum.HasValue) - useSoftmax = LightGbmTrainerOptions.UseSoftMaximum.Value; + if (LightGbmTrainerOptions.UseSoftMax.HasValue) + useSoftmax = LightGbmTrainerOptions.UseSoftMax.Value; else { if (labels.Length >= _minDataToUseSoftmax) useSoftmax = true; - ch.Info("Auto-tuning parameters: " + nameof(LightGbmTrainerOptions.UseSoftMaximum) + " = " + useSoftmax); + ch.Info("Auto-tuning parameters: " + nameof(LightGbmTrainerOptions.UseSoftMax) + " = " + useSoftmax); } if (useSoftmax) diff --git a/src/Microsoft.ML.LightGBM/LightGbmRankingTrainer.cs b/src/Microsoft.ML.LightGBM/LightGbmRankingTrainer.cs index d9870205a3..5e1cfe311d 100644 --- a/src/Microsoft.ML.LightGBM/LightGbmRankingTrainer.cs +++ b/src/Microsoft.ML.LightGBM/LightGbmRankingTrainer.cs @@ -70,7 +70,6 @@ 
private static LightGbmRankingModelParameters Create(IHostEnvironment env, Model } } - /// public sealed class LightGbmRankingTrainer : LightGbmTrainerBase, LightGbmRankingModelParameters> { internal const string UserName = "LightGBM Ranking"; @@ -90,26 +89,26 @@ internal LightGbmRankingTrainer(IHostEnvironment env, Options options) /// The private instance of . /// The name of the label column. /// The name of the feature column. - /// The name of the column containing the group ID. + /// The name of the column containing the group ID. /// The name of the optional column containing the initial weights. - /// The number of leaves to use. + /// The number of leaves to use. /// The learning rate. - /// The minimal number of documents allowed in a leaf of the tree, out of the subsampled data. + /// The minimal number of data points allowed in a leaf of the tree, out of the subsampled data. /// The number of iterations to use. internal LightGbmRankingTrainer(IHostEnvironment env, string labelColumnName = DefaultColumnNames.Label, string featureColumnName = DefaultColumnNames.Features, - string groupIdColumnName = DefaultColumnNames.GroupId, + string rowGroupColumnName = DefaultColumnNames.GroupId, string weightsColumnName = null, - int? leafCount = null, - int? minimumDataPerLeaf = null, + int? numberOfLeaves = null, + int? minimumExampleCountPerLeaf = null, double? learningRate = null, int numberOfIterations = LightGBM.Options.Defaults.NumberOfIterations) : base(env, LoadNameValue, TrainerUtils.MakeR4ScalarColumn(labelColumnName), - featureColumnName, weightsColumnName, groupIdColumnName, leafCount, - minimumDataPerLeaf, learningRate, numberOfIterations) + featureColumnName, weightsColumnName, rowGroupColumnName, numberOfLeaves, + minimumExampleCountPerLeaf, learningRate, numberOfIterations) { - Host.CheckNonEmpty(groupIdColumnName, nameof(groupIdColumnName)); + Host.CheckNonEmpty(rowGroupColumnName, nameof(rowGroupColumnName)); } private protected override void CheckDataValid(IChannel ch, RoleMappedData data) diff --git a/src/Microsoft.ML.LightGBM/LightGbmRegressionTrainer.cs b/src/Microsoft.ML.LightGBM/LightGbmRegressionTrainer.cs index f93e2126c6..9453cf6a2e 100644 --- a/src/Microsoft.ML.LightGBM/LightGbmRegressionTrainer.cs +++ b/src/Microsoft.ML.LightGBM/LightGbmRegressionTrainer.cs @@ -21,7 +21,6 @@ namespace Microsoft.ML.LightGBM { - /// public sealed class LightGbmRegressionModelParameters : TreeEnsembleModelParametersBasedOnRegressionTree { internal const string LoaderSignature = "LightGBMRegressionExec"; @@ -73,7 +72,6 @@ private static LightGbmRegressionModelParameters Create(IHostEnvironment env, Mo } } - /// public sealed class LightGbmRegressorTrainer : LightGbmTrainerBase, LightGbmRegressionModelParameters> { internal const string Summary = "LightGBM Regression"; @@ -91,7 +89,7 @@ public sealed class LightGbmRegressorTrainer : LightGbmTrainerBaseThe name of the feature column. /// The name for the column containing the initial weight. /// The number of leaves to use. - /// The minimal number of documents allowed in a leaf of the tree, out of the subsampled data. + /// The minimal number of data points allowed in a leaf of the tree, out of the subsampled data. /// The learning rate. /// Number of iterations. internal LightGbmRegressorTrainer(IHostEnvironment env, string labelColumnName = DefaultColumnNames.Label, string featureColumnName = DefaultColumnNames.Features, string weightsColumnName = null, int? numberOfLeaves = null, int?
minimumDataPerLeaf = null, + int? minimumExampleCountPerLeaf = null, double? learningRate = null, int numberOfIterations = LightGBM.Options.Defaults.NumberOfIterations) - : base(env, LoadNameValue, TrainerUtils.MakeR4ScalarColumn(labelColumnName), featureColumnName, weightsColumnName, null, numberOfLeaves, minimumDataPerLeaf, learningRate, numberOfIterations) + : base(env, LoadNameValue, TrainerUtils.MakeR4ScalarColumn(labelColumnName), featureColumnName, weightsColumnName, null, numberOfLeaves, minimumExampleCountPerLeaf, learningRate, numberOfIterations) { } diff --git a/src/Microsoft.ML.LightGBM/LightGbmTrainerBase.cs b/src/Microsoft.ML.LightGBM/LightGbmTrainerBase.cs index 7ae79c09fc..60f8e96bca 100644 --- a/src/Microsoft.ML.LightGBM/LightGbmTrainerBase.cs +++ b/src/Microsoft.ML.LightGBM/LightGbmTrainerBase.cs @@ -60,26 +60,26 @@ private protected LightGbmTrainerBase(IHostEnvironment env, string name, SchemaShape.Column labelColumn, string featureColumnName, - string weightColumnName, - string groupIdColumnName, - int? leafCount, - int? minimumDataPerLeaf, + string exampleWeightColumnName, + string rowGroupColumnName, + int? numberOfLeaves, + int? minimumExampleCountPerLeaf, double? learningRate, int numberOfIterations) : base(Contracts.CheckRef(env, nameof(env)).Register(name), TrainerUtils.MakeR4VecFeature(featureColumnName), - labelColumn, TrainerUtils.MakeR4ScalarWeightColumn(weightColumnName), TrainerUtils.MakeU4ScalarColumn(groupIdColumnName)) + labelColumn, TrainerUtils.MakeR4ScalarWeightColumn(exampleWeightColumnName), TrainerUtils.MakeU4ScalarColumn(rowGroupColumnName)) { LightGbmTrainerOptions = new Options(); - LightGbmTrainerOptions.NumberOfLeaves = leafCount; - LightGbmTrainerOptions.MinimumDataPerLeaf = minimumDataPerLeaf; + LightGbmTrainerOptions.NumberOfLeaves = numberOfLeaves; + LightGbmTrainerOptions.MinimumExampleCountPerLeaf = minimumExampleCountPerLeaf; LightGbmTrainerOptions.LearningRate = learningRate; LightGbmTrainerOptions.NumberOfIterations = numberOfIterations; LightGbmTrainerOptions.LabelColumnName = labelColumn.Name; LightGbmTrainerOptions.FeatureColumnName = featureColumnName; - LightGbmTrainerOptions.ExampleWeightColumnName = weightColumnName; - LightGbmTrainerOptions.RowGroupColumnName = groupIdColumnName; + LightGbmTrainerOptions.ExampleWeightColumnName = exampleWeightColumnName; + LightGbmTrainerOptions.RowGroupColumnName = rowGroupColumnName; InitParallelTraining(); } @@ -168,19 +168,19 @@ private protected virtual void CheckDataValid(IChannel ch, RoleMappedData data) protected virtual void GetDefaultParameters(IChannel ch, int numRow, bool hasCategarical, int totalCats, bool hiddenMsg = false) { double learningRate = LightGbmTrainerOptions.LearningRate ?? DefaultLearningRate(numRow, hasCategarical, totalCats); - int numLeaves = LightGbmTrainerOptions.NumberOfLeaves ?? DefaultNumLeaves(numRow, hasCategarical, totalCats); - int minDataPerLeaf = LightGbmTrainerOptions.MinimumDataPerLeaf ?? DefaultMinDataPerLeaf(numRow, numLeaves, 1); + int numberOfLeaves = LightGbmTrainerOptions.NumberOfLeaves ?? DefaultNumLeaves(numRow, hasCategarical, totalCats); + int minimumExampleCountPerLeaf = LightGbmTrainerOptions.MinimumExampleCountPerLeaf ?? 
DefaultMinDataPerLeaf(numRow, numberOfLeaves, 1); Options["learning_rate"] = learningRate; - Options["num_leaves"] = numLeaves; - Options["min_data_per_leaf"] = minDataPerLeaf; + Options["num_leaves"] = numberOfLeaves; + Options["min_data_per_leaf"] = minimumExampleCountPerLeaf; if (!hiddenMsg) { if (!LightGbmTrainerOptions.LearningRate.HasValue) ch.Info("Auto-tuning parameters: " + nameof(LightGbmTrainerOptions.LearningRate) + " = " + learningRate); if (!LightGbmTrainerOptions.NumberOfLeaves.HasValue) - ch.Info("Auto-tuning parameters: " + nameof(LightGbmTrainerOptions.NumberOfLeaves) + " = " + numLeaves); - if (!LightGbmTrainerOptions.MinimumDataPerLeaf.HasValue) - ch.Info("Auto-tuning parameters: " + nameof(LightGbmTrainerOptions.MinimumDataPerLeaf) + " = " + minDataPerLeaf); + ch.Info("Auto-tuning parameters: " + nameof(LightGbmTrainerOptions.NumberOfLeaves) + " = " + numberOfLeaves); + if (!LightGbmTrainerOptions.MinimumExampleCountPerLeaf.HasValue) + ch.Info("Auto-tuning parameters: " + nameof(LightGbmTrainerOptions.MinimumExampleCountPerLeaf) + " = " + minimumExampleCountPerLeaf); } } @@ -873,11 +873,11 @@ private static int DefaultNumLeaves(int numRow, bool useCat, int totalCats) return 30; } - protected static int DefaultMinDataPerLeaf(int numRow, int numLeaves, int numClass) + protected static int DefaultMinDataPerLeaf(int numRow, int numberOfLeaves, int numClass) { if (numClass > 1) { - int ret = numRow / numLeaves / numClass / 10; + int ret = numRow / numberOfLeaves / numClass / 10; ret = Math.Max(ret, 5); ret = Math.Min(ret, 50); return ret; diff --git a/src/Microsoft.ML.LightGBM/WrappedLightGbmBooster.cs b/src/Microsoft.ML.LightGBM/WrappedLightGbmBooster.cs index e5384a8c6c..21c3eb4a13 100644 --- a/src/Microsoft.ML.LightGBM/WrappedLightGbmBooster.cs +++ b/src/Microsoft.ML.LightGBM/WrappedLightGbmBooster.cs @@ -205,9 +205,9 @@ public InternalTreeEnsemble GetModel(int[] categoricalFeatureBoudaries) kvPairs[kv[0].Trim()] = kv[1].Trim(); ++i; } - int numLeaves = int.Parse(kvPairs["num_leaves"], CultureInfo.InvariantCulture); + int numberOfLeaves = int.Parse(kvPairs["num_leaves"], CultureInfo.InvariantCulture); int numCat = int.Parse(kvPairs["num_cat"], CultureInfo.InvariantCulture); - if (numLeaves > 1) + if (numberOfLeaves > 1) { var leftChild = Str2IntArray(kvPairs["left_child"], ' '); var rightChild = Str2IntArray(kvPairs["right_child"], ' '); @@ -217,12 +217,12 @@ public InternalTreeEnsemble GetModel(int[] categoricalFeatureBoudaries) var leafOutput = Str2DoubleArray(kvPairs["leaf_value"], ' '); var decisionType = Str2UIntArray(kvPairs["decision_type"], ' '); var defaultValue = GetDefalutValue(threshold, decisionType); - var categoricalSplitFeatures = new int[numLeaves - 1][]; - var categoricalSplit = new bool[numLeaves - 1]; + var categoricalSplitFeatures = new int[numberOfLeaves - 1][]; + var categoricalSplit = new bool[numberOfLeaves - 1]; if (categoricalFeatureBoudaries != null) { // Add offsets to split features. 
- for (int node = 0; node < numLeaves - 1; ++node) + for (int node = 0; node < numberOfLeaves - 1; ++node) splitFeature[node] = categoricalFeatureBoudaries[splitFeature[node]]; } @@ -230,7 +230,7 @@ public InternalTreeEnsemble GetModel(int[] categoricalFeatureBoudaries) { var catBoundaries = Str2IntArray(kvPairs["cat_boundaries"], ' '); var catThreshold = Str2UIntArray(kvPairs["cat_threshold"], ' '); - for (int node = 0; node < numLeaves - 1; ++node) + for (int node = 0; node < numberOfLeaves - 1; ++node) { if (GetIsCategoricalSplit(decisionType[node])) { @@ -254,7 +254,7 @@ public InternalTreeEnsemble GetModel(int[] categoricalFeatureBoudaries) } } } - InternalRegressionTree tree = InternalRegressionTree.Create(numLeaves, splitFeature, splitGain, + InternalRegressionTree tree = InternalRegressionTree.Create(numberOfLeaves, splitFeature, splitGain, threshold.Select(x => (float)(x)).ToArray(), defaultValue.Select(x => (float)(x)).ToArray(), leftChild, rightChild, leafOutput, categoricalSplitFeatures, categoricalSplit); res.AddTree(tree); diff --git a/src/Microsoft.ML.LightGBM/doc.xml b/src/Microsoft.ML.LightGBM/doc.xml deleted file mode 100644 index dfa4ccfad7..0000000000 --- a/src/Microsoft.ML.LightGBM/doc.xml +++ /dev/null @@ -1,79 +0,0 @@ - - - - - - - Trains a Light GBM Model. - - - Light GBM is an open source implementation of boosted trees. - GitHub: LightGBM - - - - - - new LightGbmBinaryClassifier - { - NumberOfIterations = 200, - LearningRate = 0.5f, - NumberOfLeaves = 32, - MinimumDataPerLeaf = 20 - } - - - - - - - new LightGbmClassifier - { - NumberOfIterations = 200, - LearningRate = 0.5f, - NumberOfLeaves = 32, - MinimumDataPerLeaf = 20 - } - - - - - - - new LightGbmRegressor - { - NumberOfIterations = 100, - LearningRate = 0.5f, - NumberOfLeaves = 32, - MinimumDataPerLeaf = 20, - Booster = new DartBoosterParameterFunction - { - XgboostDartMode = true, - UniformDrop = true - } - } - - - - - - - new LightGbmRanker - { - NumberOfIterations = 100, - LearningRate = 0.5f, - NumberOfLeaves = 32, - MinimumDataPerLeaf = 20, - Booster = new GbdtBoosterParameterFunction - { - MinSplitGain = 3, - MaxDepth = 200, - Subsample = 0.5 - } - } - - - - - - \ No newline at end of file diff --git a/test/BaselineOutput/Common/EntryPoints/core_manifest.json b/test/BaselineOutput/Common/EntryPoints/core_manifest.json index 8654c4c032..bf4f4c5408 100644 --- a/test/BaselineOutput/Common/EntryPoints/core_manifest.json +++ b/test/BaselineOutput/Common/EntryPoints/core_manifest.json @@ -11200,7 +11200,7 @@ } }, { - "Name": "MinimumDataPerLeaf", + "Name": "MinimumExampleCountPerLeaf", "Type": "Int", "Desc": "Minimum number of instances needed in a child.", "Aliases": [ @@ -11322,7 +11322,7 @@ "Default": "Auto" }, { - "Name": "MaximumBin", + "Name": "MaximumBinCountPerFeature", "Type": "Int", "Desc": "Maximum number of bucket bin for features.", "Aliases": [ @@ -11393,7 +11393,7 @@ "Default": "DefaultMetric" }, { - "Name": "UseSoftMaximum", + "Name": "UseSoftMax", "Type": "Bool", "Desc": "Use softmax loss for the multi classification.", "Required": false, @@ -11489,7 +11489,7 @@ } }, { - "Name": "MinimumDataPerGroup", + "Name": "MinimumExampleCountPerGroup", "Type": "Int", "Desc": "Minimum number of instances per categorical group.", "Aliases": [ @@ -11514,7 +11514,7 @@ } }, { - "Name": "MaximumCategoricalThreshold", + "Name": "MaximumCategoricalSplitPointCount", "Type": "Int", "Desc": "Max number of categorical thresholds.", "Aliases": [ @@ -11559,7 +11559,7 @@ } }, { - "Name": "L2Categorical", + "Name": 
"L2CategoricalRegularization", "Type": "Float", "Desc": "L2 Regularization for categorical split.", "Required": false, @@ -11703,7 +11703,7 @@ } }, { - "Name": "MinimumDataPerLeaf", + "Name": "MinimumExampleCountPerLeaf", "Type": "Int", "Desc": "Minimum number of instances needed in a child.", "Aliases": [ @@ -11825,7 +11825,7 @@ "Default": "Auto" }, { - "Name": "MaximumBin", + "Name": "MaximumBinCountPerFeature", "Type": "Int", "Desc": "Maximum number of bucket bin for features.", "Aliases": [ @@ -11896,7 +11896,7 @@ "Default": "DefaultMetric" }, { - "Name": "UseSoftMaximum", + "Name": "UseSoftMax", "Type": "Bool", "Desc": "Use softmax loss for the multi classification.", "Required": false, @@ -11992,7 +11992,7 @@ } }, { - "Name": "MinimumDataPerGroup", + "Name": "MinimumExampleCountPerGroup", "Type": "Int", "Desc": "Minimum number of instances per categorical group.", "Aliases": [ @@ -12017,7 +12017,7 @@ } }, { - "Name": "MaximumCategoricalThreshold", + "Name": "MaximumCategoricalSplitPointCount", "Type": "Int", "Desc": "Max number of categorical thresholds.", "Aliases": [ @@ -12062,7 +12062,7 @@ } }, { - "Name": "L2Categorical", + "Name": "L2CategoricalRegularization", "Type": "Float", "Desc": "L2 Regularization for categorical split.", "Required": false, @@ -12206,7 +12206,7 @@ } }, { - "Name": "MinimumDataPerLeaf", + "Name": "MinimumExampleCountPerLeaf", "Type": "Int", "Desc": "Minimum number of instances needed in a child.", "Aliases": [ @@ -12328,7 +12328,7 @@ "Default": "Auto" }, { - "Name": "MaximumBin", + "Name": "MaximumBinCountPerFeature", "Type": "Int", "Desc": "Maximum number of bucket bin for features.", "Aliases": [ @@ -12399,7 +12399,7 @@ "Default": "DefaultMetric" }, { - "Name": "UseSoftMaximum", + "Name": "UseSoftMax", "Type": "Bool", "Desc": "Use softmax loss for the multi classification.", "Required": false, @@ -12495,7 +12495,7 @@ } }, { - "Name": "MinimumDataPerGroup", + "Name": "MinimumExampleCountPerGroup", "Type": "Int", "Desc": "Minimum number of instances per categorical group.", "Aliases": [ @@ -12520,7 +12520,7 @@ } }, { - "Name": "MaximumCategoricalThreshold", + "Name": "MaximumCategoricalSplitPointCount", "Type": "Int", "Desc": "Max number of categorical thresholds.", "Aliases": [ @@ -12565,7 +12565,7 @@ } }, { - "Name": "L2Categorical", + "Name": "L2CategoricalRegularization", "Type": "Float", "Desc": "L2 Regularization for categorical split.", "Required": false, @@ -12709,7 +12709,7 @@ } }, { - "Name": "MinimumDataPerLeaf", + "Name": "MinimumExampleCountPerLeaf", "Type": "Int", "Desc": "Minimum number of instances needed in a child.", "Aliases": [ @@ -12831,7 +12831,7 @@ "Default": "Auto" }, { - "Name": "MaximumBin", + "Name": "MaximumBinCountPerFeature", "Type": "Int", "Desc": "Maximum number of bucket bin for features.", "Aliases": [ @@ -12902,7 +12902,7 @@ "Default": "DefaultMetric" }, { - "Name": "UseSoftMaximum", + "Name": "UseSoftMax", "Type": "Bool", "Desc": "Use softmax loss for the multi classification.", "Required": false, @@ -12998,7 +12998,7 @@ } }, { - "Name": "MinimumDataPerGroup", + "Name": "MinimumExampleCountPerGroup", "Type": "Int", "Desc": "Minimum number of instances per categorical group.", "Aliases": [ @@ -13023,7 +13023,7 @@ } }, { - "Name": "MaximumCategoricalThreshold", + "Name": "MaximumCategoricalSplitPointCount", "Type": "Int", "Desc": "Max number of categorical thresholds.", "Aliases": [ @@ -13068,7 +13068,7 @@ } }, { - "Name": "L2Categorical", + "Name": "L2CategoricalRegularization", "Type": "Float", "Desc": "L2 Regularization 
for categorical split.", "Required": false, @@ -23536,7 +23536,7 @@ "FriendlyName": "Tree Dropout Tree Booster", "Settings": [ { - "Name": "DropRate", + "Name": "TreeDropFraction", "Type": "Float", "Desc": "The drop ratio for trees. Range:(0,1).", "Required": false, @@ -23549,7 +23549,7 @@ } }, { - "Name": "MaxDrop", + "Name": "MaximumDroppedTreesPerRound", "Type": "Int", "Desc": "Maximum number of dropped tree in a boosting round.", "Required": false, @@ -23562,7 +23562,7 @@ } }, { - "Name": "SkipDrop", + "Name": "SkipDropFraction", "Type": "Float", "Desc": "Probability for not dropping in a boosting round.", "Required": false, @@ -23605,9 +23605,12 @@ "Default": false }, { - "Name": "MinSplitGain", + "Name": "MinimumSplitGain", "Type": "Float", "Desc": "Minimum loss reduction required to make a further partition on a leaf node of the tree. the larger, the more conservative the algorithm will be.", + "Aliases": [ + "MinSplitGain" + ], "Required": false, "SortOrder": 150.0, "IsNullable": false, @@ -23617,9 +23620,12 @@ } }, { - "Name": "MaxDepth", + "Name": "MaximumTreeDepth", "Type": "Int", "Desc": "Maximum depth of a tree. 0 means no limit. However, tree still grows by best-first.", + "Aliases": [ + "MaxDepth" + ], "Required": false, "SortOrder": 150.0, "IsNullable": false, @@ -23630,9 +23636,12 @@ } }, { - "Name": "MinChildWeight", + "Name": "MinimumChildWeight", "Type": "Float", "Desc": "Minimum sum of instance weight(hessian) needed in a child. If the tree partition step results in a leaf node with the sum of instance weight less than min_child_weight, then the building process will give up further partitioning. In linear regression mode, this simply corresponds to minimum number of instances needed to be in each node. The larger, the more conservative the algorithm will be.", + "Aliases": [ + "MinChildWeight" + ], "Required": false, "SortOrder": 150.0, "IsNullable": false, @@ -23642,9 +23651,12 @@ } }, { - "Name": "SubsampleFreq", + "Name": "SubsampleFrequency", "Type": "Int", - "Desc": "Subsample frequency for bagging. 0 means no subsample. If subsampleFreq > 0, it will use a subset(ratio=subsample) to train. And the subset will be updated on every Subsample iteratinos.", + "Desc": "Subsample frequency for bagging. 0 means no subsample. If subsampleFreq > 0, it will use a subset to train and the subset will be updated on every Subsample iteration.", + "Aliases": [ + "SubsampleFreq" + ], "Required": false, "SortOrder": 150.0, "IsNullable": false, @@ -23684,11 +23696,12 @@ } }, { - "Name": "RegLambda", + "Name": "L2Regularization", "Type": "Float", "Desc": "L2 regularization term on weights, increasing this value will make model more conservative.", "Aliases": [ - "l2" + "l2", + "RegLambda" ], "Required": false, "SortOrder": 150.0, @@ -23707,11 +23720,12 @@ } }, { - "Name": "RegAlpha", + "Name": "L1Regularization", "Type": "Float", "Desc": "L1 regularization term on weights, increase this value will make model more conservative.", "Aliases": [ - "l1" + "l1", + "RegAlpha" ], "Required": false, "SortOrder": 150.0, @@ -23730,9 +23744,12 @@ } }, { - "Name": "ScalePosWeight", + "Name": "WeightOfPositiveExamples", "Type": "Float", "Desc": "Control the balance of positive and negative weights, useful for unbalanced classes. 
A typical value to consider: sum(negative cases) / sum(positive cases).", + "Aliases": [ + "ScalePosWeight" + ], "Required": false, "SortOrder": 150.0, "IsNullable": false, @@ -23758,9 +23775,12 @@ "Default": false }, { - "Name": "MinSplitGain", + "Name": "MinimumSplitGain", "Type": "Float", "Desc": "Minimum loss reduction required to make a further partition on a leaf node of the tree. the larger, the more conservative the algorithm will be.", + "Aliases": [ + "MinSplitGain" + ], "Required": false, "SortOrder": 150.0, "IsNullable": false, @@ -23770,9 +23790,12 @@ } }, { - "Name": "MaxDepth", + "Name": "MaximumTreeDepth", "Type": "Int", "Desc": "Maximum depth of a tree. 0 means no limit. However, tree still grows by best-first.", + "Aliases": [ + "MaxDepth" + ], "Required": false, "SortOrder": 150.0, "IsNullable": false, @@ -23783,9 +23806,12 @@ } }, { - "Name": "MinChildWeight", + "Name": "MinimumChildWeight", "Type": "Float", "Desc": "Minimum sum of instance weight(hessian) needed in a child. If the tree partition step results in a leaf node with the sum of instance weight less than min_child_weight, then the building process will give up further partitioning. In linear regression mode, this simply corresponds to minimum number of instances needed to be in each node. The larger, the more conservative the algorithm will be.", + "Aliases": [ + "MinChildWeight" + ], "Required": false, "SortOrder": 150.0, "IsNullable": false, @@ -23795,9 +23821,12 @@ } }, { - "Name": "SubsampleFreq", + "Name": "SubsampleFrequency", "Type": "Int", - "Desc": "Subsample frequency for bagging. 0 means no subsample. If subsampleFreq > 0, it will use a subset(ratio=subsample) to train. And the subset will be updated on every Subsample iteratinos.", + "Desc": "Subsample frequency for bagging. 0 means no subsample. If subsampleFreq > 0, it will use a subset to train and the subset will be updated on every Subsample iteration.", + "Aliases": [ + "SubsampleFreq" + ], "Required": false, "SortOrder": 150.0, "IsNullable": false, @@ -23837,11 +23866,12 @@ } }, { - "Name": "RegLambda", + "Name": "L2Regularization", "Type": "Float", "Desc": "L2 regularization term on weights, increasing this value will make model more conservative.", "Aliases": [ - "l2" + "l2", + "RegLambda" ], "Required": false, "SortOrder": 150.0, @@ -23860,11 +23890,12 @@ } }, { - "Name": "RegAlpha", + "Name": "L1Regularization", "Type": "Float", "Desc": "L1 regularization term on weights, increase this value will make model more conservative.", "Aliases": [ - "l1" + "l1", + "RegAlpha" ], "Required": false, "SortOrder": 150.0, @@ -23883,9 +23914,12 @@ } }, { - "Name": "ScalePosWeight", + "Name": "WeightOfPositiveExamples", "Type": "Float", "Desc": "Control the balance of positive and negative weights, useful for unbalanced classes. A typical value to consider: sum(negative cases) / sum(positive cases).", + "Aliases": [ + "ScalePosWeight" + ], "Required": false, "SortOrder": 150.0, "IsNullable": false, @@ -23937,9 +23971,12 @@ "Default": false }, { - "Name": "MinSplitGain", + "Name": "MinimumSplitGain", "Type": "Float", "Desc": "Minimum loss reduction required to make a further partition on a leaf node of the tree. the larger, the more conservative the algorithm will be.", + "Aliases": [ + "MinSplitGain" + ], "Required": false, "SortOrder": 150.0, "IsNullable": false, @@ -23949,9 +23986,12 @@ } }, { - "Name": "MaxDepth", + "Name": "MaximumTreeDepth", "Type": "Int", "Desc": "Maximum depth of a tree. 0 means no limit. 
However, tree still grows by best-first.", + "Aliases": [ + "MaxDepth" + ], "Required": false, "SortOrder": 150.0, "IsNullable": false, @@ -23962,9 +24002,12 @@ } }, { - "Name": "MinChildWeight", + "Name": "MinimumChildWeight", "Type": "Float", "Desc": "Minimum sum of instance weight(hessian) needed in a child. If the tree partition step results in a leaf node with the sum of instance weight less than min_child_weight, then the building process will give up further partitioning. In linear regression mode, this simply corresponds to minimum number of instances needed to be in each node. The larger, the more conservative the algorithm will be.", + "Aliases": [ + "MinChildWeight" + ], "Required": false, "SortOrder": 150.0, "IsNullable": false, @@ -23974,9 +24017,12 @@ } }, { - "Name": "SubsampleFreq", + "Name": "SubsampleFrequency", "Type": "Int", - "Desc": "Subsample frequency for bagging. 0 means no subsample. If subsampleFreq > 0, it will use a subset(ratio=subsample) to train. And the subset will be updated on every Subsample iteratinos.", + "Desc": "Subsample frequency for bagging. 0 means no subsample. If subsampleFreq > 0, it will use a subset to train and the subset will be updated on every Subsample iteration.", + "Aliases": [ + "SubsampleFreq" + ], "Required": false, "SortOrder": 150.0, "IsNullable": false, @@ -24016,11 +24062,12 @@ } }, { - "Name": "RegLambda", + "Name": "L2Regularization", "Type": "Float", "Desc": "L2 regularization term on weights, increasing this value will make model more conservative.", "Aliases": [ - "l2" + "l2", + "RegLambda" ], "Required": false, "SortOrder": 150.0, @@ -24039,11 +24086,12 @@ } }, { - "Name": "RegAlpha", + "Name": "L1Regularization", "Type": "Float", "Desc": "L1 regularization term on weights, increase this value will make model more conservative.", "Aliases": [ - "l1" + "l1", + "RegAlpha" ], "Required": false, "SortOrder": 150.0, @@ -24062,9 +24110,12 @@ } }, { - "Name": "ScalePosWeight", + "Name": "WeightOfPositiveExamples", "Type": "Float", "Desc": "Control the balance of positive and negative weights, useful for unbalanced classes. A typical value to consider: sum(negative cases) / sum(positive cases).", + "Aliases": [ + "ScalePosWeight" + ], "Required": false, "SortOrder": 150.0, "IsNullable": false, diff --git a/test/BaselineOutput/Common/LightGBMMC/LightGBMMC-CV-iris.key-out.txt b/test/BaselineOutput/Common/LightGBMMC/LightGBMMC-CV-iris.key-out.txt index a9a5af5543..7f10dc44ad 100644 --- a/test/BaselineOutput/Common/LightGBMMC/LightGBMMC-CV-iris.key-out.txt +++ b/test/BaselineOutput/Common/LightGBMMC/LightGBMMC-CV-iris.key-out.txt @@ -1,12 +1,12 @@ maml.exe CV tr=LightGBMMC{nt=1 iter=10 v=- lr=0.2 mil=10 nl=20} threads=- dout=%Output% loader=Text{col=Label:TX:0 col=Features:1-*} data=%Data% seed=1 xf=Term{col=Label} Not adding a normalizer. Auto-tuning parameters: UseCategoricalSplit = False -Auto-tuning parameters: UseSoftMaximum = False +Auto-tuning parameters: UseSoftMax = False LightGBM objective=multiclassova Not training a calibrator because it is not needed. Not adding a normalizer. Auto-tuning parameters: UseCategoricalSplit = False -Auto-tuning parameters: UseSoftMaximum = False +Auto-tuning parameters: UseSoftMax = False LightGBM objective=multiclassova Not training a calibrator because it is not needed. 
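For context on the auto-tuning messages in these baselines: when MinimumExampleCountPerLeaf is left unset, the multiclass path fills it in via the renamed DefaultMinDataPerLeaf helper shown earlier in this patch. Below is a standalone C# sketch of the multiclass branch only (the non-multiclass branch is elided in this diff, so it is guarded here), with an illustrative iris-sized call:

using System;

internal static class MinDataPerLeafSketch
{
    // Mirrors the numClass > 1 branch from LightGbmTrainerBase in this patch:
    // numRow / numberOfLeaves / numClass / 10, clamped to [5, 50].
    public static int DefaultMinDataPerLeaf(int numRow, int numberOfLeaves, int numClass)
    {
        if (numClass <= 1)
            throw new NotSupportedException("The non-multiclass branch is not shown in this diff.");
        int ret = numRow / numberOfLeaves / numClass / 10;
        ret = Math.Max(ret, 5);
        return Math.Min(ret, 50);
    }

    public static void Demo()
    {
        // 150 rows, 20 leaves, 3 classes: 150 / 20 / 3 / 10 == 0, clamped up to 5.
        Console.WriteLine(DefaultMinDataPerLeaf(150, 20, 3));
    }
}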
diff --git a/test/BaselineOutput/Common/LightGBMMC/LightGBMMC-CV-iris.keyU404-out.txt b/test/BaselineOutput/Common/LightGBMMC/LightGBMMC-CV-iris.keyU404-out.txt index 9958fa93b8..4765814e37 100644 --- a/test/BaselineOutput/Common/LightGBMMC/LightGBMMC-CV-iris.keyU404-out.txt +++ b/test/BaselineOutput/Common/LightGBMMC/LightGBMMC-CV-iris.keyU404-out.txt @@ -1,12 +1,12 @@ maml.exe CV tr=LightGBMMC{nt=1 iter=10 v=- lr=0.2 mil=10 nl=20} threads=- dout=%Output% loader=Text{col=Label:U4[0-2]:0 col=Features:1-4} data=%Data% seed=1 Not adding a normalizer. Auto-tuning parameters: UseCategoricalSplit = False -Auto-tuning parameters: UseSoftMaximum = False +Auto-tuning parameters: UseSoftMax = False LightGBM objective=multiclassova Not training a calibrator because it is not needed. Not adding a normalizer. Auto-tuning parameters: UseCategoricalSplit = False -Auto-tuning parameters: UseSoftMaximum = False +Auto-tuning parameters: UseSoftMax = False LightGBM objective=multiclassova Not training a calibrator because it is not needed. diff --git a/test/BaselineOutput/Common/LightGBMMC/LightGBMMC-TrainTest-iris.key-out.txt b/test/BaselineOutput/Common/LightGBMMC/LightGBMMC-TrainTest-iris.key-out.txt index a92727951e..d65bd3ff29 100644 --- a/test/BaselineOutput/Common/LightGBMMC/LightGBMMC-TrainTest-iris.key-out.txt +++ b/test/BaselineOutput/Common/LightGBMMC/LightGBMMC-TrainTest-iris.key-out.txt @@ -1,7 +1,7 @@ maml.exe TrainTest test=%Data% tr=LightGBMMC{nt=1 iter=10 v=- lr=0.2 mil=10 nl=20} dout=%Output% loader=Text{col=Label:TX:0 col=Features:1-*} data=%Data% out=%Output% seed=1 xf=Term{col=Label} Not adding a normalizer. Auto-tuning parameters: UseCategoricalSplit = False -Auto-tuning parameters: UseSoftMaximum = False +Auto-tuning parameters: UseSoftMax = False LightGBM objective=multiclassova Not training a calibrator because it is not needed. diff --git a/test/BaselineOutput/Common/LightGBMMC/LightGBMMC-TrainTest-iris.keyU404-out.txt b/test/BaselineOutput/Common/LightGBMMC/LightGBMMC-TrainTest-iris.keyU404-out.txt index e0001f3d38..3398f10263 100644 --- a/test/BaselineOutput/Common/LightGBMMC/LightGBMMC-TrainTest-iris.keyU404-out.txt +++ b/test/BaselineOutput/Common/LightGBMMC/LightGBMMC-TrainTest-iris.keyU404-out.txt @@ -1,7 +1,7 @@ maml.exe TrainTest test=%Data% tr=LightGBMMC{nt=1 iter=10 v=- lr=0.2 mil=10 nl=20} dout=%Output% loader=Text{col=Label:U4[0-2]:0 col=Features:1-4} data=%Data% out=%Output% seed=1 Not adding a normalizer. Auto-tuning parameters: UseCategoricalSplit = False -Auto-tuning parameters: UseSoftMaximum = False +Auto-tuning parameters: UseSoftMax = False LightGBM objective=multiclassova Not training a calibrator because it is not needed. 
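A decoding note for the maml.exe command lines in these baselines: the short switches are untouched by this rename pass and still land on the new option fields. A rough C# equivalent of tr=LightGBMMC{nt=1 iter=10 lr=0.2 mil=10 nl=20} under the renamed surface (the switch-to-field mapping is inferred from these baselines, not spelled out in the diff):

using Microsoft.ML.LightGBM;

internal static class MamlSwitchSketch
{
    public static Options FromBaselineCommandLine() => new Options
    {
        NumberOfThreads = 1,              // nt=1
        NumberOfIterations = 10,          // iter=10
        LearningRate = 0.2,               // lr=0.2
        MinimumExampleCountPerLeaf = 10,  // mil=10
        NumberOfLeaves = 20               // nl=20
    };
}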
diff --git a/test/Microsoft.ML.Core.Tests/UnitTests/TestEntryPoints.cs b/test/Microsoft.ML.Core.Tests/UnitTests/TestEntryPoints.cs index 87e788c31f..cb1b1911c9 100644 --- a/test/Microsoft.ML.Core.Tests/UnitTests/TestEntryPoints.cs +++ b/test/Microsoft.ML.Core.Tests/UnitTests/TestEntryPoints.cs @@ -226,7 +226,8 @@ private string GetBuildPrefix() #endif } - [Fact(Skip = "Execute this test if you want to regenerate the core_manifest and core_ep_list files")] + //[Fact(Skip = "Execute this test if you want to regenerate the core_manifest and core_ep_list files")] + [Fact] public void RegenerateEntryPointCatalog() { var (epListContents, jObj) = BuildManifests(); diff --git a/test/Microsoft.ML.StaticPipelineTesting/Training.cs b/test/Microsoft.ML.StaticPipelineTesting/Training.cs index e233de6f36..b0e6b2a42f 100644 --- a/test/Microsoft.ML.StaticPipelineTesting/Training.cs +++ b/test/Microsoft.ML.StaticPipelineTesting/Training.cs @@ -534,8 +534,8 @@ public void LightGbmBinaryClassification() var est = reader.MakeNewEstimator() .Append(r => (r.label, preds: catalog.Trainers.LightGbm(r.label, r.features, - numBoostRound: 10, - numLeaves: 5, + numberOfIterations: 10, + numberOfLeaves: 5, learningRate: 0.01, onFit: (p) => { pred = p; }))); diff --git a/test/Microsoft.ML.Tests/OnnxConversionTest.cs b/test/Microsoft.ML.Tests/OnnxConversionTest.cs index 0983dbef3d..003b49f3db 100644 --- a/test/Microsoft.ML.Tests/OnnxConversionTest.cs +++ b/test/Microsoft.ML.Tests/OnnxConversionTest.cs @@ -345,7 +345,7 @@ public void LightGbmBinaryClassificationOnnxConversionTest() var dynamicPipeline = mlContext.Transforms.Normalize("FeatureVector") .AppendCacheCheckpoint(mlContext) - .Append(mlContext.Regression.Trainers.LightGbm(labelColumnName: "Target", featureColumnName: "FeatureVector", numberOfIterations: 3, leafCount: 16, minimumDataPerLeaf: 100)); + .Append(mlContext.Regression.Trainers.LightGbm(labelColumnName: "Target", featureColumnName: "FeatureVector", numberOfIterations: 3, numberOfLeaves: 16, minimumExampleCountPerLeaf: 100)); var model = dynamicPipeline.Fit(data); // Step 2: Convert ML.NET model to ONNX format and save it as a file. 
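The booster-level renames recorded in the manifest above (RegLambda to L2Regularization, RegAlpha to L1Regularization, ScalePosWeight to WeightOfPositiveExamples) reach user code through the booster options. A sketch of a binary trainer configured with the renamed knobs, assuming the Options-based catalog overload used elsewhere in this patch; the values are illustrative, not taken from any test here:

using Microsoft.ML;
using Microsoft.ML.LightGBM;

internal static class RenamedBoosterSketch
{
    public static void Demo()
    {
        var mlContext = new MLContext(seed: 0);
        var trainer = mlContext.BinaryClassification.Trainers.LightGbm(new Options
        {
            NumberOfLeaves = 10,
            MinimumExampleCountPerLeaf = 2,
            Booster = new TreeBooster.Options
            {
                L2Regularization = 0.5,   // formerly RegLambda
                L1Regularization = 0.0,   // formerly RegAlpha
                // Formerly ScalePosWeight; a typical value is
                // sum(negative cases) / sum(positive cases).
                WeightOfPositiveExamples = 1.0
            }
        });
    }
}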
diff --git a/test/Microsoft.ML.Tests/TrainerEstimators/TreeEstimators.cs b/test/Microsoft.ML.Tests/TrainerEstimators/TreeEstimators.cs index db58372e6e..e875f2a90b 100644 --- a/test/Microsoft.ML.Tests/TrainerEstimators/TreeEstimators.cs +++ b/test/Microsoft.ML.Tests/TrainerEstimators/TreeEstimators.cs @@ -52,7 +52,7 @@ public void LightGBMBinaryEstimator() { NumberOfLeaves = 10, NumberOfThreads = 1, - MinimumDataPerLeaf = 2, + MinimumExampleCountPerLeaf = 2, }); var pipeWithTrainer = pipe.Append(trainer); @@ -171,7 +171,7 @@ public void LightGBMRegressorEstimator() { NumberOfThreads = 1, NormalizeFeatures = NormalizeOption.Warn, - L2Categorical = 5, + L2CategoricalRegularization = 5, }); TestEstimatorCore(trainer, dataView); @@ -296,9 +296,9 @@ private void LightGbmHelper(bool useSoftmax, out string modelString, out List Date: Fri, 1 Mar 2019 17:48:52 -0800 Subject: [PATCH 3/4] - Minor updates --- .../LightGbmArguments.cs | 23 ++++---- .../LightGbmBinaryTrainer.cs | 6 +-- .../LightGbmMulticlassTrainer.cs | 12 ++--- .../LightGbmRegressionTrainer.cs | 6 +-- .../Common/EntryPoints/core_manifest.json | 52 +++---------------- .../UnitTests/TestEntryPoints.cs | 3 +- .../TrainerEstimators/TreeEstimators.cs | 2 +- 7 files changed, 34 insertions(+), 70 deletions(-) diff --git a/src/Microsoft.ML.LightGBM/LightGbmArguments.cs b/src/Microsoft.ML.LightGBM/LightGbmArguments.cs index eb0e70ac02..abedc67b42 100644 --- a/src/Microsoft.ML.LightGBM/LightGbmArguments.cs +++ b/src/Microsoft.ML.LightGBM/LightGbmArguments.cs @@ -97,9 +97,9 @@ private static string GetOptionName(string name) return strBuf.ToString(); } - // Static name map that maps friendly names to lightGBM arguments. - // There is a conversion that will convert the field name to a lightGBM name - // (but lowercasing and adding an underscore between words). In + // Static override name map that maps friendly names to lightGBM arguments. + // If an argument is not here, then its name is identical to a lightGBM argument + // and does not require a mapping, for example, Subsample. private static Dictionary _nameMapping = new Dictionary() { {nameof(TreeBooster.Options.MinimumSplitGain), "min_split_gain" }, @@ -110,7 +110,7 @@ private static string GetOptionName(string name) {nameof(TreeBooster.Options.L2Regularization), "reg_lambda"}, {nameof(TreeBooster.Options.WeightOfPositiveExamples), "scale_pos_weight"}, {nameof(DartBooster.Options.TreeDropFraction), "drop_rate" }, - {nameof(DartBooster.Options.MaximumDroppedTreesPerRound), "max_drop" }, + {nameof(DartBooster.Options.MaximumDroppedTreeCountPerRound), "max_drop" }, {nameof(DartBooster.Options.SkipDropFraction), "skip_drop" }, {nameof(MinimumExampleCountPerLeaf), "min_data_per_leaf"}, {nameof(NumberOfLeaves), "num_leaves"}, @@ -159,7 +159,8 @@ public class Options : ISupportBoosterParameterFactory [Argument(ArgumentType.AtMostOnce, HelpText = "Subsample frequency for bagging. 0 means no subsample. " - + "If subsampleFreq > 0, it will use a subset to train and the subset will be updated on every Subsample iteration.")] + + "Specifies the frequency at which the bagging occurs, where if this is set to N, the subsampling will happen at N iterations. "
+ + "This must be set with Subsample as this specifies the amount to subsample.")] [TlcModule.Range(Min = 0, Max = int.MaxValue)] public int SubsampleFrequency = 0; @@ -177,7 +178,7 @@ public class Options : ISupportBoosterParameterFactory [Argument(ArgumentType.AtMostOnce, HelpText = "L2 regularization term on weights, increasing this value will make model more conservative.", - ShortName = "l2,RegLambda")] + ShortName = "l2")] [TlcModule.Range(Min = 0.0)] [TGUI(Label = "Lambda(L2)", SuggestedSweeps = "0,0.5,1")] [TlcModule.SweepableDiscreteParam("RegLambda", new object[] { 0f, 0.5f, 1f })] @@ -185,7 +186,7 @@ public class Options : ISupportBoosterParameterFactory [Argument(ArgumentType.AtMostOnce, HelpText = "L1 regularization term on weights, increase this value will make model more conservative.", - ShortName = "l1,RegAlpha")] + ShortName = "l1")] [TlcModule.Range(Min = 0.0)] [TGUI(Label = "Alpha(L1)", SuggestedSweeps = "0,0.5,1")] [TlcModule.SweepableDiscreteParam("RegAlpha", new object[] { 0f, 0.5f, 1f })] @@ -235,7 +236,7 @@ public sealed class Options : TreeBooster.Options [Argument(ArgumentType.AtMostOnce, HelpText = "Maximum number of dropped tree in a boosting round.")] [TlcModule.Range(Inf = 0, Max = int.MaxValue)] - public int MaximumDroppedTreesPerRound = 1; + public int MaximumDroppedTreeCountPerRound = 1; [Argument(ArgumentType.AtMostOnce, HelpText = "Probability for not dropping in a boosting round.")] [TlcModule.Range(Inf = 0.0, Max = 1.0)] @@ -358,7 +359,7 @@ public enum EvalMetricType [Argument(ArgumentType.AtMostOnce, HelpText = "Use softmax loss for the multi classification.")] [TlcModule.SweepableDiscreteParam("UseSoftmax", new object[] { true, false })] - public bool? UseSoftMax; + public bool? UseSoftmax; [Argument(ArgumentType.AtMostOnce, HelpText = "Rounds of early stopping, 0 will disable it.", ShortName = "es")] @@ -382,7 +383,7 @@ public enum EvalMetricType [Argument(ArgumentType.AtMostOnce, HelpText = "Enable special handling of missing value or not.")] [TlcModule.SweepableDiscreteParam("UseMissing", new object[] { true, false })] - public bool UseMissing = false; + public bool HandleMissingValue = false; [Argument(ArgumentType.AtMostOnce, HelpText = "Minimum number of instances per categorical group.", ShortName = "mdpg")] [TlcModule.Range(Inf = 0, Max = int.MaxValue)] @@ -459,7 +460,7 @@ internal Dictionary ToDictionary(IHost host) res[GetOptionName(nameof(metric))] = metric; res[GetOptionName(nameof(Sigmoid))] = Sigmoid; res[GetOptionName(nameof(CustomGains))] = CustomGains; - res[GetOptionName(nameof(UseMissing))] = UseMissing; + res[GetOptionName(nameof(HandleMissingValue))] = HandleMissingValue; res[GetOptionName(nameof(MinimumExampleCountPerGroup))] = MinimumExampleCountPerGroup; res[GetOptionName(nameof(MaximumCategoricalSplitPointCount))] = MaximumCategoricalSplitPointCount; res[GetOptionName(nameof(CategoricalSmoothing))] = CategoricalSmoothing; diff --git a/src/Microsoft.ML.LightGBM/LightGbmBinaryTrainer.cs b/src/Microsoft.ML.LightGBM/LightGbmBinaryTrainer.cs index 33bb6a57e8..ab85635cb1 100644 --- a/src/Microsoft.ML.LightGBM/LightGbmBinaryTrainer.cs +++ b/src/Microsoft.ML.LightGBM/LightGbmBinaryTrainer.cs @@ -109,7 +109,7 @@ internal LightGbmBinaryTrainer(IHostEnvironment env, Options options) /// The private instance of . /// The name of The label column. /// The name of the feature column. - /// The name for the column containing the initial weight. + /// The name for the column containing the initial weight. /// The number of leaves to use. 
/// The minimal number of data points allowed in a leaf of the tree, out of the subsampled data. /// The learning rate. @@ -117,12 +117,12 @@ internal LightGbmBinaryTrainer(IHostEnvironment env, Options options) internal LightGbmBinaryTrainer(IHostEnvironment env, string labelColumnName = DefaultColumnNames.Label, string featureColumnName = DefaultColumnNames.Features, - string weights = null, + string exampleWeightColumnName = null, int? numberOfLeaves = null, int? minimumExampleCountPerLeaf = null, double? learningRate = null, int numberOfIterations = LightGBM.Options.Defaults.NumberOfIterations) - : base(env, LoadNameValue, TrainerUtils.MakeBoolScalarLabel(labelColumnName), featureColumnName, weights, null, numberOfLeaves, minimumExampleCountPerLeaf, learningRate, numberOfIterations) + : base(env, LoadNameValue, TrainerUtils.MakeBoolScalarLabel(labelColumnName), featureColumnName, exampleWeightColumnName, null, numberOfLeaves, minimumExampleCountPerLeaf, learningRate, numberOfIterations) { } diff --git a/src/Microsoft.ML.LightGBM/LightGbmMulticlassTrainer.cs b/src/Microsoft.ML.LightGBM/LightGbmMulticlassTrainer.cs index c6029123cf..a35b59572f 100644 --- a/src/Microsoft.ML.LightGBM/LightGbmMulticlassTrainer.cs +++ b/src/Microsoft.ML.LightGBM/LightGbmMulticlassTrainer.cs @@ -43,7 +43,7 @@ internal LightGbmMulticlassTrainer(IHostEnvironment env, Options options) /// The private instance of . /// The name of The label column. /// The name of the feature column. - /// The name for the column containing the initial weight. + /// The name for the column containing the initial weight. /// The number of leaves to use. /// The minimal number of data points allowed in a leaf of the tree, out of the subsampled data. /// The learning rate. @@ -51,12 +51,12 @@ internal LightGbmMulticlassTrainer(IHostEnvironment env, Options options) internal LightGbmMulticlassTrainer(IHostEnvironment env, string labelColumnName = DefaultColumnNames.Label, string featureColumnName = DefaultColumnNames.Features, - string weights = null, + string exampleWeightColumnName = null, int? numberOfLeaves = null, int? minimumExampleCountPerLeaf = null, double? 
learningRate = null, int numberOfIterations = LightGBM.Options.Defaults.NumberOfIterations) - : base(env, LoadNameValue, TrainerUtils.MakeU4ScalarColumn(labelColumnName), featureColumnName, weights, null, numberOfLeaves, minimumExampleCountPerLeaf, learningRate, numberOfIterations) + : base(env, LoadNameValue, TrainerUtils.MakeU4ScalarColumn(labelColumnName), featureColumnName, exampleWeightColumnName, null, numberOfLeaves, minimumExampleCountPerLeaf, learningRate, numberOfIterations) { _numClass = -1; } @@ -182,14 +182,14 @@ private protected override void CheckAndUpdateParametersBeforeTraining(IChannel Options["num_class"] = _numClass; bool useSoftmax = false; - if (LightGbmTrainerOptions.UseSoftMax.HasValue) - useSoftmax = LightGbmTrainerOptions.UseSoftMax.Value; + if (LightGbmTrainerOptions.UseSoftmax.HasValue) + useSoftmax = LightGbmTrainerOptions.UseSoftmax.Value; else { if (labels.Length >= _minDataToUseSoftmax) useSoftmax = true; - ch.Info("Auto-tuning parameters: " + nameof(LightGbmTrainerOptions.UseSoftMax) + " = " + useSoftmax); + ch.Info("Auto-tuning parameters: " + nameof(LightGbmTrainerOptions.UseSoftmax) + " = " + useSoftmax); } if (useSoftmax) diff --git a/src/Microsoft.ML.LightGBM/LightGbmRegressionTrainer.cs b/src/Microsoft.ML.LightGBM/LightGbmRegressionTrainer.cs index 9453cf6a2e..26f670dfe2 100644 --- a/src/Microsoft.ML.LightGBM/LightGbmRegressionTrainer.cs +++ b/src/Microsoft.ML.LightGBM/LightGbmRegressionTrainer.cs @@ -87,7 +87,7 @@ public sealed class LightGbmRegressorTrainer : LightGbmTrainerBaseThe private instance of . /// The name of the label column. /// The name of the feature column. - /// The name for the column containing the initial weight. + /// The name for the column containing the initial weight. /// The number of leaves to use. /// The minimal number of data points allowed in a leaf of the tree, out of the subsampled data. /// The learning rate. @@ -95,12 +95,12 @@ public sealed class LightGbmRegressorTrainer : LightGbmTrainerBase 0, it will use a subset to train and the subset will be updated on every Subsample iteration.", - "Aliases": [ - "SubsampleFreq" - ], "Required": false, "SortOrder": 150.0, "IsNullable": false, @@ -23778,9 +23766,6 @@ "Name": "MinimumSplitGain", "Type": "Float", "Desc": "Minimum loss reduction required to make a further partition on a leaf node of the tree. the larger, the more conservative the algorithm will be.", - "Aliases": [ - "MinSplitGain" - ], "Required": false, "SortOrder": 150.0, "IsNullable": false, @@ -23793,9 +23778,6 @@ "Name": "MaximumTreeDepth", "Type": "Int", "Desc": "Maximum depth of a tree. 0 means no limit. However, tree still grows by best-first.", - "Aliases": [ - "MaxDepth" - ], "Required": false, "SortOrder": 150.0, "IsNullable": false, @@ -23809,9 +23791,6 @@ "Name": "MinimumChildWeight", "Type": "Float", "Desc": "Minimum sum of instance weight(hessian) needed in a child. If the tree partition step results in a leaf node with the sum of instance weight less than min_child_weight, then the building process will give up further partitioning. In linear regression mode, this simply corresponds to minimum number of instances needed to be in each node. The larger, the more conservative the algorithm will be.", - "Aliases": [ - "MinChildWeight" - ], "Required": false, "SortOrder": 150.0, "IsNullable": false, @@ -23824,9 +23803,6 @@ "Name": "SubsampleFrequency", "Type": "Int", "Desc": "Subsample frequency for bagging. 0 means no subsample. 
If subsampleFreq > 0, it will use a subset to train and the subset will be updated on every Subsample iteration.", - "Aliases": [ - "SubsampleFreq" - ], "Required": false, "SortOrder": 150.0, "IsNullable": false, @@ -23974,9 +23950,6 @@ "Name": "MinimumSplitGain", "Type": "Float", "Desc": "Minimum loss reduction required to make a further partition on a leaf node of the tree. the larger, the more conservative the algorithm will be.", - "Aliases": [ - "MinSplitGain" - ], "Required": false, "SortOrder": 150.0, "IsNullable": false, @@ -23989,9 +23962,6 @@ "Name": "MaximumTreeDepth", "Type": "Int", "Desc": "Maximum depth of a tree. 0 means no limit. However, tree still grows by best-first.", - "Aliases": [ - "MaxDepth" - ], "Required": false, "SortOrder": 150.0, "IsNullable": false, @@ -24005,9 +23975,6 @@ "Name": "MinimumChildWeight", "Type": "Float", "Desc": "Minimum sum of instance weight(hessian) needed in a child. If the tree partition step results in a leaf node with the sum of instance weight less than min_child_weight, then the building process will give up further partitioning. In linear regression mode, this simply corresponds to minimum number of instances needed to be in each node. The larger, the more conservative the algorithm will be.", - "Aliases": [ - "MinChildWeight" - ], "Required": false, "SortOrder": 150.0, "IsNullable": false, @@ -24020,9 +23987,6 @@ "Name": "SubsampleFrequency", "Type": "Int", "Desc": "Subsample frequency for bagging. 0 means no subsample. If subsampleFreq > 0, it will use a subset to train and the subset will be updated on every Subsample iteration.", - "Aliases": [ - "SubsampleFreq" - ], "Required": false, "SortOrder": 150.0, "IsNullable": false, diff --git a/test/Microsoft.ML.Core.Tests/UnitTests/TestEntryPoints.cs b/test/Microsoft.ML.Core.Tests/UnitTests/TestEntryPoints.cs index d255f44d7e..746e0fd16d 100644 --- a/test/Microsoft.ML.Core.Tests/UnitTests/TestEntryPoints.cs +++ b/test/Microsoft.ML.Core.Tests/UnitTests/TestEntryPoints.cs @@ -226,8 +226,7 @@ private string GetBuildPrefix() #endif } - //[Fact(Skip = "Execute this test if you want to regenerate the core_manifest and core_ep_list files")] - [Fact] + [Fact(Skip = "Execute this test if you want to regenerate the core_manifest and core_ep_list files")] public void RegenerateEntryPointCatalog() { var (epListContents, jObj) = BuildManifests(); diff --git a/test/Microsoft.ML.Tests/TrainerEstimators/TreeEstimators.cs b/test/Microsoft.ML.Tests/TrainerEstimators/TreeEstimators.cs index e875f2a90b..88c8e46e64 100644 --- a/test/Microsoft.ML.Tests/TrainerEstimators/TreeEstimators.cs +++ b/test/Microsoft.ML.Tests/TrainerEstimators/TreeEstimators.cs @@ -298,7 +298,7 @@ private void LightGbmHelper(bool useSoftmax, out string modelString, out List Date: Mon, 4 Mar 2019 16:26:43 -0800 Subject: [PATCH 4/4] - Updating based upon feedback --- .../Static/LightGBMRegression.cs | 2 +- .../LightGbmStaticExtensions.cs | 24 ++++++------- .../LightGbmArguments.cs | 17 +++++----- .../LightGbmBinaryTrainer.cs | 5 ++- .../LightGbmMulticlassTrainer.cs | 4 +-- .../LightGbmRegressionTrainer.cs | 2 +- .../LightGbmTrainerBase.cs | 4 +-- .../Common/EntryPoints/core_manifest.json | 34 ++++++++----------- .../LightGBMMC/LightGBMMC-CV-iris.key-out.txt | 4 +-- .../LightGBMMC-CV-iris.keyU404-out.txt | 4 +-- .../LightGBMMC-TrainTest-iris.key-out.txt | 2 +- .../LightGBMMC-TrainTest-iris.keyU404-out.txt | 2 +- 12 files changed, 49 insertions(+), 55 deletions(-) diff --git 
a/docs/samples/Microsoft.ML.Samples/Static/LightGBMRegression.cs b/docs/samples/Microsoft.ML.Samples/Static/LightGBMRegression.cs index aa2f2e65fc..cab1700636 100644 --- a/docs/samples/Microsoft.ML.Samples/Static/LightGBMRegression.cs +++ b/docs/samples/Microsoft.ML.Samples/Static/LightGBMRegression.cs @@ -39,7 +39,7 @@ public static void LightGbmRegression() r.label, r.features, numberOfLeaves: 4, - minimumDataPerLeaf: 6, + minimumExampleCountPerLeaf: 6, learningRate: 0.001, onFit: p => pred = p) ) diff --git a/src/Microsoft.ML.LightGBM.StaticPipe/LightGbmStaticExtensions.cs b/src/Microsoft.ML.LightGBM.StaticPipe/LightGbmStaticExtensions.cs index cfa2126526..fb84489ff0 100644 --- a/src/Microsoft.ML.LightGBM.StaticPipe/LightGbmStaticExtensions.cs +++ b/src/Microsoft.ML.LightGBM.StaticPipe/LightGbmStaticExtensions.cs @@ -22,7 +22,7 @@ public static class LightGbmStaticExtensions /// The features column. /// The weights column. /// The number of leaves to use. - /// The minimal number of data points allowed in a leaf of the tree, out of the subsampled data. + /// The minimal number of data points allowed in a leaf of the tree, out of the subsampled data. /// The learning rate. /// Number of iterations. /// A delegate that is called every time the @@ -40,18 +40,18 @@ public static class LightGbmStaticExtensions public static Scalar LightGbm(this RegressionCatalog.RegressionTrainers catalog, Scalar label, Vector features, Scalar weights = null, int? numberOfLeaves = null, - int? minimumDataPerLeaf = null, + int? minimumExampleCountPerLeaf = null, double? learningRate = null, int numberOfIterations = Options.Defaults.NumberOfIterations, Action onFit = null) { - CheckUserValues(label, features, weights, numberOfLeaves, minimumDataPerLeaf, learningRate, numberOfIterations, onFit); + CheckUserValues(label, features, weights, numberOfLeaves, minimumExampleCountPerLeaf, learningRate, numberOfIterations, onFit); var rec = new TrainerEstimatorReconciler.Regression( (env, labelName, featuresName, weightsName) => { var trainer = new LightGbmRegressorTrainer(env, labelName, featuresName, weightsName, numberOfLeaves, - minimumDataPerLeaf, learningRate, numberOfIterations); + minimumExampleCountPerLeaf, learningRate, numberOfIterations); if (onFit != null) return trainer.WithOnFitDelegate(trans => onFit(trans.Model)); return trainer; @@ -197,7 +197,7 @@ public static (Scalar score, Scalar probability, Scalar pred /// The groupId column. /// The weights column. /// The number of leaves to use. - /// The minimal number of data points allowed in a leaf of the tree, out of the subsampled data. + /// The minimal number of data points allowed in a leaf of the tree, out of the subsampled data. /// The learning rate. /// Number of iterations. /// A delegate that is called every time the @@ -213,19 +213,19 @@ public static Scalar LightGbm(this RankingCatalog.RankingTrainers c Key groupId, Scalar weights = null, int? numberOfLeaves = null, - int? minimumDataPerLeaf = null, + int? minimumExampleCountPerLeaf = null, double? 
learningRate = null, int numberOfIterations = Options.Defaults.NumberOfIterations, Action onFit = null) { - CheckUserValues(label, features, weights, numberOfLeaves, minimumDataPerLeaf, learningRate, numberOfIterations, onFit); + CheckUserValues(label, features, weights, numberOfLeaves, minimumExampleCountPerLeaf, learningRate, numberOfIterations, onFit); Contracts.CheckValue(groupId, nameof(groupId)); var rec = new TrainerEstimatorReconciler.Ranker( (env, labelName, featuresName, groupIdName, weightsName) => { var trainer = new LightGbmRankingTrainer(env, labelName, featuresName, groupIdName, weightsName, numberOfLeaves, - minimumDataPerLeaf, learningRate, numberOfIterations); + minimumExampleCountPerLeaf, learningRate, numberOfIterations); if (onFit != null) return trainer.WithOnFitDelegate(trans => onFit(trans.Model)); @@ -285,7 +285,7 @@ public static Scalar LightGbm(this RankingCatalog.RankingTrainers c /// The features, or independent variables. /// The weights column. /// The number of leaves to use. - /// The minimal number of data points allowed in a leaf of the tree, out of the subsampled data. + /// The minimal number of data points allowed in a leaf of the tree, out of the subsampled data. /// The learning rate. /// Number of iterations. /// A delegate that is called every time the @@ -307,18 +307,18 @@ public static (Vector score, Key predictedLabel) Vector features, Scalar weights = null, int? numberOfLeaves = null, - int? minimumDataPerLeaf = null, + int? minimumExampleCountPerLeaf = null, double? learningRate = null, int numberOfIterations = Options.Defaults.NumberOfIterations, Action onFit = null) { - CheckUserValues(label, features, weights, numberOfLeaves, minimumDataPerLeaf, learningRate, numberOfIterations, onFit); + CheckUserValues(label, features, weights, numberOfLeaves, minimumExampleCountPerLeaf, learningRate, numberOfIterations, onFit); var rec = new TrainerEstimatorReconciler.MulticlassClassifier( (env, labelName, featuresName, weightsName) => { var trainer = new LightGbmMulticlassTrainer(env, labelName, featuresName, weightsName, numberOfLeaves, - minimumDataPerLeaf, learningRate, numberOfIterations); + minimumExampleCountPerLeaf, learningRate, numberOfIterations); if (onFit != null) return trainer.WithOnFitDelegate(trans => onFit(trans.Model)); diff --git a/src/Microsoft.ML.LightGBM/LightGbmArguments.cs b/src/Microsoft.ML.LightGBM/LightGbmArguments.cs index abedc67b42..21dbb0753f 100644 --- a/src/Microsoft.ML.LightGBM/LightGbmArguments.cs +++ b/src/Microsoft.ML.LightGBM/LightGbmArguments.cs @@ -46,9 +46,9 @@ public sealed class Options : TrainerInputBaseWithGroupId public abstract class BoosterParameter : IBoosterParameter where TOptions : class, new() { - protected TOptions BoosterParameterOptions { get; } + private protected TOptions BoosterParameterOptions { get; } - protected BoosterParameter(TOptions options) + private protected BoosterParameter(TOptions options) { BoosterParameterOptions = options; } @@ -105,12 +105,13 @@ private static string GetOptionName(string name) {nameof(TreeBooster.Options.MinimumSplitGain), "min_split_gain" }, {nameof(TreeBooster.Options.MaximumTreeDepth), "max_depth"}, {nameof(TreeBooster.Options.MinimumChildWeight), "min_child_weight"}, + {nameof(TreeBooster.Options.SubsampleFraction), "subsample"}, {nameof(TreeBooster.Options.SubsampleFrequency), "subsample_freq"}, {nameof(TreeBooster.Options.L1Regularization), "reg_alpha"}, {nameof(TreeBooster.Options.L2Regularization), "reg_lambda"}, 
{nameof(TreeBooster.Options.WeightOfPositiveExamples), "scale_pos_weight"}, {nameof(DartBooster.Options.TreeDropFraction), "drop_rate" }, - {nameof(DartBooster.Options.MaximumDroppedTreeCountPerRound), "max_drop" }, + {nameof(DartBooster.Options.MaximumNumberOfDroppedTreesPerRound), "max_drop" }, {nameof(DartBooster.Options.SkipDropFraction), "skip_drop" }, {nameof(MinimumExampleCountPerLeaf), "min_data_per_leaf"}, {nameof(NumberOfLeaves), "num_leaves"}, @@ -159,7 +160,7 @@ public class Options : ISupportBoosterParameterFactory [Argument(ArgumentType.AtMostOnce, HelpText = "Subsample frequency for bagging. 0 means no subsample. " - + "Specifies the frequency at which the bagging occurs, where if this is set to N, the subsampling will happen at N iterations. " + + "Specifies the frequency at which the bagging occurs, where if this is set to N, the subsampling will happen at every N iterations. " + "This must be set with Subsample as this specifies the amount to subsample.")] [TlcModule.Range(Min = 0, Max = int.MaxValue)] public int SubsampleFrequency = 0; @@ -168,7 +169,7 @@ public class Options : ISupportBoosterParameterFactory HelpText = "Subsample ratio of the training instance. Setting it to 0.5 means that LightGBM randomly collected " + "half of the data instances to grow trees and this will prevent overfitting. Range: (0,1].")] [TlcModule.Range(Inf = 0.0, Max = 1.0)] - public double Subsample = 1; + public double SubsampleFraction = 1; [Argument(ArgumentType.AtMostOnce, HelpText = "Subsample ratio of columns when constructing each tree. Range: (0,1].", @@ -208,7 +209,7 @@ internal TreeBooster(Options options) { Contracts.CheckUserArg(BoosterParameterOptions.MinimumSplitGain >= 0, nameof(BoosterParameterOptions.MinimumSplitGain), "must be >= 0."); Contracts.CheckUserArg(BoosterParameterOptions.MinimumChildWeight >= 0, nameof(BoosterParameterOptions.MinimumChildWeight), "must be >= 0."); - Contracts.CheckUserArg(BoosterParameterOptions.Subsample > 0 && BoosterParameterOptions.Subsample <= 1, nameof(BoosterParameterOptions.Subsample), "must be in (0,1]."); + Contracts.CheckUserArg(BoosterParameterOptions.SubsampleFraction > 0 && BoosterParameterOptions.SubsampleFraction <= 1, nameof(BoosterParameterOptions.SubsampleFraction), "must be in (0,1]."); Contracts.CheckUserArg(BoosterParameterOptions.FeatureFraction > 0 && BoosterParameterOptions.FeatureFraction <= 1, nameof(BoosterParameterOptions.FeatureFraction), "must be in (0,1]."); Contracts.CheckUserArg(BoosterParameterOptions.L2Regularization >= 0, nameof(BoosterParameterOptions.L2Regularization), "must be >= 0."); Contracts.CheckUserArg(BoosterParameterOptions.L1Regularization >= 0, nameof(BoosterParameterOptions.L1Regularization), "must be >= 0."); @@ -234,9 +235,9 @@ public sealed class Options : TreeBooster.Options [TlcModule.Range(Inf = 0.0, Max = 1.0)] public double TreeDropFraction = 0.1; - [Argument(ArgumentType.AtMostOnce, HelpText = "Maximum number of dropped tree in a boosting round.")] + [Argument(ArgumentType.AtMostOnce, HelpText = "Maximum number of dropped trees in a boosting round.")] [TlcModule.Range(Inf = 0, Max = int.MaxValue)] - public int MaximumDroppedTreeCountPerRound = 1; + public int MaximumNumberOfDroppedTreesPerRound = 1; [Argument(ArgumentType.AtMostOnce, HelpText = "Probability for not dropping in a boosting round.")] [TlcModule.Range(Inf = 0.0, Max = 1.0)] diff --git a/src/Microsoft.ML.LightGBM/LightGbmBinaryTrainer.cs b/src/Microsoft.ML.LightGBM/LightGbmBinaryTrainer.cs index ab85635cb1..66f28f98b1 100644
--- a/src/Microsoft.ML.LightGBM/LightGbmBinaryTrainer.cs
+++ b/src/Microsoft.ML.LightGBM/LightGbmBinaryTrainer.cs
@@ -8,7 +8,6 @@
 using Microsoft.ML.Data;
 using Microsoft.ML.EntryPoints;
 using Microsoft.ML.LightGBM;
-using Microsoft.ML.Model;
 using Microsoft.ML.Trainers;
 using Microsoft.ML.Trainers.FastTree;

@@ -85,7 +84,7 @@ private static IPredictorProducing Create(IHostEnvironment env, ModelLoad
     ///
     ///
     /// Light GBM is an open source implementation of boosted trees.
-    /// GitHub: LightGBM
+    /// GitHub: LightGBM
     ///
     public sealed class LightGbmBinaryTrainer : LightGbmTrainerBase>,
@@ -109,7 +108,7 @@ internal LightGbmBinaryTrainer(IHostEnvironment env, Options options)
     /// The private instance of .
-    /// The name of The label column.
+    /// The name of the label column.
     /// The name of the feature column.
-    /// The name for the column containing the initial weight.
+    /// The name of the example weight column (optional).
     /// The number of leaves to use.
     /// The minimal number of data points allowed in a leaf of the tree, out of the subsampled data.
     /// The learning rate.
diff --git a/src/Microsoft.ML.LightGBM/LightGbmMulticlassTrainer.cs b/src/Microsoft.ML.LightGBM/LightGbmMulticlassTrainer.cs
index a35b59572f..6e01c37c7d 100644
--- a/src/Microsoft.ML.LightGBM/LightGbmMulticlassTrainer.cs
+++ b/src/Microsoft.ML.LightGBM/LightGbmMulticlassTrainer.cs
@@ -43,7 +43,7 @@ internal LightGbmMulticlassTrainer(IHostEnvironment env, Options options)
     /// The private instance of .
-    /// The name of The label column.
+    /// The name of the label column.
     /// The name of the feature column.
-    /// The name for the column containing the initial weight.
+    /// The name of the example weight column (optional).
     /// The number of leaves to use.
     /// The minimal number of data points allowed in a leaf of the tree, out of the subsampled data.
     /// The learning rate.
@@ -157,7 +157,7 @@ private protected override void ConvertNaNLabels(IChannel ch, RoleMappedData dat
                 labels[i] = defaultLabel;
         }

-        protected override void GetDefaultParameters(IChannel ch, int numRow, bool hasCategorical, int totalCats, bool hiddenMsg = false)
+        private protected override void GetDefaultParameters(IChannel ch, int numRow, bool hasCategorical, int totalCats, bool hiddenMsg = false)
         {
             base.GetDefaultParameters(ch, numRow, hasCategorical, totalCats, true);
             int numberOfLeaves = (int)Options["num_leaves"];
diff --git a/src/Microsoft.ML.LightGBM/LightGbmRegressionTrainer.cs b/src/Microsoft.ML.LightGBM/LightGbmRegressionTrainer.cs
index 26f670dfe2..b1981250e3 100644
--- a/src/Microsoft.ML.LightGBM/LightGbmRegressionTrainer.cs
+++ b/src/Microsoft.ML.LightGBM/LightGbmRegressionTrainer.cs
@@ -87,7 +87,7 @@ public sealed class LightGbmRegressorTrainer : LightGbmTrainerBase
     /// The private instance of .
     /// The name of the label column.
     /// The name of the feature column.
-    /// The name for the column containing the initial weight.
+    /// The name of the example weight column (optional).
     /// The number of leaves to use.
     /// The minimal number of data points allowed in a leaf of the tree, out of the subsampled data.
     /// The learning rate.
diff --git a/src/Microsoft.ML.LightGBM/LightGbmTrainerBase.cs b/src/Microsoft.ML.LightGBM/LightGbmTrainerBase.cs
index 60f8e96bca..5d4504add7 100644
--- a/src/Microsoft.ML.LightGBM/LightGbmTrainerBase.cs
+++ b/src/Microsoft.ML.LightGBM/LightGbmTrainerBase.cs
@@ -165,7 +165,7 @@ private protected virtual void CheckDataValid(IChannel ch, RoleMappedData data)
             ch.CheckParam(data.Schema.Label.HasValue, nameof(data), "Need a label column");
         }

-        protected virtual void GetDefaultParameters(IChannel ch, int numRow, bool hasCategarical, int totalCats, bool hiddenMsg = false)
+        private protected virtual void GetDefaultParameters(IChannel ch, int numRow, bool hasCategorical, int totalCats, bool hiddenMsg = false)
         {
-            double learningRate = LightGbmTrainerOptions.LearningRate ?? DefaultLearningRate(numRow, hasCategarical, totalCats);
-            int numberOfLeaves = LightGbmTrainerOptions.NumberOfLeaves ?? DefaultNumLeaves(numRow, hasCategarical, totalCats);
+            double learningRate = LightGbmTrainerOptions.LearningRate ?? DefaultLearningRate(numRow, hasCategorical, totalCats);
+            int numberOfLeaves = LightGbmTrainerOptions.NumberOfLeaves ?? DefaultNumLeaves(numRow, hasCategorical, totalCats);
@@ -873,7 +873,7 @@ private static int DefaultNumLeaves(int numRow, bool useCat, int totalCats)
             return 30;
         }

-        protected static int DefaultMinDataPerLeaf(int numRow, int numberOfLeaves, int numClass)
+        private protected static int DefaultMinDataPerLeaf(int numRow, int numberOfLeaves, int numClass)
         {
             if (numClass > 1)
             {
diff --git a/test/BaselineOutput/Common/EntryPoints/core_manifest.json b/test/BaselineOutput/Common/EntryPoints/core_manifest.json
index 37fbbbdaa9..e26ef31800 100644
--- a/test/BaselineOutput/Common/EntryPoints/core_manifest.json
+++ b/test/BaselineOutput/Common/EntryPoints/core_manifest.json
@@ -23549,9 +23549,9 @@
          }
        },
        {
-          "Name": "MaximumDroppedTreesPerRound",
+          "Name": "MaximumNumberOfDroppedTreesPerRound",
          "Type": "Int",
-          "Desc": "Maximum number of dropped tree in a boosting round.",
+          "Desc": "Maximum number of dropped trees in a boosting round.",
          "Required": false,
          "SortOrder": 150.0,
          "IsNullable": false,
@@ -23644,7 +23644,7 @@
        {
          "Name": "SubsampleFrequency",
          "Type": "Int",
-          "Desc": "Subsample frequency for bagging. 0 means no subsample. If subsampleFreq > 0, it will use a subset to train and the subset will be updated on every Subsample iteration.",
+          "Desc": "Subsample frequency for bagging. 0 means no subsample. Specifies the frequency at which the bagging occurs, where if this is set to N, the subsampling will happen every N iterations. This must be set together with SubsampleFraction, which specifies the amount to subsample.",
          "Required": false,
          "SortOrder": 150.0,
          "IsNullable": false,
@@ -23655,7 +23655,7 @@
          }
        },
        {
-          "Name": "Subsample",
+          "Name": "SubsampleFraction",
          "Type": "Float",
          "Desc": "Subsample ratio of the training instance. Setting it to 0.5 means that LightGBM randomly collected half of the data instances to grow trees and this will prevent overfitting. Range: (0,1].",
          "Required": false,
@@ -23688,8 +23688,7 @@
          "Type": "Float",
          "Desc": "L2 regularization term on weights, increasing this value will make model more conservative.",
          "Aliases": [
-            "l2",
-            "RegLambda"
+            "l2"
          ],
          "Required": false,
          "SortOrder": 150.0,
@@ -23712,8 +23711,7 @@
          "Type": "Float",
          "Desc": "L1 regularization term on weights, increase this value will make model more conservative.",
          "Aliases": [
-            "l1",
-            "RegAlpha"
+            "l1"
          ],
          "Required": false,
          "SortOrder": 150.0,
@@ -23802,7 +23800,7 @@
        {
          "Name": "SubsampleFrequency",
          "Type": "Int",
-          "Desc": "Subsample frequency for bagging. 0 means no subsample. If subsampleFreq > 0, it will use a subset to train and the subset will be updated on every Subsample iteration.",
+          "Desc": "Subsample frequency for bagging. 0 means no subsample. Specifies the frequency at which the bagging occurs, where if this is set to N, the subsampling will happen every N iterations. This must be set together with SubsampleFraction, which specifies the amount to subsample.",
          "Required": false,
          "SortOrder": 150.0,
          "IsNullable": false,
@@ -23813,7 +23811,7 @@
          }
        },
        {
-          "Name": "Subsample",
+          "Name": "SubsampleFraction",
          "Type": "Float",
          "Desc": "Subsample ratio of the training instance. Setting it to 0.5 means that LightGBM randomly collected half of the data instances to grow trees and this will prevent overfitting. Range: (0,1].",
          "Required": false,
@@ -23846,8 +23844,7 @@
          "Type": "Float",
          "Desc": "L2 regularization term on weights, increasing this value will make model more conservative.",
          "Aliases": [
-            "l2",
-            "RegLambda"
+            "l2"
          ],
          "Required": false,
          "SortOrder": 150.0,
@@ -23870,8 +23867,7 @@
          "Type": "Float",
          "Desc": "L1 regularization term on weights, increase this value will make model more conservative.",
          "Aliases": [
-            "l1",
-            "RegAlpha"
+            "l1"
          ],
          "Required": false,
          "SortOrder": 150.0,
@@ -23986,7 +23982,7 @@
        {
          "Name": "SubsampleFrequency",
          "Type": "Int",
-          "Desc": "Subsample frequency for bagging. 0 means no subsample. If subsampleFreq > 0, it will use a subset to train and the subset will be updated on every Subsample iteration.",
+          "Desc": "Subsample frequency for bagging. 0 means no subsample. Specifies the frequency at which the bagging occurs, where if this is set to N, the subsampling will happen every N iterations. This must be set together with SubsampleFraction, which specifies the amount to subsample.",
          "Required": false,
          "SortOrder": 150.0,
          "IsNullable": false,
@@ -23997,7 +23993,7 @@
          }
        },
        {
-          "Name": "Subsample",
+          "Name": "SubsampleFraction",
          "Type": "Float",
          "Desc": "Subsample ratio of the training instance. Setting it to 0.5 means that LightGBM randomly collected half of the data instances to grow trees and this will prevent overfitting. Range: (0,1].",
          "Required": false,
@@ -24030,8 +24026,7 @@
          "Type": "Float",
          "Desc": "L2 regularization term on weights, increasing this value will make model more conservative.",
          "Aliases": [
-            "l2",
-            "RegLambda"
+            "l2"
          ],
          "Required": false,
          "SortOrder": 150.0,
@@ -24054,8 +24049,7 @@
          "Type": "Float",
          "Desc": "L1 regularization term on weights, increase this value will make model more conservative.",
          "Aliases": [
-            "l1",
-            "RegAlpha"
+            "l1"
          ],
          "Required": false,
          "SortOrder": 150.0,
diff --git a/test/BaselineOutput/Common/LightGBMMC/LightGBMMC-CV-iris.key-out.txt b/test/BaselineOutput/Common/LightGBMMC/LightGBMMC-CV-iris.key-out.txt
index 7f10dc44ad..5d529e5e17 100644
--- a/test/BaselineOutput/Common/LightGBMMC/LightGBMMC-CV-iris.key-out.txt
+++ b/test/BaselineOutput/Common/LightGBMMC/LightGBMMC-CV-iris.key-out.txt
@@ -1,12 +1,12 @@
 maml.exe CV tr=LightGBMMC{nt=1 iter=10 v=- lr=0.2 mil=10 nl=20} threads=- dout=%Output% loader=Text{col=Label:TX:0 col=Features:1-*} data=%Data% seed=1 xf=Term{col=Label}
 Not adding a normalizer.
 Auto-tuning parameters: UseCategoricalSplit = False
-Auto-tuning parameters: UseSoftMax = False
+Auto-tuning parameters: UseSoftmax = False
 LightGBM objective=multiclassova
 Not training a calibrator because it is not needed.
 Not adding a normalizer.
 Auto-tuning parameters: UseCategoricalSplit = False
-Auto-tuning parameters: UseSoftMax = False
+Auto-tuning parameters: UseSoftmax = False
 LightGBM objective=multiclassova
 Not training a calibrator because it is not needed.
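For reference, the renamed bagging options compose as in the following sketch. This is illustrative only: it assumes an MLContext named mlContext and an IDataView named trainData (neither appears in this patch), and the numeric values are arbitrary. SubsampleFraction sets how much of the data each bagging pass draws; SubsampleFrequency sets how often, in iterations, the subsample is redrawn.

    // Illustrative sketch of the renamed TreeBooster bagging options.
    // mlContext and trainData are assumed to exist; values are arbitrary.
    var trainer = mlContext.Regression.Trainers.LightGbm(new Options
    {
        LabelColumnName = "Label",
        NumberOfIterations = 100,
        Booster = new TreeBooster.Options
        {
            SubsampleFraction = 0.8, // draw 80% of the rows for each bagging pass
            SubsampleFrequency = 5   // redraw the subsample every 5 iterations
        }
    });
    var model = trainer.Fit(trainData);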
diff --git a/test/BaselineOutput/Common/LightGBMMC/LightGBMMC-CV-iris.keyU404-out.txt b/test/BaselineOutput/Common/LightGBMMC/LightGBMMC-CV-iris.keyU404-out.txt
index 4765814e37..0902395bec 100644
--- a/test/BaselineOutput/Common/LightGBMMC/LightGBMMC-CV-iris.keyU404-out.txt
+++ b/test/BaselineOutput/Common/LightGBMMC/LightGBMMC-CV-iris.keyU404-out.txt
@@ -1,12 +1,12 @@
 maml.exe CV tr=LightGBMMC{nt=1 iter=10 v=- lr=0.2 mil=10 nl=20} threads=- dout=%Output% loader=Text{col=Label:U4[0-2]:0 col=Features:1-4} data=%Data% seed=1
 Not adding a normalizer.
 Auto-tuning parameters: UseCategoricalSplit = False
-Auto-tuning parameters: UseSoftMax = False
+Auto-tuning parameters: UseSoftmax = False
 LightGBM objective=multiclassova
 Not training a calibrator because it is not needed.
 Not adding a normalizer.
 Auto-tuning parameters: UseCategoricalSplit = False
-Auto-tuning parameters: UseSoftMax = False
+Auto-tuning parameters: UseSoftmax = False
 LightGBM objective=multiclassova
 Not training a calibrator because it is not needed.
diff --git a/test/BaselineOutput/Common/LightGBMMC/LightGBMMC-TrainTest-iris.key-out.txt b/test/BaselineOutput/Common/LightGBMMC/LightGBMMC-TrainTest-iris.key-out.txt
index d65bd3ff29..5fba9409c5 100644
--- a/test/BaselineOutput/Common/LightGBMMC/LightGBMMC-TrainTest-iris.key-out.txt
+++ b/test/BaselineOutput/Common/LightGBMMC/LightGBMMC-TrainTest-iris.key-out.txt
@@ -1,7 +1,7 @@
 maml.exe TrainTest test=%Data% tr=LightGBMMC{nt=1 iter=10 v=- lr=0.2 mil=10 nl=20} dout=%Output% loader=Text{col=Label:TX:0 col=Features:1-*} data=%Data% out=%Output% seed=1 xf=Term{col=Label}
 Not adding a normalizer.
 Auto-tuning parameters: UseCategoricalSplit = False
-Auto-tuning parameters: UseSoftMax = False
+Auto-tuning parameters: UseSoftmax = False
 LightGBM objective=multiclassova
 Not training a calibrator because it is not needed.
diff --git a/test/BaselineOutput/Common/LightGBMMC/LightGBMMC-TrainTest-iris.keyU404-out.txt b/test/BaselineOutput/Common/LightGBMMC/LightGBMMC-TrainTest-iris.keyU404-out.txt
index 3398f10263..ff43c349eb 100644
--- a/test/BaselineOutput/Common/LightGBMMC/LightGBMMC-TrainTest-iris.keyU404-out.txt
+++ b/test/BaselineOutput/Common/LightGBMMC/LightGBMMC-TrainTest-iris.keyU404-out.txt
@@ -1,7 +1,7 @@
 maml.exe TrainTest test=%Data% tr=LightGBMMC{nt=1 iter=10 v=- lr=0.2 mil=10 nl=20} dout=%Output% loader=Text{col=Label:U4[0-2]:0 col=Features:1-4} data=%Data% out=%Output% seed=1
 Not adding a normalizer.
 Auto-tuning parameters: UseCategoricalSplit = False
-Auto-tuning parameters: UseSoftMax = False
+Auto-tuning parameters: UseSoftmax = False
 LightGBM objective=multiclassova
 Not training a calibrator because it is not needed.
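The DART-specific renames line up with the LightGBM option names in the dictionary above (drop_rate, max_drop, skip_drop). A sketch under the same assumptions as before (mlContext and trainData are illustrative, as are the values):

    // Illustrative sketch of the renamed DartBooster options.
    var dartTrainer = mlContext.Regression.Trainers.LightGbm(new Options
    {
        LabelColumnName = "Label",
        Booster = new DartBooster.Options
        {
            TreeDropFraction = 0.15,                 // maps to drop_rate
            MaximumNumberOfDroppedTreesPerRound = 2, // maps to max_drop
            SkipDropFraction = 0.5                   // maps to skip_drop
        }
    });
    var dartModel = dartTrainer.Fit(trainData);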