handle exception during GetNextPipeline for AutoML #5455

Merged: 3 commits, Oct 30, 2020
src/Microsoft.ML.AutoML/Experiment/Experiment.cs (2 changes: 1 addition & 1 deletion)

    @@ -63,7 +63,7 @@ public IList<TRunDetail> Execute()
         // get next pipeline
         var getPipelineStopwatch = Stopwatch.StartNew();
         var pipeline = PipelineSuggester.GetNextInferredPipeline(_context, _history, _datasetColumnInfo, _task,
    -        _optimizingMetricInfo.IsMaximizing, _experimentSettings.CacheBeforeTrainer, _trainerAllowList);
    +        _optimizingMetricInfo.IsMaximizing, _experimentSettings.CacheBeforeTrainer, _logger, _trainerAllowList);

         var pipelineInferenceTimeInSeconds = getPipelineStopwatch.Elapsed.TotalSeconds;
src/Microsoft.ML.AutoML/PipelineSuggesters/PipelineSuggester.cs (56 changes: 36 additions & 20 deletions)

    @@ -6,6 +6,8 @@
     using System.Collections.Generic;
     using System.Linq;
     using Microsoft.ML.Data;
    +using Microsoft.ML.Internal.Utilities;
    +using Microsoft.ML.Runtime;

     namespace Microsoft.ML.AutoML
     {
    @@ -17,10 +19,11 @@ public static Pipeline GetNextPipeline(MLContext context,
             IEnumerable<PipelineScore> history,
             DatasetColumnInfo[] columns,
             TaskKind task,
    +        IChannel logger,
             bool isMaximizingMetric = true)
         {
             var inferredHistory = history.Select(r => SuggestedPipelineRunDetail.FromPipelineRunResult(context, r));
    -        var nextInferredPipeline = GetNextInferredPipeline(context, inferredHistory, columns, task, isMaximizingMetric, CacheBeforeTrainer.Auto);
    +        var nextInferredPipeline = GetNextInferredPipeline(context, inferredHistory, columns, task, isMaximizingMetric, CacheBeforeTrainer.Auto, logger);
             return nextInferredPipeline?.ToPipeline();
         }

    @@ -30,6 +33,7 @@ public static SuggestedPipeline GetNextInferredPipeline(MLContext context,
             TaskKind task,
             bool isMaximizingMetric,
             CacheBeforeTrainer cacheBeforeTrainer,
    +        IChannel logger,
             IEnumerable<TrainerName> trainerAllowList = null)
         {
             var availableTrainers = RecipeInference.AllowedTrainers(context, task,
    @@ -64,7 +68,7 @@ public static SuggestedPipeline GetNextInferredPipeline(MLContext context,
             do
             {
                 // sample new hyperparameters for the learner
    -            if (!SampleHyperparameters(context, newTrainer, history, isMaximizingMetric))
    +            if (!SampleHyperparameters(context, newTrainer, history, isMaximizingMetric, logger))
                 {
                     // if unable to sample new hyperparameters for the learner
                     // (ie SMAC returned 0 suggestions), break
    @@ -188,30 +192,42 @@ private static IValueGenerator[] ConvertToValueGenerators(IEnumerable<SweepableP
         /// Samples new hyperparameters for the trainer, and sets them.
         /// Returns true if success (new hyperparameters were suggested and set). Else, returns false.
         /// </summary>
    -    private static bool SampleHyperparameters(MLContext context, SuggestedTrainer trainer, IEnumerable<SuggestedPipelineRunDetail> history, bool isMaximizingMetric)
    +    private static bool SampleHyperparameters(MLContext context, SuggestedTrainer trainer,
    +        IEnumerable<SuggestedPipelineRunDetail> history, bool isMaximizingMetric, IChannel logger)
         {
    -        var sps = ConvertToValueGenerators(trainer.SweepParams);
    -        var sweeper = new SmacSweeper(context,
    -            new SmacSweeper.Arguments
    -            {
    -                SweptParameters = sps
    -            });
    -
    -        IEnumerable<SuggestedPipelineRunDetail> historyToUse = history
    -            .Where(r => r.RunSucceeded && r.Pipeline.Trainer.TrainerName == trainer.TrainerName && r.Pipeline.Trainer.HyperParamSet != null && r.Pipeline.Trainer.HyperParamSet.Any());
    -
    -        // get new set of hyperparameter values
    -        var proposedParamSet = sweeper.ProposeSweeps(1, historyToUse.Select(h => h.ToRunResult(isMaximizingMetric))).First();
    -        if (!proposedParamSet.Any())
    -        {
    -            return false;
    -        }
    -
    -        // associate proposed parameter set with trainer, so that smart hyperparameter
    -        // sweepers (like KDO) can map them back.
    -        trainer.SetHyperparamValues(proposedParamSet);
    -
    -        return true;
    +        try
    +        {
    +            var sps = ConvertToValueGenerators(trainer.SweepParams);
    +            var sweeper = new SmacSweeper(context,
    +                new SmacSweeper.Arguments
    +                {
    +                    SweptParameters = sps
    +                });
    +
    +            IEnumerable<SuggestedPipelineRunDetail> historyToUse = history
    +                .Where(r => r.RunSucceeded && r.Pipeline.Trainer.TrainerName == trainer.TrainerName &&
    +                    r.Pipeline.Trainer.HyperParamSet != null &&
    +                    r.Pipeline.Trainer.HyperParamSet.Any() &&
    +                    FloatUtils.IsFinite(r.Score));
    +
    +            // get new set of hyperparameter values
    +            var proposedParamSet = sweeper.ProposeSweeps(1, historyToUse.Select(h => h.ToRunResult(isMaximizingMetric))).FirstOrDefault();
    +            if (!proposedParamSet.Any())
    +            {
    +                return false;
    +            }
    +
    +            // associate proposed parameter set with trainer, so that smart hyperparameter
    +            // sweepers (like KDO) can map them back.
    +            trainer.SetHyperparamValues(proposedParamSet);
    +
    +            return true;
    +        }
    +        catch (Exception ex)
    +        {
    +            logger.Error($"SampleHyperparameters failed with exception: {ex}");
    +            throw;
    +        }
         }
     }
    }
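One nuance in the new catch block above: it logs through the IChannel and then rethrows with a bare throw, which preserves the original stack trace (throw ex; would reset it), so the experiment loop still surfaces the real failure. A minimal standalone sketch of that pattern, with Console.Error standing in for the IChannel (the demo deliberately ends with the rethrown exception):

    using System;

    class LogAndRethrow
    {
        static void Risky() => throw new InvalidOperationException("boom");

        static void Main()
        {
            try
            {
                Risky();
            }
            catch (Exception ex)
            {
                // Log the full exception detail, as the PR does via logger.Error(...)
                Console.Error.WriteLine($"SampleHyperparameters failed with exception: {ex}");
                throw; // bare `throw` keeps the original stack trace; `throw ex;` would not
            }
        }
    }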
Review comment from justinormont (Contributor), on the sweeper.ProposeSweeps call:

From #5428 (comment):

> ...another issue with AutoML experiment that after about 70 round of experiment...

Iteration ~70 is interesting, as it's when the SMAC sweeper moves from random sweeping (where it explores the hyperparameter space) to a Bayesian style, where it starts to generate new hyperparameter sets and rank them. That's why it dies in sweeper.ProposeSweeps() at ~70.

The sweeping pattern is (see the arithmetic sketch after this list):

  • One iteration per trainer for the task -- trying defaults for each trainer (~8-12 iterations)
    • Cull to top 3 trainers
  • 20 iterations * 3 trainers -- warm-up random sweeping (60 iterations)
  • Until time expires -- SMAC-based, Bayesian-style hyperparameter optimization (hopefully another 50-100 iterations) <= dies at the start of this step
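To make the ~70 concrete: the defaults stage plus the random warm-up already account for roughly 70 iterations before the Bayesian stage begins. A standalone C# sketch of that arithmetic; the constants are illustrative values taken from the description above, not values read from AutoML's code:

    using System;

    class SweepScheduleSketch
    {
        static void Main()
        {
            int defaultRuns = 10;        // ~8-12: one default run per trainer for the task
            int topTrainers = 3;         // trainers kept after culling
            int randomPerTrainer = 20;   // warm-up random sweeping rounds per trainer

            int warmUp = topTrainers * randomPerTrainer;  // 60 iterations
            int bayesianStart = defaultRuns + warmUp;     // ~70: SMAC's Bayesian phase starts here

            Console.WriteLine($"Bayesian-style sweeping starts near iteration {bayesianStart}");
        }
    }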

Reply from frank-dong-ms-zz (Contributor, Author):

Thanks for explaining the behavior of the AutoML experiment here.
I debugged the issue, and it looks like in this case, at some stage (around 70 iterations), the Score of every valid history entry becomes NaN. NaN is not accepted as a valid number by the trainer (FastTree in this case), so we got the "All instances skipped due to missing features" error.
Here I filtered the history with an additional condition: the score must be finite.
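As a self-contained illustration of that condition (plain tuples stand in for SuggestedPipelineRunDetail; FloatUtils.IsFinite is internal to ML.NET, so this sketch uses the equivalent double checks):

    using System;
    using System.Linq;

    // Standalone sketch of the added guard: drop history rows whose metric is
    // NaN/Infinity before they reach the SMAC sweeper. FloatUtils.IsFinite(x)
    // is equivalent to !IsNaN(x) && !IsInfinity(x) here.
    class FiniteScoreFilter
    {
        static void Main()
        {
            var history = new[]
            {
                (Score: 0.83, RunSucceeded: true),
                (Score: double.NaN, RunSucceeded: true),  // metric failed to compute
                (Score: 0.79, RunSucceeded: true),
                (Score: double.PositiveInfinity, RunSucceeded: true),
            };

            var usable = history.Where(r => r.RunSucceeded
                && !double.IsNaN(r.Score) && !double.IsInfinity(r.Score)).ToArray();

            Console.WriteLine($"{usable.Length} of {history.Length} runs are usable"); // 2 of 4
        }
    }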


Reply from justinormont (Contributor):

Interesting. I would have expected that FastTree would simply skip rows with NaN labels.

FastTree may be failing due to NaNs in the feature vector (aka, the hyperparameter set for the pipeline).

As a test, would it be possible to fill in a value (perhaps 0.5) for the score field? This would test whether (1) SMAC is dying due to the score column being NaN, or (2) our filtering on FloatUtils.IsFinite(r.Score) indirectly removes rows, which would cause SMAC to fail due to missing values within the feature vector.

If filling with 0.5 makes SMAC work, then your filtering by inspecting the score is a good method. If it still fails, we should instead filter the history by looking for NaN values in the feature vectors (or perhaps consider throwing, as they are unexpected).

NaN values for metrics are expected/semi-common, but I wouldn't expect a NaN in the feature vector.

After filtering, are there any rows left from the history? Does it continue sweeping, or does it exit at ~70?

Reply from frank-dong-ms-zz (Contributor, Author):

> I would have expected that FastTree would simply skip rows with NaN labels.

Yes, this is exactly what happens here, except that in some cases all of the labels used are NaN, and the trainer (FastTree) complains about that (the trainer needs at least 1 valid label). After filtering, the sampling method will return at the line below, and no sampling will happen: https://github.com/dotnet/machinelearning/blob/master/src/Microsoft.ML.AutoML/PipelineSuggesters/PipelineSuggester.cs#L207

> FastTree may be failing due to NaNs in the feature vector (aka, the hyperparameter set for the pipeline).

As far as I can see, the feature vector looks good in this case.

> As a test, would it be possible to fill in a value (perhaps 0.5) for the score field?

Yes, I ran the test you suggested, changing Score from NaN to 0.5, and the experiment succeeds, running 150 iterations over 10 minutes.
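A minimal sketch of that diagnostic; this is purely a test-time hack from the discussion, not part of the PR (the 0.5 sentinel and the bare score list are illustrative):

    using System;
    using System.Linq;

    // Diagnostic: instead of filtering NaN-scored runs, replace their score with
    // a 0.5 sentinel. If SMAC then works, NaN score labels were the problem; if
    // it still fails, the NaNs are in the feature vectors.
    class SentinelScoreDiagnostic
    {
        static void Main()
        {
            var scores = new[] { 0.83, double.NaN, 0.79, double.NaN };

            var patched = scores
                .Select(score => double.IsNaN(score) ? 0.5 : score)
                .ToArray();

            Console.WriteLine(string.Join(", ", patched)); // 0.83, 0.5, 0.79, 0.5
        }
    }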

> NaN values for metrics are expected/semi-common, but I wouldn't expect a NaN in the feature vector.

NaN values for metrics (labels) will be filtered out during DataView cursor access anyway, so filtering them out in advance keeps the logic consistent: https://github.com/dotnet/machinelearning/blob/master/src/Microsoft.ML.Data/Training/TrainerUtils.cs#L856
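A rough stand-in for what that cursor-side filtering amounts to; illustrative only, since the real logic in TrainerUtils.cs operates over IDataView cursors:

    using System.Collections.Generic;

    static class FiniteLabelRows
    {
        // Mimics the trainer-side behavior: rows whose label is NaN (or infinite)
        // never reach the learner, so filtering them out earlier changes nothing.
        public static IEnumerable<(float[] Features, float Label)> Filter(
            IEnumerable<(float[] Features, float Label)> rows)
        {
            foreach (var row in rows)
            {
                if (!float.IsNaN(row.Label) && !float.IsInfinity(row.Label))
                    yield return row;
            }
        }
    }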

> After filtering, are there any rows left from the history? Does it continue sweeping, or does it exit at ~70?

After filtering (instead of changing NaN to 0.5), the experiment continues sweeping and finishes at iteration 150; the history data for those 150 iterations looks valid.

Comment from justinormont (Contributor):

@frank-dong-ms: Thanks for investigating. This all sounds good and the NaN filtering on the score does seem like the right route.

I'd recommend pushing in #5163 before this PR.

Reply from frank-dong-ms-zz (Contributor, Author):

@justinormont Sure, thanks Justin for your review and great suggestions.

Comment from frank-dong-ms-zz (Contributor, Author), Oct 30, 2020:

#5163 has been merged; could you approve this PR so I can merge this one? @justinormont


test/Microsoft.ML.AutoML.Tests/GetNextPipelineTests.cs (6 changes: 4 additions & 2 deletions)

    @@ -9,6 +9,7 @@
     using Newtonsoft.Json;
     using Microsoft.ML.TestFramework;
     using Xunit.Abstractions;
    +using Microsoft.ML.Runtime;

     namespace Microsoft.ML.AutoML.Test
     {
    @@ -27,7 +28,8 @@ public void GetNextPipeline()
         var columns = DatasetColumnInfoUtil.GetDatasetColumnInfo(context, uciAdult, new ColumnInformation() { LabelColumnName = DatasetUtil.UciAdultLabel });

         // get next pipeline
    -    var pipeline = PipelineSuggester.GetNextPipeline(context, new List<PipelineScore>(), columns, TaskKind.BinaryClassification);
    +    var pipeline = PipelineSuggester.GetNextPipeline(context, new List<PipelineScore>(), columns,
    +        TaskKind.BinaryClassification, ((IChannelProvider)context).Start("AutoMLTest"));

         // serialize & deserialize pipeline
         var serialized = JsonConvert.SerializeObject(pipeline);
    @@ -57,7 +59,7 @@ public void GetNextPipelineMock()
         for (var i = 0; i < maxIterations; i++)
         {
             // Get next pipeline
    -        var pipeline = PipelineSuggester.GetNextPipeline(context, history, columns, task);
    +        var pipeline = PipelineSuggester.GetNextPipeline(context, history, columns, task, ((IChannelProvider)context).Start("AutoMLTest"));
             if (pipeline == null)
             {
                 break;
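A side note on the test change: the cast suggests MLContext implements IChannelProvider explicitly, so the interface has to be named to reach Start. A hedged sketch of creating and disposing such a channel (the Info helper is assumed to be the standard IChannel logging extension; the tests above skip disposal for brevity):

    using Microsoft.ML;
    using Microsoft.ML.Runtime;

    class ChannelUsageSketch
    {
        static void Demo()
        {
            var context = new MLContext();

            // IChannel is IDisposable, so `using` closes the channel when done.
            using (var logger = ((IChannelProvider)context).Start("AutoMLTest"))
            {
                logger.Info("channel started"); // assumed helper, mirroring ML.NET internals
            }
        }
    }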