Skip to content

Commit

Permalink
get next pipeline API rev -- refactor API to consume column dimension…
Browse files Browse the repository at this point in the history
…s, purpose, type, and name instead of available trainers & transforms (dotnet#19)
  • Loading branch information
daholste authored and Dmitry-A committed Aug 22, 2019
1 parent 816e8e8 commit eac3695
Show file tree
Hide file tree
Showing 14 changed files with 205 additions and 293 deletions.
15 changes: 0 additions & 15 deletions src/AutoML/API/MLContextAutoFitExtensions.cs
Original file line number Diff line number Diff line change
Expand Up @@ -47,11 +47,6 @@ internal static RegressionResult AutoFit(this RegressionContext context,
var bestResult = new RegressionIterationResult(bestPipeline.Model, (RegressionMetrics)bestPipeline.EvaluatedMetrics, bestPipeline.ScoredValidationData, bestPipeline.Pipeline.ToPipeline());
return new RegressionResult(bestResult, results);
}

public static Pipeline GetPipeline(this RegressionContext context, IDataView dataView, string label)
{
return PipelineSuggesterApi.GetPipeline(TaskKind.Regression, dataView, label);
}
}

public static class BinaryClassificationExtensions
Expand Down Expand Up @@ -96,11 +91,6 @@ internal static BinaryClassificationResult AutoFit(this BinaryClassificationCont
var bestResult = new BinaryClassificationItertionResult(bestPipeline.Model, (BinaryClassificationMetrics)bestPipeline.EvaluatedMetrics, bestPipeline.ScoredValidationData, bestPipeline.Pipeline.ToPipeline());
return new BinaryClassificationResult(bestResult, results);
}

public static Pipeline GetPipeline(this BinaryClassificationContext context, IDataView dataView, string label)
{
return PipelineSuggesterApi.GetPipeline(TaskKind.BinaryClassification, dataView, label);
}
}

public static class MulticlassExtensions
Expand Down Expand Up @@ -144,11 +134,6 @@ internal static MulticlassClassificationResult AutoFit(this MulticlassClassifica
var bestResult = new MulticlassClassificationIterationResult(bestPipeline.Model, (MultiClassClassifierMetrics)bestPipeline.EvaluatedMetrics, bestPipeline.ScoredValidationData, bestPipeline.Pipeline.ToPipeline());
return new MulticlassClassificationResult(bestResult, results);
}

public static Pipeline GetPipeline(this MulticlassClassificationContext context, IDataView dataView, string label)
{
return PipelineSuggesterApi.GetPipeline(TaskKind.MulticlassClassification, dataView, label);
}
}

public class BinaryClassificationResult
Expand Down
20 changes: 10 additions & 10 deletions src/AutoML/API/Pipeline.cs
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,11 @@ namespace Microsoft.ML.Auto
{
public class Pipeline
{
public PipelineNode[] Elements { get; set; }
public PipelineNode[] Nodes { get; set; }

public Pipeline(PipelineNode[] elements)
public Pipeline(PipelineNode[] nodes)
{
Elements = elements;
Nodes = nodes;
}

// (used by Newtonsoft)
Expand All @@ -27,31 +27,31 @@ public IEstimator<ITransformer> ToEstimator()
public class PipelineNode
{
public string Name { get; set; }
public PipelineNodeType ElementType { get; set; }
public PipelineNodeType NodeType { get; set; }
public string[] InColumns { get; set; }
public string[] OutColumns { get; set; }
public IDictionary<string, object> Properties { get; set; }

public PipelineNode(string name, PipelineNodeType elementType,
public PipelineNode(string name, PipelineNodeType nodeType,
string[] inColumns, string[] outColumns,
IDictionary<string, object> properties = null)
{
Name = name;
ElementType = elementType;
NodeType = nodeType;
InColumns = inColumns;
OutColumns = outColumns;
Properties = properties ?? new Dictionary<string, object>();
}

public PipelineNode(string name, PipelineNodeType elementType,
public PipelineNode(string name, PipelineNodeType nodeType,
string inColumn, string outColumn, IDictionary<string, object> properties = null) :
this(name, elementType, new string[] { inColumn }, new string[] { outColumn }, properties)
this(name, nodeType, new string[] { inColumn }, new string[] { outColumn }, properties)
{
}

public PipelineNode(string name, PipelineNodeType elementType,
public PipelineNode(string name, PipelineNodeType nodeType,
string[] inColumns, string outColumn, IDictionary<string, object> properties = null) :
this(name, elementType, inColumns, new string[] { outColumn }, properties)
this(name, nodeType, inColumns, new string[] { outColumn }, properties)
{
}

Expand Down
18 changes: 9 additions & 9 deletions src/AutoML/AutoFitter/AutoFitter.cs
Original file line number Diff line number Diff line change
Expand Up @@ -16,22 +16,22 @@ internal class AutoFitter
private readonly IDebugLogger _debugLogger;
private readonly IList<InferredPipelineRunResult> _history;
private readonly string _label;
private readonly MLContext _mlContext;
private readonly MLContext _context;
private readonly OptimizingMetricInfo _optimizingMetricInfo;
private readonly IDictionary<string, ColumnPurpose> _purposeOverrides;
private readonly AutoFitSettings _settings;
private readonly IDataView _trainData;
private readonly TaskKind _task;
private readonly IDataView _validationData;

public AutoFitter(MLContext mlContext, OptimizingMetricInfo metricInfo, AutoFitSettings settings,
public AutoFitter(MLContext context, OptimizingMetricInfo metricInfo, AutoFitSettings settings,
TaskKind task, string label, IDataView trainData, IDataView validationData,
IDictionary<string, ColumnPurpose> purposeOverrides, IDebugLogger debugLogger)
{
_debugLogger = debugLogger;
_history = new List<InferredPipelineRunResult>();
_label = label;
_mlContext = mlContext;
_context = context;
_optimizingMetricInfo = metricInfo;
_settings = settings ?? new AutoFitSettings();
_purposeOverrides = purposeOverrides;
Expand All @@ -49,13 +49,13 @@ public InferredPipelineRunResult[] Fit()
private void IteratePipelinesAndFit()
{
var stopwatch = Stopwatch.StartNew();
var transforms = TransformInferenceApi.InferTransforms(_mlContext, _trainData, _label, _purposeOverrides);
var availableTrainers = RecipeInference.AllowedTrainers(_mlContext, _task, _settings.StoppingCriteria.MaxIterations);
var columns = AutoMlUtils.GetColumnInfoTuples(_context, _trainData, _label, _purposeOverrides);

do
{
// get next pipeline
var pipeline = PipelineSuggester.GetNextInferredPipeline(_history, transforms, availableTrainers, _optimizingMetricInfo.IsMaximizing);
var iterationsRemaining = _settings.StoppingCriteria.MaxIterations - _history.Count;
var pipeline = PipelineSuggester.GetNextInferredPipeline(_history, columns, _task, iterationsRemaining, _optimizingMetricInfo.IsMaximizing);

// break if no candidates returned, means no valid pipeline available
if (pipeline == null)
Expand Down Expand Up @@ -113,11 +113,11 @@ private object GetEvaluatedMetrics(IDataView scoredData)
switch(_task)
{
case TaskKind.BinaryClassification:
return _mlContext.BinaryClassification.EvaluateNonCalibrated(scoredData);
return _context.BinaryClassification.EvaluateNonCalibrated(scoredData);
case TaskKind.MulticlassClassification:
return _mlContext.MulticlassClassification.Evaluate(scoredData);
return _context.MulticlassClassification.Evaluate(scoredData);
case TaskKind.Regression:
return _mlContext.Regression.Evaluate(scoredData);
return _context.Regression.Evaluate(scoredData);
// should not be possible to reach here
default:
throw new InvalidOperationException($"unsupported machine learning task type {_task}");
Expand Down
6 changes: 3 additions & 3 deletions src/AutoML/AutoFitter/InferredPipeline.cs
Original file line number Diff line number Diff line change
Expand Up @@ -70,17 +70,17 @@ public static InferredPipeline FromPipeline(Pipeline pipeline)
var transforms = new List<SuggestedTransform>();
SuggestedTrainer trainer = null;

foreach(var pipelineNode in pipeline.Elements)
foreach(var pipelineNode in pipeline.Nodes)
{
if(pipelineNode.ElementType == PipelineNodeType.Trainer)
if(pipelineNode.NodeType == PipelineNodeType.Trainer)
{
var trainerName = (TrainerName)Enum.Parse(typeof(TrainerName), pipelineNode.Name);
var trainerExtension = TrainerExtensionCatalog.GetTrainerExtension(trainerName);
var stringParamVals = pipelineNode.Properties.Select(prop => new StringParameterValue(prop.Key, prop.Value.ToString()));
var hyperParamSet = new ParameterSet(stringParamVals);
trainer = new SuggestedTrainer(context, trainerExtension, hyperParamSet);
}
else if (pipelineNode.ElementType == PipelineNodeType.Transform)
else if (pipelineNode.NodeType == PipelineNodeType.Transform)
{
var estimatorName = (EstimatorName)Enum.Parse(typeof(EstimatorName), pipelineNode.Name);
var estimatorExtension = EstimatorExtensionCatalog.GetExtension(estimatorName);
Expand Down
16 changes: 16 additions & 0 deletions src/AutoML/AutoMlUtils.cs
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
// See the LICENSE file in the project root for more information.

using System;
using System.Collections.Generic;
using System.Linq;
using Microsoft.ML.Data;
using Microsoft.ML.Transforms;
Expand All @@ -29,5 +30,20 @@ public static IDataView Take(this IDataView data, int count)
var take = SkipTakeFilter.Create(env, new SkipTakeFilter.TakeArguments { Count = count }, data);
return new CacheDataView(env, data, Enumerable.Range(0, data.Schema.Count).ToArray());
}

/// <summary>
/// Builds one (name, type, purpose, dimensions) tuple for every column in the schema of
/// <paramref name="data"/>, in schema order.
/// </summary>
/// <param name="context">Context forwarded to purpose inference.</param>
/// <param name="data">Data view whose schema columns are described.</param>
/// <param name="label">Label column name forwarded to purpose inference.</param>
/// <param name="purposeOverrides">Purpose overrides forwarded to purpose inference.</param>
/// <returns>An array holding one tuple per schema column.</returns>
public static (string, ColumnType, ColumnPurpose, ColumnDimensions)[] GetColumnInfoTuples(MLContext context,
    IDataView data, string label, IDictionary<string, ColumnPurpose> purposeOverrides)
{
    var inferredPurposes = PurposeInference.InferPurposes(context, data, label, purposeOverrides);
    var dimensions = DatasetDimensionsApi.CalcColumnDimensions(data, inferredPurposes);

    // NOTE(review): relies on inferredPurposes and dimensions being index-aligned
    // with data.Schema — confirm against PurposeInference.InferPurposes.
    return Enumerable.Range(0, data.Schema.Count)
        .Select(index =>
        {
            var schemaColumn = data.Schema[index];
            return (schemaColumn.Name, schemaColumn.Type, inferredPurposes[index].Purpose, dimensions[index]);
        })
        .ToArray();
}
}
}
14 changes: 14 additions & 0 deletions src/AutoML/DatasetDimensions/ColumnDimensions.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
namespace Microsoft.ML.Auto
{
/// <summary>
/// Plain holder for the dimensions recorded for a single dataset column.
/// Both values are nullable; null is stored as-is when the caller supplies it.
/// </summary>
internal class ColumnDimensions
{
    /// <summary>Cardinality value supplied at construction, or null.</summary>
    public int? Cardinality;

    /// <summary>Missing-value flag supplied at construction, or null.</summary>
    public bool? HasMissing;

    /// <summary>
    /// Stores the given dimension values without any validation.
    /// </summary>
    /// <param name="cardinality">Value assigned to <see cref="Cardinality"/>.</param>
    /// <param name="hasMissing">Value assigned to <see cref="HasMissing"/>.</param>
    public ColumnDimensions(int? cardinality, bool? hasMissing)
    {
        this.Cardinality = cardinality;
        this.HasMissing = hasMissing;
    }
}
}
45 changes: 45 additions & 0 deletions src/AutoML/DatasetDimensions/DatasetDimensionsApi.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
using Microsoft.ML.Data;

namespace Microsoft.ML.Auto
{
/// <summary>
/// Computes per-column dimensions (text cardinality and presence of missing
/// numeric values) for the columns of a data view.
/// </summary>
internal class DatasetDimensionsApi
{
    // Row count handed to Take before any column is scanned.
    private const int MaxRowsToRead = 1000;

    /// <summary>
    /// Produces one <see cref="ColumnDimensions"/> per schema column of <paramref name="data"/>.
    /// </summary>
    /// <param name="data">Data view to inspect; it is first passed through Take(MaxRowsToRead).</param>
    /// <param name="purposes">Inferred purposes, indexed in parallel with the schema columns.</param>
    /// <returns>Array of dimensions in schema order.</returns>
    public static ColumnDimensions[] CalcColumnDimensions(IDataView data, PurposeInference.Column[] purposes)
    {
        data = data.Take(MaxRowsToRead);

        var result = new ColumnDimensions[data.Schema.Count];
        for (var colIndex = 0; colIndex < data.Schema.Count; colIndex++)
        {
            var purpose = purposes[colIndex];
            var cardinality = CalcCardinality(data, colIndex, purpose);
            var hasMissing = CalcHasMissing(data, colIndex);
            result[colIndex] = new ColumnDimensions(cardinality, hasMissing);
        }

        return result;
    }

    // Cardinality is only computed for text columns whose purpose is CategoricalFeature; otherwise null.
    private static int? CalcCardinality(IDataView data, int colIndex, PurposeInference.Column purpose)
    {
        var columnType = data.Schema[colIndex].Type;
        if (columnType.ItemType().IsText() && purpose.Purpose == ColumnPurpose.CategoricalFeature)
        {
            return DatasetDimensionsUtil.GetTextColumnCardinality(data, colIndex);
        }

        return null;
    }

    // Missing-value detection is only run for R4 columns (scalar or vector); otherwise null.
    // todo: upgrade logic to consider R8?
    private static bool? CalcHasMissing(IDataView data, int colIndex)
    {
        var columnType = data.Schema[colIndex].Type;
        if (columnType.ItemType() == NumberType.R4)
        {
            if (columnType.IsVector())
            {
                return DatasetDimensionsUtil.HasMissingNumericVector(data, colIndex);
            }

            return DatasetDimensionsUtil.HasMissingNumericSingleValue(data, colIndex);
        }

        return null;
    }
}
}
62 changes: 62 additions & 0 deletions src/AutoML/DatasetDimensions/DatasetDimensionsUtil.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
using System;
using System.Collections.Generic;
using Microsoft.ML.Data;

namespace Microsoft.ML.Auto
{
/// <summary>
/// Single-column scans over a data view, each driven by a row cursor that
/// activates only the requested column.
/// </summary>
internal static class DatasetDimensionsUtil
{
    /// <summary>
    /// Counts the distinct string values found in the text column at <paramref name="colIndex"/>.
    /// </summary>
    /// <param name="data">Data view to scan.</param>
    /// <param name="colIndex">Index of the text column.</param>
    /// <returns>Number of distinct values seen across all rows the cursor yields.</returns>
    public static int GetTextColumnCardinality(IDataView data, int colIndex)
    {
        var distinctValues = new HashSet<string>();
        using (var rows = data.GetRowCursor(x => x == colIndex))
        {
            var readText = rows.GetGetter<ReadOnlyMemory<char>>(colIndex);
            while (rows.MoveNext())
            {
                var text = default(ReadOnlyMemory<char>);
                readText(ref text);
                distinctValues.Add(text.ToString());
            }
        }

        return distinctValues.Count;
    }

    /// <summary>
    /// Reports whether any row of the scalar Single column at <paramref name="colIndex"/> is NaN.
    /// </summary>
    /// <param name="data">Data view to scan.</param>
    /// <param name="colIndex">Index of the Single-valued column.</param>
    /// <returns>True as soon as a NaN is read; false if the cursor is exhausted first.</returns>
    public static bool HasMissingNumericSingleValue(IDataView data, int colIndex)
    {
        return AnyRowMatches<Single>(data, colIndex, number => Single.IsNaN(number));
    }

    /// <summary>
    /// Reports whether any row of the Single vector column at <paramref name="colIndex"/> contains a NaN.
    /// </summary>
    /// <param name="data">Data view to scan.</param>
    /// <param name="colIndex">Index of the vector column.</param>
    /// <returns>True as soon as a vector with NaNs is read; false if the cursor is exhausted first.</returns>
    public static bool HasMissingNumericVector(IDataView data, int colIndex)
    {
        return AnyRowMatches<VBuffer<Single>>(data, colIndex, vector => VBufferUtils.HasNaNs(vector));
    }

    // Walks the column row by row and stops at the first value accepted by isMatch.
    // The value variable is declared once and refilled by the getter on every row.
    private static bool AnyRowMatches<T>(IDataView data, int colIndex, Func<T, bool> isMatch)
    {
        using (var rows = data.GetRowCursor(x => x == colIndex))
        {
            var readValue = rows.GetGetter<T>(colIndex);
            var current = default(T);
            while (rows.MoveNext())
            {
                readValue(ref current);
                if (isMatch(current))
                {
                    return true;
                }
            }

            return false;
        }
    }
}
}
21 changes: 14 additions & 7 deletions src/AutoML/PipelineSuggesters/PipelineSuggester.cs
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
using System;
using System.Collections.Generic;
using System.Linq;
using Microsoft.ML.Data;

namespace Microsoft.ML.Auto
{
Expand All @@ -13,23 +14,29 @@ internal static class PipelineSuggester
private const int TopKTrainers = 3;

public static Pipeline GetNextPipeline(IEnumerable<PipelineRunResult> history,
IEnumerable<SuggestedTransform> transforms,
IEnumerable<SuggestedTrainer> availableTrainers,
(string, ColumnType, ColumnPurpose, ColumnDimensions)[] columns,
TaskKind task,
int iterationsRemaining,
bool isMaximizingMetric = true)
{
var inferredHistory = history.Select(r => InferredPipelineRunResult.FromPipelineRunResult(r));
var nextInferredPipeline = GetNextInferredPipeline(inferredHistory,
transforms, availableTrainers, isMaximizingMetric);
var nextInferredPipeline = GetNextInferredPipeline(inferredHistory, columns, task, iterationsRemaining, isMaximizingMetric);
return nextInferredPipeline.ToPipeline();
}

public static InferredPipeline GetNextInferredPipeline(IEnumerable<InferredPipelineRunResult> history,
IEnumerable<SuggestedTransform> transforms,
IEnumerable<SuggestedTrainer> availableTrainers,
(string, ColumnType, ColumnPurpose, ColumnDimensions)[] columns,
TaskKind task,
int iterationsRemaining,
bool isMaximizingMetric = true)
{
var context = new MLContext();

var availableTrainers = RecipeInference.AllowedTrainers(context, TaskKind.BinaryClassification, history.Count() + iterationsRemaining);
var transforms = TransformInferenceApi.InferTransforms(context, columns);

// if we haven't run all pipelines once
if(history.Count() < availableTrainers.Count())
if (history.Count() < availableTrainers.Count())
{
return GetNextFirstStagePipeline(history, availableTrainers, transforms);
}
Expand Down
18 changes: 0 additions & 18 deletions src/AutoML/PipelineSuggesters/PipelineSuggesterApi.cs

This file was deleted.

Loading

0 comments on commit eac3695

Please sign in to comment.