diff --git a/src/AutoML/API/AutoFitSettings.cs b/src/AutoML/API/AutoFitSettings.cs
new file mode 100644
index 00000000000..2e329e02cf9
--- /dev/null
+++ b/src/AutoML/API/AutoFitSettings.cs
@@ -0,0 +1,82 @@
+using System.Collections.Generic;
+using System.Diagnostics;
+
+namespace Microsoft.ML.Auto
+{
+    /// <summary>
+    /// Settings passed to the AutoFit extension methods. Only <see cref="StoppingCriteria"/>
+    /// is public; the remaining fields are internal.
+    /// </summary>
+    public class AutoFitSettings
+    {
+        public ExperimentStoppingCriteria StoppingCriteria = new ExperimentStoppingCriteria();
+        internal IterationStoppingCriteria IterationStoppingCriteria;
+        internal Concurrency Concurrency;
+        internal Filters Filters;
+        internal CrossValidationSettings CrossValidationSettings;
+        internal OptimizingMetric OptimizingMetric;
+        internal bool EnableEnsembling;
+        internal bool EnableModelExplainability;
+        internal bool EnableAutoTransformation;
+
+        // spec question: Are the following automatic or a user setting?
+        internal bool EnableSubSampling;
+        internal bool EnableCaching;
+        internal bool ExternalizeTraining;
+        internal TraceLevel TraceLevel; // Should this be controlled through code or appconfig?
+    }
+
+    /// <summary>
+    /// Experiment-level stopping criteria. Public defaults: 100 iterations, 300 minutes.
+    /// </summary>
+    public class ExperimentStoppingCriteria
+    {
+        public int MaxIterations = 100;
+        public int TimeOutInMinutes = 300;
+        internal bool StopAfterConverging;
+        internal double ExperimentExitScore;
+    }
+
+    internal class Filters
+    {
+        // NOTE(review): these IEnumerable fields carry no type argument although only
+        // System.Collections.Generic is imported; the generic arguments look lost - confirm.
+        internal IEnumerable WhitelistTrainers;
+        internal IEnumerable BlackListTrainers;
+        internal IEnumerable WhitelistTransformers;
+        internal IEnumerable BlacklistTransformers;
+        internal bool PreferExplainability;
+        internal bool PreferInferenceSpeed;
+        internal bool PreferSmallDeploymentSize;
+        internal bool PreferSmallMemoryFootprint;
+    }
+
+    public class IterationStoppingCriteria
+    {
+        internal int TimeOutInSeconds;
+        internal bool TerminateOnLowAccuracy;
+    }
+
+    public class Concurrency
+    {
+        internal int MaxConcurrentIterations;
+        internal int MaxCoresPerIteration;
+    }
+
+    // Placeholder: no members are defined yet.
+    internal enum Trainers
+    {
+    }
+
+    // Placeholder: no members are defined yet.
+    internal enum Transformers
+    {
+    }
+
+    internal class CrossValidationSettings
+    {
+        internal int NumberOfFolds;
+        internal int ValidationSizePercentage;
+        internal IEnumerable StratificationColumnNames;
+    }
+}
diff --git a/src/AutoML/API/InferenceException.cs b/src/AutoML/API/InferenceException.cs
new file mode 100644
index 00000000000..b09b7c8ac69
--- /dev/null
+++ b/src/AutoML/API/InferenceException.cs
@@ -0,0 +1,52 @@
+using System;
+
+namespace Microsoft.ML.Auto
+{
+    /// <summary>
+    /// Kinds of inference an <see cref="InferenceException"/> can be tagged with.
+    /// </summary>
+    public enum InferenceType
+    {
+        Seperator, // sic: spelling is part of the public API
+        Header,
+        Label,
+        Task,
+        ColumnDataKind,
+        ColumnPurpose,
+        Tranform, // sic: spelling is part of the public API
+        Trainer,
+        Hyperparams,
+        ColumnSplit
+    }
+
+    /// <summary>
+    /// Exception tagged with the <see cref="InferenceType"/> it was raised for.
+    /// </summary>
+    public class InferenceException : Exception
+    {
+        /// <summary>Inference kind supplied to the constructor.</summary>
+        public InferenceType InferenceType;
+
+        /// <summary>Creates the exception with an inference kind and a message.</summary>
+        /// <param name="inferenceType">Stored in the InferenceType field.</param>
+        /// <param name="message">Forwarded to the base <see cref="Exception"/>.</param>
+        public InferenceException(InferenceType inferenceType, string message)
+            : base(message)
+        {
+            // The argument used to be discarded, so the field always read as the
+            // enum default (Seperator) regardless of what the caller passed.
+            InferenceType = inferenceType;
+        }
+
+        /// <summary>Creates the exception with an inference kind, a message and a cause.</summary>
+        /// <param name="inferenceType">Stored in the InferenceType field.</param>
+        /// <param name="message">Forwarded to the base <see cref="Exception"/>.</param>
+        /// <param name="inner">Forwarded to the base <see cref="Exception"/> as the inner exception.</param>
+        public InferenceException(InferenceType inferenceType, string message, Exception inner)
+            : base(message, inner)
+        {
+            InferenceType = inferenceType;
+        }
+    }
+
+}
diff --git a/src/AutoML/API/MLContextExtensions.cs b/src/AutoML/API/MLContextAutoFitExtensions.cs
similarity index 55%
rename from src/AutoML/API/MLContextExtensions.cs
rename to 
src/AutoML/API/MLContextAutoFitExtensions.cs index 6db0edcebe3..9fad49ef137 100644 --- a/src/AutoML/API/MLContextExtensions.cs +++ b/src/AutoML/API/MLContextAutoFitExtensions.cs @@ -1,8 +1,5 @@ using System; using System.Collections.Generic; -using System.Diagnostics; -using System.IO; -using System.Linq; using System.Threading; using Microsoft.ML.Core.Data; using Microsoft.ML.Data; @@ -12,12 +9,12 @@ namespace Microsoft.ML.Auto public static class RegressionExtensions { public static RegressionResult AutoFit(this RegressionContext context, - IDataView trainData, - string label, - IDataView validationData = null, + IDataView trainData, + string label, + IDataView validationData = null, AutoFitSettings settings = null, IEnumerable<(string, ColumnPurpose)> purposeOverrides = null, - CancellationToken cancellationToken = default, + CancellationToken cancellationToken = default, IProgress iterationCallback = null) { return AutoFit(context, trainData, label, validationData, settings, @@ -25,13 +22,13 @@ public static RegressionResult AutoFit(this RegressionContext context, } // todo: instead of internal methods, use static debug class w/ singleton logger? 
- internal static RegressionResult AutoFit(this RegressionContext context, - IDataView trainData, - string label, - IDataView validationData = null, + internal static RegressionResult AutoFit(this RegressionContext context, + IDataView trainData, + string label, + IDataView validationData = null, AutoFitSettings settings = null, IEnumerable<(string, ColumnPurpose)> purposeOverrides = null, - CancellationToken cancellationToken = default, + CancellationToken cancellationToken = default, IProgress iterationCallback = null, IDebugLogger debugLogger = null) { @@ -61,12 +58,12 @@ public static Pipeline GetPipeline(this RegressionContext context, IDataView dat public static class BinaryClassificationExtensions { public static BinaryClassificationResult AutoFit(this BinaryClassificationContext context, - IDataView trainData, - string label, + IDataView trainData, + string label, IDataView validationData = null, AutoFitSettings settings = null, IEnumerable<(string, ColumnPurpose)> purposeOverrides = null, - CancellationToken cancellationToken = default, + CancellationToken cancellationToken = default, IProgress iterationCallback = null) { return AutoFit(context, trainData, label, validationData, settings, @@ -74,13 +71,13 @@ public static BinaryClassificationResult AutoFit(this BinaryClassificationContex } internal static BinaryClassificationResult AutoFit(this BinaryClassificationContext context, - IDataView trainData, - string label, + IDataView trainData, + string label, IDataView validationData = null, AutoFitSettings settings = null, IEnumerable<(string, ColumnPurpose)> purposeOverrides = null, CancellationToken cancellationToken = default, - IProgress iterationCallback = null, + IProgress iterationCallback = null, IDebugLogger debugLogger = null) { UserInputValidationUtil.ValidateAutoFitArgs(trainData, label, validationData, settings, purposeOverrides); @@ -91,7 +88,7 @@ internal static BinaryClassificationResult AutoFit(this BinaryClassificationCont 
purposeOverrides, debugLogger); var results = new BinaryClassificationItertionResult[allPipelines.Length]; - for(var i = 0; i < results.Length; i++) + for (var i = 0; i < results.Length; i++) { var iterationResult = allPipelines[i]; var result = new BinaryClassificationItertionResult(iterationResult.Model, (BinaryClassificationMetrics)iterationResult.EvaluatedMetrics, iterationResult.ScoredValidationData, iterationResult.Pipeline.ToPipeline()); @@ -110,12 +107,12 @@ public static Pipeline GetPipeline(this BinaryClassificationContext context, IDa public static class MulticlassExtensions { public static MulticlassClassificationResult AutoFit(this MulticlassClassificationContext context, - IDataView trainData, - string label, + IDataView trainData, + string label, IDataView validationData = null, AutoFitSettings settings = null, IEnumerable<(string, ColumnPurpose)> purposeOverrides = null, - CancellationToken cancellationToken = default, + CancellationToken cancellationToken = default, IProgress iterationCallback = null) { return AutoFit(context, trainData, label, validationData, settings, @@ -123,8 +120,8 @@ public static MulticlassClassificationResult AutoFit(this MulticlassClassificati } internal static MulticlassClassificationResult AutoFit(this MulticlassClassificationContext context, - IDataView trainData, - string label, + IDataView trainData, + string label, IDataView validationData = null, AutoFitSettings settings = null, IEnumerable<(string, ColumnPurpose)> purposeOverrides = null, @@ -135,7 +132,7 @@ internal static MulticlassClassificationResult AutoFit(this MulticlassClassifica // run autofit & get all pipelines run in that process var (allPipelines, bestPipeline) = AutoFitApi.Fit(trainData, validationData, label, - settings, TaskKind.MulticlassClassification, OptimizingMetric.Accuracy, + settings, TaskKind.MulticlassClassification, OptimizingMetric.Accuracy, purposeOverrides, debugLogger); var results = new 
MulticlassClassificationIterationResult[allPipelines.Length]; @@ -155,164 +152,6 @@ public static Pipeline GetPipeline(this MulticlassClassificationContext context, } } - public static class TransformExtensions - { - public static IEstimator InferTransforms(this TransformsCatalog catalog, IDataView data, string label) - { - UserInputValidationUtil.ValidateInferTransformArgs(data, label); - var mlContext = new MLContext(); - var suggestedTransforms = TransformInferenceApi.InferTransforms(mlContext, data, label); - var estimators = suggestedTransforms.Select(s => s.Estimator); - var pipeline = new EstimatorChain(); - foreach(var estimator in estimators) - { - pipeline = pipeline.Append(estimator); - } - return pipeline; - } - } - - public static class DataExtensions - { - // Delimiter, header, column datatype inference - public static ColumnInferenceResult InferColumns(this DataOperations catalog, string path, string label, - bool hasHeader = false, char? separatorChar = null, bool? allowQuotedStrings = null, bool? supportSparse = null, bool trimWhitespace = false) - { - UserInputValidationUtil.ValidateInferColumnsArgs(path, label); - var mlContext = new MLContext(); - return ColumnInferenceApi.InferColumns(mlContext, path, label, hasHeader, separatorChar, allowQuotedStrings, supportSparse, trimWhitespace); - } - - public static IDataView AutoRead(this DataOperations catalog, string path, string label, - bool hasHeader = false, char? separatorChar = null, bool? allowQuotedStrings = null, bool? 
supportSparse = null, bool trimWhitespace = false) - { - UserInputValidationUtil.ValidateAutoReadArgs(path, label); - var mlContext = new MLContext(); - var columnInferenceResult = ColumnInferenceApi.InferColumns(mlContext, path, label, hasHeader, separatorChar, allowQuotedStrings, supportSparse, trimWhitespace); - var textLoader = columnInferenceResult.BuildTextLoader(); - return textLoader.Read(path); - } - - public static TextLoader CreateTextReader(this DataOperations catalog, ColumnInferenceResult columnInferenceResult) - { - UserInputValidationUtil.ValidateCreateTextReaderArgs(columnInferenceResult); - return columnInferenceResult.BuildTextLoader(); - } - - // Task inference - public static MachineLearningTaskType InferTask(this DataOperations catalog, IDataView dataView) - { - throw new NotImplementedException(); - } - - public enum MachineLearningTaskType - { - Regression, - BinaryClassification, - MultiClassClassification - } - } - - public class ColumnInferenceResult - { - public readonly IEnumerable<(TextLoader.Column, ColumnPurpose)> Columns; - public readonly bool AllowQuotedStrings; - public readonly bool SupportSparse; - public readonly string Separator; - public readonly bool HasHeader; - public readonly bool TrimWhitespace; - - public ColumnInferenceResult(IEnumerable<(TextLoader.Column, ColumnPurpose)> columns, - bool allowQuotedStrings, bool supportSparse, string separator, bool hasHeader, bool trimWhitespace) - { - Columns = columns; - AllowQuotedStrings = allowQuotedStrings; - SupportSparse = supportSparse; - Separator = separator; - HasHeader = hasHeader; - TrimWhitespace = trimWhitespace; - } - - internal TextLoader BuildTextLoader() - { - var context = new MLContext(); - return new TextLoader(context, new TextLoader.Arguments() { - AllowQuoting = AllowQuotedStrings, - AllowSparse = SupportSparse, - Column = Columns.Select(c => c.Item1).ToArray(), - Separator = Separator, - HasHeader = HasHeader, - TrimWhitespace = TrimWhitespace - }); - } - 
} - - public class AutoFitSettings - { - public ExperimentStoppingCriteria StoppingCriteria = new ExperimentStoppingCriteria(); - internal IterationStoppingCriteria IterationStoppingCriteria; - internal Concurrency Concurrency; - internal Filters Filters; - internal CrossValidationSettings CrossValidationSettings; - internal OptimizingMetric OptimizingMetric; - internal bool EnableEnsembling; - internal bool EnableModelExplainability; - internal bool EnableAutoTransformation; - - // spec question: Are following automatic or a user setting? - internal bool EnableSubSampling; - internal bool EnableCaching; - internal bool ExternalizeTraining; - internal TraceLevel TraceLevel; // Should this be controlled through code or appconfig? - } - - public class ExperimentStoppingCriteria - { - public int MaxIterations = 100; - public int TimeOutInMinutes = 300; - internal bool StopAfterConverging; - internal double ExperimentExitScore; - } - - internal class Filters - { - internal IEnumerable WhitelistTrainers; - internal IEnumerable BlackListTrainers; - internal IEnumerable WhitelistTransformers; - internal IEnumerable BlacklistTransformers; - internal bool PreferExplainability; - internal bool PreferInferenceSpeed; - internal bool PreferSmallDeploymentSize; - internal bool PreferSmallMemoryFootprint; - } - - public class IterationStoppingCriteria - { - internal int TimeOutInSeconds; - internal bool TerminateOnLowAccuracy; - } - - public class Concurrency - { - internal int MaxConcurrentIterations; - internal int MaxCoresPerIteration; - } - - internal enum Trainers - { - } - - internal enum Transformers - { - } - - internal class CrossValidationSettings - { - internal int NumberOfFolds; - internal int ValidationSizePercentage; - internal IEnumerable StratificationColumnNames; - } - public class BinaryClassificationResult { public readonly BinaryClassificationItertionResult BestPipeline; @@ -399,69 +238,4 @@ public RegressionIterationResult(ITransformer model, 
RegressionMetrics metrics,
             Pipeline = pipeline;
         }
     }
-
-    public enum InferenceType
-    {
-        Seperator,
-        Header,
-        Label,
-        Task,
-        ColumnDataKind,
-        ColumnPurpose,
-        Tranform,
-        Trainer,
-        Hyperparams,
-        ColumnSplit
-    }
-
-    public class InferenceException : Exception
-    {
-        public InferenceType InferenceType;
-
-        public InferenceException(InferenceType inferenceType, string message)
-            : base(message)
-        {
-        }
-
-        public InferenceException(InferenceType inferenceType, string message, Exception inner)
-            : base(message, inner)
-        {
-        }
-    }
-
-    public class Pipeline
-    {
-        public readonly PipelineNode[] Elements;
-
-        public Pipeline(PipelineNode[] elements)
-        {
-            Elements = elements;
-        }
-    }
-
-    public class PipelineNode
-    {
-        public readonly string Name;
-        public readonly PipelineNodeType ElementType;
-        public readonly string[] InColumns;
-        public readonly string[] OutColumns;
-        public readonly IDictionary Properties;
-
-        public PipelineNode(string name, PipelineNodeType elementType,
-            string[] inColumns, string[] outColumns,
-            IDictionary properties)
-        {
-            Name = name;
-            ElementType = elementType;
-            InColumns = inColumns;
-            OutColumns = outColumns;
-            Properties = properties;
-        }
-    }
-
-    public enum PipelineNodeType
-    {
-        Transform,
-        Trainer
-    }
 }
diff --git a/src/AutoML/API/MLContextDataExtensions.cs b/src/AutoML/API/MLContextDataExtensions.cs
new file mode 100644
index 00000000000..04c9ac3d987
--- /dev/null
+++ b/src/AutoML/API/MLContextDataExtensions.cs
@@ -0,0 +1,103 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using Microsoft.ML.Data;
+
+namespace Microsoft.ML.Auto
+{
+    /// <summary>
+    /// Extension methods on <see cref="DataOperations"/> for column inference and text loading.
+    /// </summary>
+    public static class DataExtensions
+    {
+        /// <summary>
+        /// Delimiter, header and column data type inference: validates the path and label, then
+        /// calls ColumnInferenceApi.InferColumns with a newly created MLContext.
+        /// </summary>
+        public static ColumnInferenceResult InferColumns(this DataOperations catalog, string path, string label,
+            bool hasHeader = false, char? separatorChar = null, bool? allowQuotedStrings = null, bool? supportSparse = null, bool trimWhitespace = false)
+        {
+            UserInputValidationUtil.ValidateInferColumnsArgs(path, label);
+            return ColumnInferenceApi.InferColumns(new MLContext(), path, label,
+                hasHeader, separatorChar, allowQuotedStrings, supportSparse, trimWhitespace);
+        }
+
+        /// <summary>
+        /// Infers the columns of the file at <paramref name="path"/>, builds a text loader from
+        /// the result and reads the same file with it.
+        /// </summary>
+        public static IDataView AutoRead(this DataOperations catalog, string path, string label,
+            bool hasHeader = false, char? separatorChar = null, bool? allowQuotedStrings = null, bool? supportSparse = null, bool trimWhitespace = false)
+        {
+            UserInputValidationUtil.ValidateAutoReadArgs(path, label);
+            var inferred = ColumnInferenceApi.InferColumns(new MLContext(), path, label,
+                hasHeader, separatorChar, allowQuotedStrings, supportSparse, trimWhitespace);
+            return inferred.BuildTextLoader().Read(path);
+        }
+
+        /// <summary>
+        /// Validates <paramref name="columnInferenceResult"/> and returns the loader it builds.
+        /// </summary>
+        public static TextLoader CreateTextReader(this DataOperations catalog, ColumnInferenceResult columnInferenceResult)
+        {
+            UserInputValidationUtil.ValidateCreateTextReaderArgs(columnInferenceResult);
+            var loader = columnInferenceResult.BuildTextLoader();
+            return loader;
+        }
+
+        /// <summary>Task inference; not implemented yet and always throws.</summary>
+        public static MachineLearningTaskType InferTask(this DataOperations catalog, IDataView dataView)
+            => throw new NotImplementedException();
+
+        public enum MachineLearningTaskType
+        {
+            Regression,
+            BinaryClassification,
+            MultiClassClassification
+        }
+    }
+
+    /// <summary>
+    /// Column inference output: the columns with their purposes plus the text-format flags
+    /// handed to the constructor.
+    /// </summary>
+    public class ColumnInferenceResult
+    {
+        public readonly IEnumerable<(TextLoader.Column, ColumnPurpose)> Columns;
+        public readonly bool AllowQuotedStrings;
+        public readonly bool SupportSparse;
+        public readonly string Separator;
+        public readonly bool HasHeader;
+        public readonly bool TrimWhitespace;
+
+        /// <summary>Stores every argument in the matching field.</summary>
+        public ColumnInferenceResult(IEnumerable<(TextLoader.Column, ColumnPurpose)> columns,
+            bool allowQuotedStrings, bool supportSparse, string separator, bool hasHeader, bool trimWhitespace)
+        {
+            TrimWhitespace = trimWhitespace;
+            HasHeader = hasHeader;
+            Separator = separator;
+            SupportSparse = supportSparse;
+            AllowQuotedStrings = allowQuotedStrings;
+            Columns = columns;
+        }
+
+        /// <summary>
+        /// Creates a TextLoader on a new MLContext, configured from this result's fields.
+        /// </summary>
+        internal TextLoader BuildTextLoader()
+        {
+            var env = new MLContext();
+            var loaderArgs = new TextLoader.Arguments()
+            {
+                AllowQuoting = AllowQuotedStrings,
+                AllowSparse = SupportSparse,
+                Column = Columns.Select(pair => pair.Item1).ToArray(),
+                Separator = Separator,
+                HasHeader = HasHeader,
+                TrimWhitespace = TrimWhitespace
+            };
+            return new TextLoader(env, loaderArgs);
+        }
+    }
+}
diff --git a/src/AutoML/API/Pipeline.cs b/src/AutoML/API/Pipeline.cs
new file mode 100644
index 00000000000..d20d5305e18
--- /dev/null
+++ b/src/AutoML/API/Pipeline.cs
@@ -0,0 +1,44 @@
+using System.Collections.Generic;
+
+namespace Microsoft.ML.Auto
+{
+    /// <summary>A pipeline described as an array of <see cref="PipelineNode"/> elements.</summary>
+    public class Pipeline
+    {
+        public readonly PipelineNode[] Elements;
+
+        /// <summary>Stores <paramref name="elements"/> as given (no copy is made).</summary>
+        public Pipeline(PipelineNode[] elements) => Elements = elements;
+    }
+
+    /// <summary>
+    /// One pipeline element: its name, node type, input/output column names and properties.
+    /// </summary>
+    public class PipelineNode
+    {
+        public readonly string Name;
+        public readonly PipelineNodeType ElementType;
+        public readonly string[] InColumns;
+        public readonly string[] OutColumns;
+        public readonly IDictionary Properties;
+
+        /// <summary>Stores every argument in the matching field.</summary>
+        public PipelineNode(string name, PipelineNodeType elementType,
+            string[] inColumns, string[] outColumns,
+            IDictionary properties)
+        {
+            Properties = properties;
+            OutColumns = outColumns;
+            InColumns = inColumns;
+            ElementType = elementType;
+            Name = name;
+        }
+    }
+
+    /// <summary>Node type of a <see cref="PipelineNode"/>: transform or trainer.</summary>
+    public enum PipelineNodeType
+    {
+        Transform,
+        Trainer
+    }
+}
diff --git a/src/AutoML/API/UserInputValidationUtil.cs b/src/AutoML/Utils/UserInputValidationUtil.cs
similarity index 100%
rename from src/AutoML/API/UserInputValidationUtil.cs
rename to src/AutoML/Utils/UserInputValidationUtil.cs