From 2f8099a1f26ab42719a7c8327df94389c120ab18 Mon Sep 17 00:00:00 2001 From: Shahab Moradi Date: Wed, 17 Oct 2018 11:31:00 -0700 Subject: [PATCH 01/28] Added placeholder --- src/Microsoft.ML.PCA/PcaTransformer.cs | 565 +++++++++++++++++++++++++ 1 file changed, 565 insertions(+) create mode 100644 src/Microsoft.ML.PCA/PcaTransformer.cs diff --git a/src/Microsoft.ML.PCA/PcaTransformer.cs b/src/Microsoft.ML.PCA/PcaTransformer.cs new file mode 100644 index 0000000000..85a8573cee --- /dev/null +++ b/src/Microsoft.ML.PCA/PcaTransformer.cs @@ -0,0 +1,565 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using Float = System.Single; + +using System; +using System.Linq; +using System.Text; +using Microsoft.ML.Runtime; +using Microsoft.ML.Runtime.CommandLine; +using Microsoft.ML.Runtime.Data; +using Microsoft.ML.Runtime.EntryPoints; +using Microsoft.ML.Runtime.Internal.CpuMath; +using Microsoft.ML.Runtime.Internal.Utilities; +using Microsoft.ML.Runtime.Model; +using Microsoft.ML.Runtime.Numeric; + +[assembly: LoadableClass(PcaTransformer.Summary, typeof(PcaTransformer), typeof(PcaTransformer.Arguments), typeof(SignatureDataTransform), + PcaTransformer.UserName, PcaTransformer.LoaderSignature, PcaTransformer.ShortName)] + +[assembly: LoadableClass(PcaTransformer.Summary, typeof(PcaTransformer), null, typeof(SignatureLoadDataTransform), + PcaTransformer.UserName, PcaTransformer.LoaderSignature)] + +[assembly: LoadableClass(typeof(void), typeof(PcaTransformer), null, typeof(SignatureEntryPointModule), PcaTransformer.LoaderSignature)] + +namespace Microsoft.ML.Runtime.Data +{ + /// + public sealed class PcaTransformer : OneToOneTransformBase + { + internal static class Defaults + { + public const string WeightColumn = null; + public const int Rank = 20; + public const int Oversampling = 20; + public const bool Center = 
true; + public const int Seed = 0; + } + + public sealed class Arguments : TransformInputBase + { + [Argument(ArgumentType.Multiple | ArgumentType.Required, HelpText = "New column definition(s) (optional form: name:src)", ShortName = "col", SortOrder = 1)] + public Column[] Column; + + [Argument(ArgumentType.Multiple, HelpText = "The name of the weight column", ShortName = "weight", Purpose = SpecialPurpose.ColumnName)] + public string WeightColumn = Defaults.WeightColumn; + + [Argument(ArgumentType.AtMostOnce, HelpText = "The number of components in the PCA", ShortName = "k")] + public int Rank = Defaults.Rank; + + [Argument(ArgumentType.AtMostOnce, HelpText = "Oversampling parameter for randomized PCA training", ShortName = "over")] + public int Oversampling = Defaults.Oversampling; + + [Argument(ArgumentType.AtMostOnce, HelpText = "If enabled, data is centered to be zero mean")] + public bool Center = Defaults.Center; + + [Argument(ArgumentType.AtMostOnce, HelpText = "The seed for random number generation")] + public int Seed = Defaults.Seed; + } + + public class Column : OneToOneColumn + { + [Argument(ArgumentType.Multiple, HelpText = "The name of the weight column", ShortName = "weight")] + public string WeightColumn; + + [Argument(ArgumentType.AtMostOnce, HelpText = "The number of components in the PCA", ShortName = "k")] + public int? Rank; + + [Argument(ArgumentType.AtMostOnce, HelpText = "Oversampling parameter for randomized PCA training", ShortName = "over")] + public int? Oversampling; + + [Argument(ArgumentType.AtMostOnce, HelpText = "If enabled, data is centered to be zero mean", ShortName = "center")] + public bool? Center; + + [Argument(ArgumentType.AtMostOnce, HelpText = "The seed for random number generation", ShortName = "seed")] + public int? 
Seed; + + public static Column Parse(string str) + { + Contracts.AssertNonEmpty(str); + + var res = new Column(); + if (res.TryParse(str)) + return res; + return null; + } + + public bool TryUnparse(StringBuilder sb) + { + Contracts.AssertValue(sb); + if (!string.IsNullOrEmpty(WeightColumn) || Rank != null || Oversampling != null || + Center != null || Seed != null) + { + return false; + } + return TryUnparseCore(sb); + } + } + + private sealed class TransformInfo + { + public readonly int Dimension; + public readonly int Rank; + + public Float[][] Eigenvectors; + public Float[] MeanProjected; + + public TransformInfo(Column item, Arguments args, int d) + { + Dimension = d; + Rank = item.Rank ?? args.Rank; + Contracts.CheckUserArg(0 < Rank && Rank <= Dimension, nameof(item.Rank), "Rank must be positive, and at most the dimension of untransformed data"); + } + + public TransformInfo(ModelLoadContext ctx, int colValueCount) + { + Contracts.AssertValue(ctx); + + // *** Binary format *** + // int: Dimension + // int: Rank + // for i=0,..,Rank-1: + // Float[]: the i'th eigenvector + // int: the size of MeanProjected (0 if it is null) + // Float[]: MeanProjected + + Dimension = ctx.Reader.ReadInt32(); + Contracts.CheckDecode(Dimension == colValueCount); + + Rank = ctx.Reader.ReadInt32(); + Contracts.CheckDecode(0 < Rank && Rank <= Dimension); + + Eigenvectors = new Float[Rank][]; + for (int i = 0; i < Rank; i++) + { + Eigenvectors[i] = ctx.Reader.ReadFloatArray(Dimension); + Contracts.CheckDecode(FloatUtils.IsFinite(Eigenvectors[i], Eigenvectors[i].Length)); + } + + MeanProjected = ctx.Reader.ReadFloatArray(); + Contracts.CheckDecode(MeanProjected == null || (MeanProjected.Length == Rank && FloatUtils.IsFinite(MeanProjected, MeanProjected.Length))); + } + + public void Save(ModelSaveContext ctx) + { + Contracts.AssertValue(ctx); + + // *** Binary format *** + // int: Dimension + // int: Rank + // for i=0,..,Rank-1: + // Float[]: the i'th eigenvector + // int: the size of 
MeanProjected (0 if it is null) + // Float[]: MeanProjected + + Contracts.Assert(0 < Rank && Rank <= Dimension); + ctx.Writer.Write(Dimension); + ctx.Writer.Write(Rank); + for (int i = 0; i < Rank; i++) + { + Contracts.Assert(FloatUtils.IsFinite(Eigenvectors[i], Eigenvectors[i].Length)); + ctx.Writer.WriteFloatsNoCount(Eigenvectors[i], Dimension); + } + Contracts.Assert(MeanProjected == null || (MeanProjected.Length == Rank && FloatUtils.IsFinite(MeanProjected, Rank))); + ctx.Writer.WriteFloatArray(MeanProjected); + } + + internal void ProjectMean(Float[] mean) + { + Contracts.AssertValue(Eigenvectors); + if (mean == null) + { + MeanProjected = null; + return; + } + + MeanProjected = new Float[Rank]; + for (var i = 0; i < Rank; ++i) + MeanProjected[i] = VectorUtils.DotProduct(Eigenvectors[i], mean); + } + } + + internal const string Summary = "PCA is a dimensionality-reduction transform which computes the projection of a numeric vector onto a low-rank subspace."; + internal const string UserName = "Principal Component Analysis Transform"; + internal const string ShortName = "Pca"; + + public const string LoaderSignature = "PcaTransformer"; + private static VersionInfo GetVersionInfo() + { + return new VersionInfo( + modelSignature: "PCA FUNC", + verWrittenCur: 0x00010001, // Initial + verReadableCur: 0x00010001, + verWeCanReadBack: 0x00010001, + loaderSignature: LoaderSignature, + loaderAssemblyName: typeof(PcaTransformer).Assembly.FullName); + } + + // These are parallel to Infos. + private readonly ColumnType[] _types; + private readonly TransformInfo[] _transformInfos; + + private readonly int[] _oversampling; + private readonly bool[] _center; + private readonly int[] _weightColumnIndex; + + private const string RegistrationName = "Pca"; + + /// + /// Public constructor corresponding to SignatureDataTransform. 
+ /// + public PcaTransformer(IHostEnvironment env, Arguments args, IDataView input) + : base(env, RegistrationName, Contracts.CheckRef(args, nameof(args)).Column, + input, TestIsFloatItem) + { + Host.AssertNonEmpty(Infos); + Host.Assert(Infos.Length == Utils.Size(args.Column)); + + _transformInfos = new TransformInfo[args.Column.Length]; + _oversampling = new int[args.Column.Length]; + _center = new bool[args.Column.Length]; + _weightColumnIndex = new int[args.Column.Length]; + for (int i = 0; i < _transformInfos.Length; i++) + { + Host.Check(Infos[i].TypeSrc.VectorSize > 1, "Pca transform can only be applied to columns with known dimensionality greater than 1"); + _transformInfos[i] = new TransformInfo(args.Column[i], args, Infos[i].TypeSrc.ValueCount); + _center[i] = args.Column[i].Center ?? args.Center; + _oversampling[i] = args.Column[i].Oversampling ?? args.Oversampling; + Host.CheckUserArg(_oversampling[i] >= 0, nameof(args.Oversampling), "Oversampling must be non-negative"); + _weightColumnIndex[i] = -1; + var weightColumn = args.Column[i].WeightColumn ?? 
args.WeightColumn; + if (weightColumn != null) + { + if (!Source.Schema.TryGetColumnIndex(weightColumn, out _weightColumnIndex[i])) + throw Host.Except("weight column '{0}' does not exist", weightColumn); + var type = Source.Schema.GetColumnType(_weightColumnIndex[i]); + Host.CheckUserArg(type == NumberType.Float, nameof(args.WeightColumn)); + } + } + + Train(args, _transformInfos, input); + + _types = InitColumnTypes(); + } + + private PcaTransformer(IHost host, ModelLoadContext ctx, IDataView input) + : base(host, ctx, input, TestIsFloatItem) + { + Host.AssertValue(ctx); + + // *** Binary format *** + // + // + // transformInfos + Host.AssertNonEmpty(Infos); + _transformInfos = new TransformInfo[Infos.Length]; + for (int i = 0; i < Infos.Length; i++) + _transformInfos[i] = new TransformInfo(ctx, Infos[i].TypeSrc.ValueCount); + _types = InitColumnTypes(); + } + + public static PcaTransformer Create(IHostEnvironment env, ModelLoadContext ctx, IDataView input) + { + Contracts.CheckValue(env, nameof(env)); + var h = env.Register(RegistrationName); + h.CheckValue(ctx, nameof(ctx)); + h.CheckValue(input, nameof(input)); + ctx.CheckAtModel(GetVersionInfo()); + + // *** Binary format *** + // int: sizeof(Float) + // + int cbFloat = ctx.Reader.ReadInt32(); + h.CheckDecode(cbFloat == sizeof(Float)); + return h.Apply("Loading Model", ch => new PcaTransformer(h, ctx, input)); + } + + public override void Save(ModelSaveContext ctx) + { + Host.CheckValue(ctx, nameof(ctx)); + ctx.CheckAtModel(); + ctx.SetVersionInfo(GetVersionInfo()); + + // *** Binary format *** + // int: sizeof(Float) + // + // transformInfos + ctx.Writer.Write(sizeof(Float)); + SaveBase(ctx); + for (int i = 0; i < _transformInfos.Length; i++) + _transformInfos[i].Save(ctx); + } + + private void Train(Arguments args, TransformInfo[] transformInfos, IDataView trainingData) + { + var y = new Float[transformInfos.Length][][]; + var omega = new Float[transformInfos.Length][][]; + var mean = new 
Float[transformInfos.Length][]; + + var oversampledRank = new int[transformInfos.Length]; + var rnd = Host.Rand; + Double totalMemoryUsageEstimate = 0; + for (int iinfo = 0; iinfo < transformInfos.Length; iinfo++) + { + oversampledRank[iinfo] = Math.Min(transformInfos[iinfo].Rank + _oversampling[iinfo], transformInfos[iinfo].Dimension); + + //exact: (size of the 2 big matrices + other minor allocations) / (2^30) + Double colMemoryUsageEstimate = 2.0 * transformInfos[iinfo].Dimension * oversampledRank[iinfo] * sizeof(Float) / 1e9; + totalMemoryUsageEstimate += colMemoryUsageEstimate; + if (colMemoryUsageEstimate > 2) + { + using (var ch = Host.Start("Memory usage")) + { + ch.Info("Estimate memory usage for transforming column {1}: {0:G2} GB. If running out of memory, reduce rank and oversampling factor.", + colMemoryUsageEstimate, Infos[iinfo].Name); + } + } + + y[iinfo] = new Float[oversampledRank[iinfo]][]; + omega[iinfo] = new Float[oversampledRank[iinfo]][]; + for (int i = 0; i < oversampledRank[iinfo]; i++) + { + y[iinfo][i] = new Float[transformInfos[iinfo].Dimension]; + omega[iinfo][i] = new Float[transformInfos[iinfo].Dimension]; + for (int j = 0; j < transformInfos[iinfo].Dimension; j++) + { + omega[iinfo][i][j] = (Float)Stats.SampleFromGaussian(rnd); + } + } + + if (_center[iinfo]) + mean[iinfo] = new Float[transformInfos[iinfo].Dimension]; + } + if (totalMemoryUsageEstimate > 2) + { + using (var ch = Host.Start("Memory usage")) + { + ch.Info("Estimate memory usage for all PCA transforms: {0:G2} GB. 
If running out of memory, reduce ranks and oversampling factors.", + totalMemoryUsageEstimate); + } + } + + Project(trainingData, mean, omega, y, transformInfos); + + for (int iinfo = 0; iinfo < transformInfos.Length; iinfo++) + { + //Orthonormalize Y in-place using stabilized Gram Schmidt algorithm + //Ref: https://en.wikipedia.org/wiki/Gram-Schmidt#Algorithm + for (var i = 0; i < oversampledRank[iinfo]; ++i) + { + var v = y[iinfo][i]; + VectorUtils.ScaleBy(v, 1 / VectorUtils.Norm(y[iinfo][i])); // normalize + + // Make the next vectors in the queue orthogonal to the orthonormalized vectors + for (var j = i + 1; j < oversampledRank[iinfo]; ++j) + VectorUtils.AddMult(v, y[iinfo][j], -VectorUtils.DotProduct(v, y[iinfo][j])); //subtract the projection of y[j] on v + } + } + var q = y; // q in QR decomposition + + var b = omega; // reuse the memory allocated by Omega + Project(trainingData, mean, q, b, transformInfos); + + for (int iinfo = 0; iinfo < transformInfos.Length; iinfo++) + { + //Compute B2 = B' * B + var b2 = new Float[oversampledRank[iinfo] * oversampledRank[iinfo]]; + for (var i = 0; i < oversampledRank[iinfo]; ++i) + { + for (var j = i; j < oversampledRank[iinfo]; ++j) + b2[i * oversampledRank[iinfo] + j] = b2[j * oversampledRank[iinfo] + i] = VectorUtils.DotProduct(b[iinfo][i], b[iinfo][j]); + } + + Float[] smallEigenvalues; // eigenvectors and eigenvalues of the small matrix B2. 
+ Float[] smallEigenvectors; + + EigenUtils.EigenDecomposition(b2, out smallEigenvalues, out smallEigenvectors); + transformInfos[iinfo].Eigenvectors = PostProcess(b[iinfo], smallEigenvalues, smallEigenvectors, transformInfos[iinfo].Dimension, oversampledRank[iinfo]); + transformInfos[iinfo].ProjectMean(mean[iinfo]); + } + } + + //Project the covariance matrix A on to Omega: Y <- A * Omega + //A = X' * X / n, where X = data - mean + //Note that the covariance matrix is not computed explicitly + private void Project(IDataView trainingData, Float[][] mean, Float[][][] omega, Float[][][] y, TransformInfo[] transformInfos) + { + Host.Assert(mean.Length == omega.Length && omega.Length == y.Length && y.Length == Infos.Length); + for (int i = 0; i < omega.Length; i++) + Contracts.Assert(omega[i].Length == y[i].Length); + + // set y to be all zeros + for (int iinfo = 0; iinfo < y.Length; iinfo++) + { + for (int i = 0; i < y[iinfo].Length; i++) + Array.Clear(y[iinfo][i], 0, y[iinfo][i].Length); + } + + bool[] center = Enumerable.Range(0, mean.Length).Select(i => mean[i] != null).ToArray(); + + Double[] totalColWeight = new Double[Infos.Length]; + + bool[] activeColumns = new bool[Source.Schema.ColumnCount]; + for (int iinfo = 0; iinfo < Infos.Length; iinfo++) + { + activeColumns[Infos[iinfo].Source] = true; + if (_weightColumnIndex[iinfo] >= 0) + activeColumns[_weightColumnIndex[iinfo]] = true; + } + using (var cursor = trainingData.GetRowCursor(col => activeColumns[col])) + { + var weightGetters = new ValueGetter[Infos.Length]; + var columnGetters = new ValueGetter>[Infos.Length]; + for (int iinfo = 0; iinfo < Infos.Length; iinfo++) + { + if (_weightColumnIndex[iinfo] >= 0) + weightGetters[iinfo] = cursor.GetGetter(_weightColumnIndex[iinfo]); + columnGetters[iinfo] = cursor.GetGetter>(Infos[iinfo].Source); + } + + var features = default(VBuffer); + while (cursor.MoveNext()) + { + for (int iinfo = 0; iinfo < Infos.Length; iinfo++) + { + 
Contracts.Check(Infos[iinfo].TypeSrc.IsVector && Infos[iinfo].TypeSrc.ItemType.IsNumber, + "PCA transform can only be performed on numeric columns of dimension > 1"); + + Float weight = 1; + if (weightGetters[iinfo] != null) + weightGetters[iinfo](ref weight); + columnGetters[iinfo](ref features); + + if (FloatUtils.IsFinite(weight) && weight >= 0 && (features.Count == 0 || FloatUtils.IsFinite(features.Values, features.Count))) + { + totalColWeight[iinfo] += weight; + + if (center[iinfo]) + VectorUtils.AddMult(ref features, mean[iinfo], weight); + + for (int i = 0; i < omega[iinfo].Length; i++) + VectorUtils.AddMult(ref features, y[iinfo][i], weight * VectorUtils.DotProductWithOffset(omega[iinfo][i], 0, ref features)); + } + } + } + + for (int iinfo = 0; iinfo < Infos.Length; iinfo++) + { + if (totalColWeight[iinfo] <= 0) + throw Host.Except("Empty data in column '{0}'", Source.Schema.GetColumnName(Infos[iinfo].Source)); + } + + for (int iinfo = 0; iinfo < Infos.Length; iinfo++) + { + var invn = (Float)(1 / totalColWeight[iinfo]); + + for (var i = 0; i < omega[iinfo].Length; ++i) + VectorUtils.ScaleBy(y[iinfo][i], invn); + + if (center[iinfo]) + { + VectorUtils.ScaleBy(mean[iinfo], invn); + for (int i = 0; i < omega[iinfo].Length; i++) + VectorUtils.AddMult(mean[iinfo], y[iinfo][i], -VectorUtils.DotProduct(omega[iinfo][i], mean[iinfo])); + } + } + } + } + + //return Y * eigenvectors / eigenvalues + // REVIEW: improve + private Float[][] PostProcess(Float[][] y, Float[] sigma, Float[] z, int d, int k) + { + var pinv = new Float[k]; + var tmp = new Float[k]; + + for (int i = 0; i < k; i++) + pinv[i] = (Float)(1.0) / ((Float)(1e-6) + sigma[i]); + + for (int i = 0; i < d; i++) + { + for (int j = 0; j < k; j++) + { + tmp[j] = 0; + for (int l = 0; l < k; l++) + tmp[j] += y[l][i] * z[j * k + l]; + } + for (int j = 0; j < k; j++) + y[j][i] = pinv[j] * tmp[j]; + } + + return y; + } + + private ColumnType[] InitColumnTypes() + { + Host.Assert(Infos.Length == 
_transformInfos.Length); + var types = new ColumnType[Infos.Length]; + for (int i = 0; i < _transformInfos.Length; i++) + types[i] = new VectorType(NumberType.Float, _transformInfos[i].Rank); + Metadata.Seal(); + return types; + } + + protected override ColumnType GetColumnTypeCore(int iinfo) + { + Host.Check(0 <= iinfo & iinfo < Utils.Size(_types)); + return _types[iinfo]; + } + + protected override Delegate GetGetterCore(IChannel ch, IRow input, int iinfo, out Action disposer) + { + Host.AssertValueOrNull(ch); + Host.AssertValue(input); + Host.Assert(0 <= iinfo && iinfo < Infos.Length); + disposer = null; + + var getSrc = GetSrcGetter>(input, iinfo); + var src = default(VBuffer); + var trInfo = _transformInfos[iinfo]; + ValueGetter> del = + (ref VBuffer dst) => + { + getSrc(ref src); + TransformFeatures(Host, ref src, ref dst, trInfo); + }; + return del; + } + + private static void TransformFeatures(IExceptionContext ectx, ref VBuffer src, ref VBuffer dst, TransformInfo transformInfo) + { + ectx.Check(src.Length == transformInfo.Dimension); + + var values = dst.Values; + if (Utils.Size(values) < transformInfo.Rank) + values = new Float[transformInfo.Rank]; + + for (int i = 0; i < transformInfo.Rank; i++) + { + values[i] = VectorUtils.DotProductWithOffset(transformInfo.Eigenvectors[i], 0, ref src) - + (transformInfo.MeanProjected == null ? 
0 : transformInfo.MeanProjected[i]); + } + + dst = new VBuffer(transformInfo.Rank, values, dst.Indices); + } + + [TlcModule.EntryPoint(Name = "Transforms.PcaCalculator", + Desc = Summary, + UserName = UserName, + ShortName = ShortName, + XmlInclude = new[] { @"", + @""})] + public static CommonOutputs.TransformOutput Calculate(IHostEnvironment env, Arguments input) + { + var h = EntryPointUtils.CheckArgsAndCreateHost(env, "Pca", input); + var view = new PcaTransformer(h, input, input.Data); + return new CommonOutputs.TransformOutput() + { + Model = new TransformModel(h, view, input.Data), + OutputData = view + }; + } + } +} From b952053b842aeba38a7fe8e0c760965ce9765f14 Mon Sep 17 00:00:00 2001 From: Shahab Moradi Date: Wed, 17 Oct 2018 14:58:06 -0700 Subject: [PATCH 02/28] Cleaned up Infos (replaced with ColumnPairs) --- src/Microsoft.ML.PCA/PcaTransformer.cs | 123 +++++++++++++++---------- 1 file changed, 73 insertions(+), 50 deletions(-) diff --git a/src/Microsoft.ML.PCA/PcaTransformer.cs b/src/Microsoft.ML.PCA/PcaTransformer.cs index 85a8573cee..fa18484703 100644 --- a/src/Microsoft.ML.PCA/PcaTransformer.cs +++ b/src/Microsoft.ML.PCA/PcaTransformer.cs @@ -16,10 +16,16 @@ using Microsoft.ML.Runtime.Model; using Microsoft.ML.Runtime.Numeric; -[assembly: LoadableClass(PcaTransformer.Summary, typeof(PcaTransformer), typeof(PcaTransformer.Arguments), typeof(SignatureDataTransform), +[assembly: LoadableClass(PcaTransformer.Summary, typeof(IDataTransform), typeof(PcaTransformer), typeof(PcaTransformer.Arguments), typeof(SignatureDataTransform), PcaTransformer.UserName, PcaTransformer.LoaderSignature, PcaTransformer.ShortName)] -[assembly: LoadableClass(PcaTransformer.Summary, typeof(PcaTransformer), null, typeof(SignatureLoadDataTransform), +[assembly: LoadableClass(PcaTransformer.Summary, typeof(IDataTransform), typeof(PcaTransformer), null, typeof(SignatureLoadDataTransform), + PcaTransformer.UserName, PcaTransformer.LoaderSignature)] + +[assembly: 
LoadableClass(PcaTransformer.Summary, typeof(PcaTransformer), null, typeof(SignatureLoadModel), + PcaTransformer.UserName, PcaTransformer.LoaderSignature)] + +[assembly: LoadableClass(typeof(IRowMapper), typeof(PcaTransformer), null, typeof(SignatureLoadRowMapper), PcaTransformer.UserName, PcaTransformer.LoaderSignature)] [assembly: LoadableClass(typeof(void), typeof(PcaTransformer), null, typeof(SignatureEntryPointModule), PcaTransformer.LoaderSignature)] @@ -27,7 +33,7 @@ namespace Microsoft.ML.Runtime.Data { /// - public sealed class PcaTransformer : OneToOneTransformBase + public sealed class PcaTransformer : OneToOneTransformerBase { internal static class Defaults { @@ -113,7 +119,7 @@ public TransformInfo(Column item, Arguments args, int d) Contracts.CheckUserArg(0 < Rank && Rank <= Dimension, nameof(item.Rank), "Rank must be positive, and at most the dimension of untransformed data"); } - public TransformInfo(ModelLoadContext ctx, int colValueCount) + public TransformInfo(ModelLoadContext ctx) { Contracts.AssertValue(ctx); @@ -126,8 +132,6 @@ public TransformInfo(ModelLoadContext ctx, int colValueCount) // Float[]: MeanProjected Dimension = ctx.Reader.ReadInt32(); - Contracts.CheckDecode(Dimension == colValueCount); - Rank = ctx.Reader.ReadInt32(); Contracts.CheckDecode(0 < Rank && Rank <= Dimension); @@ -205,26 +209,37 @@ private static VersionInfo GetVersionInfo() private readonly bool[] _center; private readonly int[] _weightColumnIndex; + private readonly int[] _inputColumnsIndex; + private readonly ColumnType[] _inputColumnsTypes; + private readonly int _numColumns; + private const string RegistrationName = "Pca"; /// /// Public constructor corresponding to SignatureDataTransform. 
/// public PcaTransformer(IHostEnvironment env, Arguments args, IDataView input) - : base(env, RegistrationName, Contracts.CheckRef(args, nameof(args)).Column, - input, TestIsFloatItem) + : base(Contracts.CheckRef(env, nameof(env)).Register(nameof(PcaTransformer)), GetColumnPairs(args)) { - Host.AssertNonEmpty(Infos); - Host.Assert(Infos.Length == Utils.Size(args.Column)); + Host.AssertNonEmpty(ColumnPairs); + Host.Assert(ColumnPairs.Length == Utils.Size(args.Column)); + + _numColumns = ColumnPairs.Length; + _transformInfos = new TransformInfo[_numColumns]; + _oversampling = new int[_numColumns]; + _center = new bool[_numColumns]; + _weightColumnIndex = new int[_numColumns]; + _inputColumnsIndex = new int[_numColumns]; + _inputColumnsTypes = new ColumnType[_numColumns]; - _transformInfos = new TransformInfo[args.Column.Length]; - _oversampling = new int[args.Column.Length]; - _center = new bool[args.Column.Length]; - _weightColumnIndex = new int[args.Column.Length]; for (int i = 0; i < _transformInfos.Length; i++) { - Host.Check(Infos[i].TypeSrc.VectorSize > 1, "Pca transform can only be applied to columns with known dimensionality greater than 1"); - _transformInfos[i] = new TransformInfo(args.Column[i], args, Infos[i].TypeSrc.ValueCount); + if (!input.Schema.TryGetColumnIndex(ColumnPairs[i].input, out _inputColumnsIndex[i])) + throw Host.ExceptSchemaMismatch(nameof(input), "input", ColumnPairs[i].input); + _inputColumnsTypes[i] = input.Schema.GetColumnType(_inputColumnsIndex[i]); + Host.Check(_inputColumnsTypes[i].IsKnownSizeVector && _inputColumnsTypes[i].VectorSize > 1, + "Pca transform can only be applied to columns with known dimensionality greater than 1"); + _transformInfos[i] = new TransformInfo(args.Column[i], args, _inputColumnsTypes[i].ValueCount); _center[i] = args.Column[i].Center ?? args.Center; _oversampling[i] = args.Column[i].Oversampling ?? 
args.Oversampling; Host.CheckUserArg(_oversampling[i] >= 0, nameof(args.Oversampling), "Oversampling must be non-negative"); @@ -232,20 +247,19 @@ public PcaTransformer(IHostEnvironment env, Arguments args, IDataView input) var weightColumn = args.Column[i].WeightColumn ?? args.WeightColumn; if (weightColumn != null) { - if (!Source.Schema.TryGetColumnIndex(weightColumn, out _weightColumnIndex[i])) + if (!input.Schema.TryGetColumnIndex(weightColumn, out _weightColumnIndex[i])) throw Host.Except("weight column '{0}' does not exist", weightColumn); - var type = Source.Schema.GetColumnType(_weightColumnIndex[i]); + var type = input.Schema.GetColumnType(_weightColumnIndex[i]); Host.CheckUserArg(type == NumberType.Float, nameof(args.WeightColumn)); } } Train(args, _transformInfos, input); - _types = InitColumnTypes(); } - private PcaTransformer(IHost host, ModelLoadContext ctx, IDataView input) - : base(host, ctx, input, TestIsFloatItem) + private PcaTransformer(IHost host, ModelLoadContext ctx) + : base(host, ctx) { Host.AssertValue(ctx); @@ -253,13 +267,20 @@ private PcaTransformer(IHost host, ModelLoadContext ctx, IDataView input) // // // transformInfos - Host.AssertNonEmpty(Infos); - _transformInfos = new TransformInfo[Infos.Length]; - for (int i = 0; i < Infos.Length; i++) - _transformInfos[i] = new TransformInfo(ctx, Infos[i].TypeSrc.ValueCount); + Host.AssertNonEmpty(ColumnPairs); + _numColumns = ColumnPairs.Length; + _transformInfos = new TransformInfo[_numColumns]; + for (int i = 0; i < _numColumns; i++) + _transformInfos[i] = new TransformInfo(ctx); _types = InitColumnTypes(); } + private static (string input, string output)[] GetColumnPairs(Arguments args) + { + //Contracts.CheckValue(columns, nameof(columns)); + return args.Column.Select(x => (x.Source, x.Name)).ToArray(); + } + public static PcaTransformer Create(IHostEnvironment env, ModelLoadContext ctx, IDataView input) { Contracts.CheckValue(env, nameof(env)); @@ -287,7 +308,7 @@ public override void 
Save(ModelSaveContext ctx) // // transformInfos ctx.Writer.Write(sizeof(Float)); - SaveBase(ctx); + SaveColumns(ctx); for (int i = 0; i < _transformInfos.Length; i++) _transformInfos[i].Save(ctx); } @@ -313,7 +334,7 @@ private void Train(Arguments args, TransformInfo[] transformInfos, IDataView tra using (var ch = Host.Start("Memory usage")) { ch.Info("Estimate memory usage for transforming column {1}: {0:G2} GB. If running out of memory, reduce rank and oversampling factor.", - colMemoryUsageEstimate, Infos[iinfo].Name); + colMemoryUsageEstimate, ColumnPairs[iinfo].input); } } @@ -386,7 +407,7 @@ private void Train(Arguments args, TransformInfo[] transformInfos, IDataView tra //Note that the covariance matrix is not computed explicitly private void Project(IDataView trainingData, Float[][] mean, Float[][][] omega, Float[][][] y, TransformInfo[] transformInfos) { - Host.Assert(mean.Length == omega.Length && omega.Length == y.Length && y.Length == Infos.Length); + Host.Assert(mean.Length == omega.Length && omega.Length == y.Length && y.Length == _numColumns); for (int i = 0; i < omega.Length; i++) Contracts.Assert(omega[i].Length == y[i].Length); @@ -399,37 +420,37 @@ private void Project(IDataView trainingData, Float[][] mean, Float[][][] omega, bool[] center = Enumerable.Range(0, mean.Length).Select(i => mean[i] != null).ToArray(); - Double[] totalColWeight = new Double[Infos.Length]; + Double[] totalColWeight = new Double[_numColumns]; - bool[] activeColumns = new bool[Source.Schema.ColumnCount]; - for (int iinfo = 0; iinfo < Infos.Length; iinfo++) + bool[] activeColumns = new bool[trainingData.Schema.ColumnCount]; + for (int iinfo = 0; iinfo < _numColumns; iinfo++) { - activeColumns[Infos[iinfo].Source] = true; + activeColumns[_inputColumnsIndex[iinfo]] = true; if (_weightColumnIndex[iinfo] >= 0) activeColumns[_weightColumnIndex[iinfo]] = true; } + using (var cursor = trainingData.GetRowCursor(col => activeColumns[col])) { - var weightGetters = new 
ValueGetter[Infos.Length]; - var columnGetters = new ValueGetter>[Infos.Length]; - for (int iinfo = 0; iinfo < Infos.Length; iinfo++) + var weightGetters = new ValueGetter[_numColumns]; + var columnGetters = new ValueGetter>[_numColumns]; + for (int iinfo = 0; iinfo < _numColumns; iinfo++) { if (_weightColumnIndex[iinfo] >= 0) weightGetters[iinfo] = cursor.GetGetter(_weightColumnIndex[iinfo]); - columnGetters[iinfo] = cursor.GetGetter>(Infos[iinfo].Source); + columnGetters[iinfo] = cursor.GetGetter>(_inputColumnsIndex[iinfo]); } var features = default(VBuffer); while (cursor.MoveNext()) { - for (int iinfo = 0; iinfo < Infos.Length; iinfo++) + for (int iinfo = 0; iinfo < _numColumns; iinfo++) { - Contracts.Check(Infos[iinfo].TypeSrc.IsVector && Infos[iinfo].TypeSrc.ItemType.IsNumber, + Contracts.Check(_inputColumnsTypes[iinfo].IsVector && _inputColumnsTypes[iinfo].ItemType.IsNumber, "PCA transform can only be performed on numeric columns of dimension > 1"); Float weight = 1; - if (weightGetters[iinfo] != null) - weightGetters[iinfo](ref weight); + weightGetters[iinfo]?.Invoke(ref weight); columnGetters[iinfo](ref features); if (FloatUtils.IsFinite(weight) && weight >= 0 && (features.Count == 0 || FloatUtils.IsFinite(features.Values, features.Count))) @@ -445,13 +466,13 @@ private void Project(IDataView trainingData, Float[][] mean, Float[][][] omega, } } - for (int iinfo = 0; iinfo < Infos.Length; iinfo++) + for (int iinfo = 0; iinfo < _numColumns; iinfo++) { if (totalColWeight[iinfo] <= 0) - throw Host.Except("Empty data in column '{0}'", Source.Schema.GetColumnName(Infos[iinfo].Source)); + throw Host.Except("Empty data in column '{0}'", ColumnPairs[iinfo].input); } - for (int iinfo = 0; iinfo < Infos.Length; iinfo++) + for (int iinfo = 0; iinfo < _numColumns; iinfo++) { var invn = (Float)(1 / totalColWeight[iinfo]); @@ -495,11 +516,8 @@ private Float[][] PostProcess(Float[][] y, Float[] sigma, Float[] z, int d, int private ColumnType[] InitColumnTypes() { - 
Host.Assert(Infos.Length == _transformInfos.Length); - var types = new ColumnType[Infos.Length]; - for (int i = 0; i < _transformInfos.Length; i++) - types[i] = new VectorType(NumberType.Float, _transformInfos[i].Rank); - Metadata.Seal(); + Host.Assert(ColumnPairs.Length == _transformInfos.Length); + var types = _transformInfos.Select(tInfo => new VectorType(NumberType.Float, tInfo.Rank)).ToArray(); return types; } @@ -513,7 +531,7 @@ protected override Delegate GetGetterCore(IChannel ch, IRow input, int iinfo, ou { Host.AssertValueOrNull(ch); Host.AssertValue(input); - Host.Assert(0 <= iinfo && iinfo < Infos.Length); + Host.Assert(0 <= iinfo && iinfo < _numColumns); disposer = null; var getSrc = GetSrcGetter>(input, iinfo); @@ -545,7 +563,7 @@ private static void TransformFeatures(IExceptionContext ectx, ref VBuffer dst = new VBuffer(transformInfo.Rank, values, dst.Indices); } - [TlcModule.EntryPoint(Name = "Transforms.PcaCalculator", + [TlcModule.EntryPoint(Name = "Transforms.PcaCalculator2", Desc = Summary, UserName = UserName, ShortName = ShortName, @@ -561,5 +579,10 @@ public static CommonOutputs.TransformOutput Calculate(IHostEnvironment env, Argu OutputData = view }; } + + protected override IRowMapper MakeRowMapper(ISchema schema) + { + throw new NotImplementedException(); + } } } From fbb7bf90100d1fc52dc8a84b517e1b5fd704607d Mon Sep 17 00:00:00 2001 From: Shahab Moradi Date: Thu, 18 Oct 2018 08:17:42 -0700 Subject: [PATCH 03/28] Added ColumnInfo --- src/Microsoft.ML.PCA/PcaTransformer.cs | 109 +++++++++++++++++-------- 1 file changed, 76 insertions(+), 33 deletions(-) diff --git a/src/Microsoft.ML.PCA/PcaTransformer.cs b/src/Microsoft.ML.PCA/PcaTransformer.cs index fa18484703..5af3ac43ff 100644 --- a/src/Microsoft.ML.PCA/PcaTransformer.cs +++ b/src/Microsoft.ML.PCA/PcaTransformer.cs @@ -104,6 +104,31 @@ public bool TryUnparse(StringBuilder sb) } } + public sealed class ColumnInfo + { + public readonly string Input; + public readonly string Output; + public 
readonly string WeightColumn; + public readonly int Rank; + public readonly int Oversampling; + public readonly bool Center; + public readonly int? Seed; + + /// + /// Describes how the transformer handles one column pair. + /// + public ColumnInfo(string input, string output, string weightColumn, int rank, int overSampling, bool center, int? seed = null) + { + Input = input; + Output = output; + WeightColumn = weightColumn; + Rank = rank; + Oversampling = overSampling; + Center = center; + Seed = seed; + } + } + private sealed class TransformInfo { public readonly int Dimension; @@ -112,11 +137,11 @@ private sealed class TransformInfo public Float[][] Eigenvectors; public Float[] MeanProjected; - public TransformInfo(Column item, Arguments args, int d) + public TransformInfo(int rank, int dim) { - Dimension = d; - Rank = item.Rank ?? args.Rank; - Contracts.CheckUserArg(0 < Rank && Rank <= Dimension, nameof(item.Rank), "Rank must be positive, and at most the dimension of untransformed data"); + Dimension = dim; + Rank = rank; + Contracts.CheckUserArg(0 < Rank && Rank <= Dimension, nameof(Rank), "Rank must be positive, and at most the dimension of untransformed data"); } public TransformInfo(ModelLoadContext ctx) @@ -204,11 +229,7 @@ private static VersionInfo GetVersionInfo() // These are parallel to Infos. private readonly ColumnType[] _types; private readonly TransformInfo[] _transformInfos; - - private readonly int[] _oversampling; - private readonly bool[] _center; private readonly int[] _weightColumnIndex; - private readonly int[] _inputColumnsIndex; private readonly ColumnType[] _inputColumnsTypes; private readonly int _numColumns; @@ -218,43 +239,39 @@ private static VersionInfo GetVersionInfo() /// /// Public constructor corresponding to SignatureDataTransform. 
/// - public PcaTransformer(IHostEnvironment env, Arguments args, IDataView input) - : base(Contracts.CheckRef(env, nameof(env)).Register(nameof(PcaTransformer)), GetColumnPairs(args)) + public PcaTransformer(IHostEnvironment env, IDataView input, ColumnInfo[] columns) + : base(Contracts.CheckRef(env, nameof(env)).Register(nameof(PcaTransformer)), GetColumnPairs(columns)) { Host.AssertNonEmpty(ColumnPairs); - Host.Assert(ColumnPairs.Length == Utils.Size(args.Column)); _numColumns = ColumnPairs.Length; _transformInfos = new TransformInfo[_numColumns]; - _oversampling = new int[_numColumns]; - _center = new bool[_numColumns]; _weightColumnIndex = new int[_numColumns]; _inputColumnsIndex = new int[_numColumns]; _inputColumnsTypes = new ColumnType[_numColumns]; for (int i = 0; i < _transformInfos.Length; i++) { - if (!input.Schema.TryGetColumnIndex(ColumnPairs[i].input, out _inputColumnsIndex[i])) - throw Host.ExceptSchemaMismatch(nameof(input), "input", ColumnPairs[i].input); + var col = columns[i]; + if (!input.Schema.TryGetColumnIndex(col.Input, out _inputColumnsIndex[i])) + throw Host.ExceptSchemaMismatch(nameof(col.Input), "input", col.Input); _inputColumnsTypes[i] = input.Schema.GetColumnType(_inputColumnsIndex[i]); Host.Check(_inputColumnsTypes[i].IsKnownSizeVector && _inputColumnsTypes[i].VectorSize > 1, "Pca transform can only be applied to columns with known dimensionality greater than 1"); - _transformInfos[i] = new TransformInfo(args.Column[i], args, _inputColumnsTypes[i].ValueCount); - _center[i] = args.Column[i].Center ?? args.Center; - _oversampling[i] = args.Column[i].Oversampling ?? 
args.Oversampling; - Host.CheckUserArg(_oversampling[i] >= 0, nameof(args.Oversampling), "Oversampling must be non-negative"); + _transformInfos[i] = new TransformInfo(col.Rank, _inputColumnsTypes[i].ValueCount); + Host.CheckUserArg(col.Oversampling >= 0, nameof(col.Oversampling), "Oversampling must be non-negative"); _weightColumnIndex[i] = -1; - var weightColumn = args.Column[i].WeightColumn ?? args.WeightColumn; + var weightColumn = col.WeightColumn; if (weightColumn != null) { if (!input.Schema.TryGetColumnIndex(weightColumn, out _weightColumnIndex[i])) throw Host.Except("weight column '{0}' does not exist", weightColumn); var type = input.Schema.GetColumnType(_weightColumnIndex[i]); - Host.CheckUserArg(type == NumberType.Float, nameof(args.WeightColumn)); + Host.CheckUserArg(type == NumberType.Float, nameof(weightColumn)); } } - Train(args, _transformInfos, input); + Train(columns, _transformInfos, input); _types = InitColumnTypes(); } @@ -275,10 +292,37 @@ private PcaTransformer(IHost host, ModelLoadContext ctx) _types = InitColumnTypes(); } - private static (string input, string output)[] GetColumnPairs(Arguments args) + private static (string input, string output)[] GetColumnPairs(ColumnInfo[] columns) { //Contracts.CheckValue(columns, nameof(columns)); - return args.Column.Select(x => (x.Source, x.Name)).ToArray(); + return columns.Select(x => (x.Input, x.Output)).ToArray(); + } + + // Factory method for SignatureDataTransform. + private static IDataTransform Create(IHostEnvironment env, Arguments args, IDataView input) + { + Contracts.CheckValue(env, nameof(env)); + env.CheckValue(args, nameof(args)); + env.CheckValue(input, nameof(input)); + + env.CheckValue(args.Column, nameof(args.Column)); + var cols = new ColumnInfo[args.Column.Length]; + using (var ch = env.Start("ValidateArgs")) + { + + for (int i = 0; i < cols.Length; i++) + { + var item = args.Column[i]; + cols[i] = new ColumnInfo(item.Source, + item.Name, + item.WeightColumn, + item.Rank ?? 
args.Rank, + item.Oversampling ?? args.Oversampling, + item.Center ?? args.Center, + item.Seed ?? args.Seed); + }; + } + return new PcaTransformer(env, input, cols).MakeDataTransform(input); } public static PcaTransformer Create(IHostEnvironment env, ModelLoadContext ctx, IDataView input) @@ -313,18 +357,17 @@ public override void Save(ModelSaveContext ctx) _transformInfos[i].Save(ctx); } - private void Train(Arguments args, TransformInfo[] transformInfos, IDataView trainingData) + private void Train(ColumnInfo[] columns, TransformInfo[] transformInfos, IDataView trainingData) { - var y = new Float[transformInfos.Length][][]; - var omega = new Float[transformInfos.Length][][]; - var mean = new Float[transformInfos.Length][]; - - var oversampledRank = new int[transformInfos.Length]; + var y = new Float[_numColumns][][]; + var omega = new Float[_numColumns][][]; + var mean = new Float[_numColumns][]; + var oversampledRank = new int[_numColumns]; var rnd = Host.Rand; Double totalMemoryUsageEstimate = 0; - for (int iinfo = 0; iinfo < transformInfos.Length; iinfo++) + for (int iinfo = 0; iinfo < _numColumns; iinfo++) { - oversampledRank[iinfo] = Math.Min(transformInfos[iinfo].Rank + _oversampling[iinfo], transformInfos[iinfo].Dimension); + oversampledRank[iinfo] = Math.Min(transformInfos[iinfo].Rank + columns[iinfo].Oversampling, transformInfos[iinfo].Dimension); //exact: (size of the 2 big matrices + other minor allocations) / (2^30) Double colMemoryUsageEstimate = 2.0 * transformInfos[iinfo].Dimension * oversampledRank[iinfo] * sizeof(Float) / 1e9; @@ -350,7 +393,7 @@ private void Train(Arguments args, TransformInfo[] transformInfos, IDataView tra } } - if (_center[iinfo]) + if (columns[iinfo].Center) mean[iinfo] = new Float[transformInfos[iinfo].Dimension]; } if (totalMemoryUsageEstimate > 2) From e1cada0fdcef0a2f5552a7210670c9e2ad2b44d4 Mon Sep 17 00:00:00 2001 From: Shahab Moradi Date: Thu, 18 Oct 2018 08:31:38 -0700 Subject: [PATCH 04/28] Added all the Create() 
methods. --- src/Microsoft.ML.PCA/PcaTransformer.cs | 40 +++++++++++++++----------- 1 file changed, 24 insertions(+), 16 deletions(-) diff --git a/src/Microsoft.ML.PCA/PcaTransformer.cs b/src/Microsoft.ML.PCA/PcaTransformer.cs index 5af3ac43ff..086753afcc 100644 --- a/src/Microsoft.ML.PCA/PcaTransformer.cs +++ b/src/Microsoft.ML.PCA/PcaTransformer.cs @@ -292,11 +292,13 @@ private PcaTransformer(IHost host, ModelLoadContext ctx) _types = InitColumnTypes(); } - private static (string input, string output)[] GetColumnPairs(ColumnInfo[] columns) - { - //Contracts.CheckValue(columns, nameof(columns)); - return columns.Select(x => (x.Input, x.Output)).ToArray(); - } + // Factory method for SignatureLoadDataTransform. + private static IDataTransform Create(IHostEnvironment env, ModelLoadContext ctx, IDataView input) + => Create(env, ctx).MakeDataTransform(input); + + // Factory method for SignatureLoadRowMapper. + private static IRowMapper Create(IHostEnvironment env, ModelLoadContext ctx, ISchema inputSchema) + => Create(env, ctx).MakeRowMapper(inputSchema); // Factory method for SignatureDataTransform. private static IDataTransform Create(IHostEnvironment env, Arguments args, IDataView input) @@ -325,20 +327,20 @@ private static IDataTransform Create(IHostEnvironment env, Arguments args, IData return new PcaTransformer(env, input, cols).MakeDataTransform(input); } - public static PcaTransformer Create(IHostEnvironment env, ModelLoadContext ctx, IDataView input) + // Factory method for SignatureLoadModel. 
+ private static PcaTransformer Create(IHostEnvironment env, ModelLoadContext ctx) { Contracts.CheckValue(env, nameof(env)); - var h = env.Register(RegistrationName); - h.CheckValue(ctx, nameof(ctx)); - h.CheckValue(input, nameof(input)); - ctx.CheckAtModel(GetVersionInfo()); + var host = env.Register(nameof(PcaTransformer)); - // *** Binary format *** - // int: sizeof(Float) - // - int cbFloat = ctx.Reader.ReadInt32(); - h.CheckDecode(cbFloat == sizeof(Float)); - return h.Apply("Loading Model", ch => new PcaTransformer(h, ctx, input)); + host.CheckValue(ctx, nameof(ctx)); + ctx.CheckAtModel(GetVersionInfo()); + if (ctx.Header.ModelVerWritten == 0x00010001) + { + int cbFloat = ctx.Reader.ReadInt32(); + env.CheckDecode(cbFloat == sizeof(float)); + } + return new PcaTransformer(host, ctx); } public override void Save(ModelSaveContext ctx) @@ -357,6 +359,12 @@ public override void Save(ModelSaveContext ctx) _transformInfos[i].Save(ctx); } + private static (string input, string output)[] GetColumnPairs(ColumnInfo[] columns) + { + //Contracts.CheckValue(columns, nameof(columns)); + return columns.Select(x => (x.Input, x.Output)).ToArray(); + } + private void Train(ColumnInfo[] columns, TransformInfo[] transformInfos, IDataView trainingData) { var y = new Float[_numColumns][][]; From 4f490b01332be145d2349ad0f8769d66225280a0 Mon Sep 17 00:00:00 2001 From: Shahab Moradi Date: Thu, 18 Oct 2018 12:00:52 -0700 Subject: [PATCH 05/28] Added Mapper --- src/Microsoft.ML.PCA/PcaTransformer.cs | 154 +++++++++++++++---------- 1 file changed, 94 insertions(+), 60 deletions(-) diff --git a/src/Microsoft.ML.PCA/PcaTransformer.cs b/src/Microsoft.ML.PCA/PcaTransformer.cs index 086753afcc..8f3a227189 100644 --- a/src/Microsoft.ML.PCA/PcaTransformer.cs +++ b/src/Microsoft.ML.PCA/PcaTransformer.cs @@ -227,11 +227,11 @@ private static VersionInfo GetVersionInfo() } // These are parallel to Infos. 
- private readonly ColumnType[] _types; + private readonly ColumnType[] _outputColumnTypes; private readonly TransformInfo[] _transformInfos; - private readonly int[] _weightColumnIndex; - private readonly int[] _inputColumnsIndex; - private readonly ColumnType[] _inputColumnsTypes; + private readonly int[] _weightColumnIndices; + private readonly int[] _inputColumnIndices; + private readonly ColumnType[] _inputColumnTypes; private readonly int _numColumns; private const string RegistrationName = "Pca"; @@ -246,33 +246,33 @@ public PcaTransformer(IHostEnvironment env, IDataView input, ColumnInfo[] column _numColumns = ColumnPairs.Length; _transformInfos = new TransformInfo[_numColumns]; - _weightColumnIndex = new int[_numColumns]; - _inputColumnsIndex = new int[_numColumns]; - _inputColumnsTypes = new ColumnType[_numColumns]; + _weightColumnIndices = new int[_numColumns]; + _inputColumnIndices = new int[_numColumns]; + _inputColumnTypes = new ColumnType[_numColumns]; - for (int i = 0; i < _transformInfos.Length; i++) + for (int i = 0; i < _numColumns; i++) { var col = columns[i]; - if (!input.Schema.TryGetColumnIndex(col.Input, out _inputColumnsIndex[i])) + if (!input.Schema.TryGetColumnIndex(col.Input, out _inputColumnIndices[i])) throw Host.ExceptSchemaMismatch(nameof(col.Input), "input", col.Input); - _inputColumnsTypes[i] = input.Schema.GetColumnType(_inputColumnsIndex[i]); - Host.Check(_inputColumnsTypes[i].IsKnownSizeVector && _inputColumnsTypes[i].VectorSize > 1, + _inputColumnTypes[i] = input.Schema[_inputColumnIndices[i]].Type; + Host.Check(_inputColumnTypes[i].IsKnownSizeVector && _inputColumnTypes[i].VectorSize > 1, "Pca transform can only be applied to columns with known dimensionality greater than 1"); - _transformInfos[i] = new TransformInfo(col.Rank, _inputColumnsTypes[i].ValueCount); + _transformInfos[i] = new TransformInfo(col.Rank, _inputColumnTypes[i].ValueCount); Host.CheckUserArg(col.Oversampling >= 0, nameof(col.Oversampling), "Oversampling 
must be non-negative"); - _weightColumnIndex[i] = -1; + _weightColumnIndices[i] = -1; var weightColumn = col.WeightColumn; if (weightColumn != null) { - if (!input.Schema.TryGetColumnIndex(weightColumn, out _weightColumnIndex[i])) + if (!input.Schema.TryGetColumnIndex(weightColumn, out _weightColumnIndices[i])) throw Host.Except("weight column '{0}' does not exist", weightColumn); - var type = input.Schema.GetColumnType(_weightColumnIndex[i]); + var type = input.Schema.GetColumnType(_weightColumnIndices[i]); Host.CheckUserArg(type == NumberType.Float, nameof(weightColumn)); } } Train(columns, _transformInfos, input); - _types = InitColumnTypes(); + _outputColumnTypes = InitColumnTypes(); } private PcaTransformer(IHost host, ModelLoadContext ctx) @@ -289,7 +289,7 @@ private PcaTransformer(IHost host, ModelLoadContext ctx) _transformInfos = new TransformInfo[_numColumns]; for (int i = 0; i < _numColumns; i++) _transformInfos[i] = new TransformInfo(ctx); - _types = InitColumnTypes(); + _outputColumnTypes = InitColumnTypes(); } // Factory method for SignatureLoadDataTransform. 
@@ -476,9 +476,9 @@ private void Project(IDataView trainingData, Float[][] mean, Float[][][] omega, bool[] activeColumns = new bool[trainingData.Schema.ColumnCount]; for (int iinfo = 0; iinfo < _numColumns; iinfo++) { - activeColumns[_inputColumnsIndex[iinfo]] = true; - if (_weightColumnIndex[iinfo] >= 0) - activeColumns[_weightColumnIndex[iinfo]] = true; + activeColumns[_inputColumnIndices[iinfo]] = true; + if (_weightColumnIndices[iinfo] >= 0) + activeColumns[_weightColumnIndices[iinfo]] = true; } using (var cursor = trainingData.GetRowCursor(col => activeColumns[col])) @@ -487,9 +487,9 @@ private void Project(IDataView trainingData, Float[][] mean, Float[][][] omega, var columnGetters = new ValueGetter>[_numColumns]; for (int iinfo = 0; iinfo < _numColumns; iinfo++) { - if (_weightColumnIndex[iinfo] >= 0) - weightGetters[iinfo] = cursor.GetGetter(_weightColumnIndex[iinfo]); - columnGetters[iinfo] = cursor.GetGetter>(_inputColumnsIndex[iinfo]); + if (_weightColumnIndices[iinfo] >= 0) + weightGetters[iinfo] = cursor.GetGetter(_weightColumnIndices[iinfo]); + columnGetters[iinfo] = cursor.GetGetter>(_inputColumnIndices[iinfo]); } var features = default(VBuffer); @@ -497,7 +497,7 @@ private void Project(IDataView trainingData, Float[][] mean, Float[][][] omega, { for (int iinfo = 0; iinfo < _numColumns; iinfo++) { - Contracts.Check(_inputColumnsTypes[iinfo].IsVector && _inputColumnsTypes[iinfo].ItemType.IsNumber, + Contracts.Check(_inputColumnTypes[iinfo].IsVector && _inputColumnTypes[iinfo].ItemType.IsNumber, "PCA transform can only be performed on numeric columns of dimension > 1"); Float weight = 1; @@ -572,46 +572,85 @@ private ColumnType[] InitColumnTypes() return types; } - protected override ColumnType GetColumnTypeCore(int iinfo) - { - Host.Check(0 <= iinfo & iinfo < Utils.Size(_types)); - return _types[iinfo]; - } + protected override IRowMapper MakeRowMapper(ISchema schema) => new Mapper(this, Schema.Create(schema)); - protected override Delegate 
GetGetterCore(IChannel ch, IRow input, int iinfo, out Action disposer) + private sealed class Mapper : MapperBase { - Host.AssertValueOrNull(ch); - Host.AssertValue(input); - Host.Assert(0 <= iinfo && iinfo < _numColumns); - disposer = null; - - var getSrc = GetSrcGetter>(input, iinfo); - var src = default(VBuffer); - var trInfo = _transformInfos[iinfo]; - ValueGetter> del = - (ref VBuffer dst) => + private readonly ColumnType[] _outputColumnTypes; + private readonly ColumnType[] _inputColumnTypes; + private readonly int[] _inputColumnIndices; + private readonly PcaTransformer _parent; + private readonly int _numColumns; + + public Mapper(PcaTransformer parent, Schema inputSchema) + : base(parent.Host.Register(nameof(Mapper)), parent, inputSchema) + { + _parent = parent; + _numColumns = parent._numColumns; + _outputColumnTypes = parent.InitColumnTypes(); + _inputColumnTypes = new ColumnType[_numColumns]; + _inputColumnIndices = new int[_numColumns]; + for (int i = 0; i < _numColumns; i++) { - getSrc(ref src); - TransformFeatures(Host, ref src, ref dst, trInfo); - }; - return del; - } + var inputColName = _parent.ColumnPairs[i].input; + if (!inputSchema.TryGetColumnIndex(inputColName, out _inputColumnIndices[i])) + throw Host.ExceptSchemaMismatch(nameof(inputColName), "input", inputColName); + _inputColumnTypes[i] = inputSchema[_inputColumnIndices[i]].Type; + Host.Check(_inputColumnTypes[i].IsKnownSizeVector && _inputColumnTypes[i].VectorSize > 1, + "Pca transform can only be applied to columns with known dimensionality greater than 1"); + if (_inputColumnTypes[i].VectorSize != _parent._transformInfos[i].Dimension) + { + var msg = $"Dimension of column ${inputColName} is ${_inputColumnTypes[i].VectorSize}, which doesn't match the expected size ${_parent._transformInfos[i].Dimension}"; + throw Host.Except(msg); + } + } + // Ivan't comment: + //var getSrc = input.GetGetter>(ColMapNewToOld[iinfo]); + } - private static void TransformFeatures(IExceptionContext ectx, ref 
VBuffer src, ref VBuffer dst, TransformInfo transformInfo) - { - ectx.Check(src.Length == transformInfo.Dimension); + public override Schema.Column[] GetOutputColumns() + { + var result = new Schema.Column[_numColumns]; + for (int i = 0; i < _numColumns; i++) + result[i] = new Schema.Column(_parent.ColumnPairs[i].output, _outputColumnTypes[i], null); + return result; + } - var values = dst.Values; - if (Utils.Size(values) < transformInfo.Rank) - values = new Float[transformInfo.Rank]; + protected override Delegate MakeGetter(IRow input, int iinfo, out Action disposer) + { + Contracts.AssertValue(input); + Contracts.Assert(0 <= iinfo && iinfo < _numColumns); + disposer = null; + + var srcGetter = input.GetGetter>(_inputColumnIndices[iinfo]); + var src = default(VBuffer); + + ValueGetter> dstGetter = (ref VBuffer dst) => + { + srcGetter(ref src); + TransformFeatures(Host, ref src, ref dst, _parent._transformInfos[iinfo]); + }; - for (int i = 0; i < transformInfo.Rank; i++) + return dstGetter; + } + + private static void TransformFeatures(IExceptionContext ectx, ref VBuffer src, ref VBuffer dst, TransformInfo transformInfo) { - values[i] = VectorUtils.DotProductWithOffset(transformInfo.Eigenvectors[i], 0, ref src) - - (transformInfo.MeanProjected == null ? 0 : transformInfo.MeanProjected[i]); + ectx.Check(src.Length == transformInfo.Dimension); + + var values = dst.Values; + if (Utils.Size(values) < transformInfo.Rank) + values = new Float[transformInfo.Rank]; + + for (int i = 0; i < transformInfo.Rank; i++) + { + values[i] = VectorUtils.DotProductWithOffset(transformInfo.Eigenvectors[i], 0, ref src) - + (transformInfo.MeanProjected == null ? 
0 : transformInfo.MeanProjected[i]); + } + + dst = new VBuffer(transformInfo.Rank, values, dst.Indices); } - dst = new VBuffer(transformInfo.Rank, values, dst.Indices); } [TlcModule.EntryPoint(Name = "Transforms.PcaCalculator2", @@ -630,10 +669,5 @@ public static CommonOutputs.TransformOutput Calculate(IHostEnvironment env, Argu OutputData = view }; } - - protected override IRowMapper MakeRowMapper(ISchema schema) - { - throw new NotImplementedException(); - } } } From fa8d9387d61a554d9c11a749dadff794922e2990 Mon Sep 17 00:00:00 2001 From: Shahab Moradi Date: Thu, 18 Oct 2018 12:53:31 -0700 Subject: [PATCH 06/28] Commented out the EntryPoint --- src/Microsoft.ML.PCA/PcaTransformer.cs | 38 +++++++++++++++----------- 1 file changed, 22 insertions(+), 16 deletions(-) diff --git a/src/Microsoft.ML.PCA/PcaTransformer.cs b/src/Microsoft.ML.PCA/PcaTransformer.cs index 8f3a227189..78e8bdfa8e 100644 --- a/src/Microsoft.ML.PCA/PcaTransformer.cs +++ b/src/Microsoft.ML.PCA/PcaTransformer.cs @@ -577,6 +577,7 @@ private ColumnType[] InitColumnTypes() private sealed class Mapper : MapperBase { private readonly ColumnType[] _outputColumnTypes; + // Todo: replace with ColMapNewToOld private readonly ColumnType[] _inputColumnTypes; private readonly int[] _inputColumnIndices; private readonly PcaTransformer _parent; @@ -651,23 +652,28 @@ private static void TransformFeatures(IExceptionContext ectx, ref VBuffer dst = new VBuffer(transformInfo.Rank, values, dst.Indices); } - } + //protected virtual void CheckInputColumn(ISchema inputSchema, int col, int srcCol) + //{ + // // By default, there are no extra checks. 
+ //} - [TlcModule.EntryPoint(Name = "Transforms.PcaCalculator2", - Desc = Summary, - UserName = UserName, - ShortName = ShortName, - XmlInclude = new[] { @"", - @""})] - public static CommonOutputs.TransformOutput Calculate(IHostEnvironment env, Arguments input) - { - var h = EntryPointUtils.CheckArgsAndCreateHost(env, "Pca", input); - var view = new PcaTransformer(h, input, input.Data); - return new CommonOutputs.TransformOutput() - { - Model = new TransformModel(h, view, input.Data), - OutputData = view - }; } + + //[TlcModule.EntryPoint(Name = "Transforms.PcaCalculator2", + // Desc = Summary, + // UserName = UserName, + // ShortName = ShortName, + // XmlInclude = new[] { @"", + // @""})] + //public static CommonOutputs.TransformOutput Calculate(IHostEnvironment env, Arguments input) + //{ + // var h = EntryPointUtils.CheckArgsAndCreateHost(env, "Pca", input); + // var view = new PcaTransformer(h, input, input.Data); + // return new CommonOutputs.TransformOutput() + // { + // Model = new TransformModel(h, view, input.Data), + // OutputData = view + // }; + //} } } From 12bfaa04d106aeb7dc00ad0affafe19585669e0c Mon Sep 17 00:00:00 2001 From: Shahab Moradi Date: Thu, 18 Oct 2018 13:48:33 -0700 Subject: [PATCH 07/28] Added PcaEstimator2 --- src/Microsoft.ML.PCA/PcaTransformer.cs | 52 ++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) diff --git a/src/Microsoft.ML.PCA/PcaTransformer.cs b/src/Microsoft.ML.PCA/PcaTransformer.cs index 78e8bdfa8e..677eee1657 100644 --- a/src/Microsoft.ML.PCA/PcaTransformer.cs +++ b/src/Microsoft.ML.PCA/PcaTransformer.cs @@ -15,6 +15,7 @@ using Microsoft.ML.Runtime.Internal.Utilities; using Microsoft.ML.Runtime.Model; using Microsoft.ML.Runtime.Numeric; +using Microsoft.ML.Core.Data; [assembly: LoadableClass(PcaTransformer.Summary, typeof(IDataTransform), typeof(PcaTransformer), typeof(PcaTransformer.Arguments), typeof(SignatureDataTransform), PcaTransformer.UserName, PcaTransformer.LoaderSignature, PcaTransformer.ShortName)] @@ 
-676,4 +677,55 @@ private static void TransformFeatures(IExceptionContext ectx, ref VBuffer // }; //} } + + public sealed class PcaEstimator2 : IEstimator + { + public static class Defaults + { + public const int NewDim = 1000; + public const bool UseSin = false; + } + + private readonly IHost _host; + private readonly PcaTransformer.ColumnInfo[] _columns; + + /// + /// Convinence constructor for simple one column case + /// + public PcaEstimator2(IHostEnvironment env, string inputColumn, string outputColumn = null, + string weightColumn = PcaTransformer.Defaults.WeightColumn, int rank = PcaTransformer.Defaults.Rank, + int overSampling = PcaTransformer.Defaults.Oversampling, bool center = PcaTransformer.Defaults.Center, + int? seed = null) + : this(env, new PcaTransformer.ColumnInfo(inputColumn, outputColumn ?? inputColumn, weightColumn, rank, overSampling, center, seed)) + { + } + + public PcaEstimator2(IHostEnvironment env, params PcaTransformer.ColumnInfo[] columns) + { + Contracts.CheckValue(env, nameof(env)); + _host = env.Register(nameof(PcaEstimator2)); + _columns = columns; + } + + public PcaTransformer Fit(IDataView input) => new PcaTransformer(_host, input, _columns); + + public SchemaShape GetOutputSchema(SchemaShape inputSchema) + { + _host.CheckValue(inputSchema, nameof(inputSchema)); + var result = inputSchema.Columns.ToDictionary(x => x.Name); + foreach (var colInfo in _columns) + { + if (!inputSchema.TryFindColumn(colInfo.Input, out var col)) + throw _host.ExceptSchemaMismatch(nameof(inputSchema), "input", colInfo.Input); + if (col.ItemType.RawKind != DataKind.R4 || col.Kind != SchemaShape.Column.VectorKind.Vector) + throw _host.ExceptSchemaMismatch(nameof(inputSchema), "input", colInfo.Input); + + result[colInfo.Output] = new SchemaShape.Column(colInfo.Output, + SchemaShape.Column.VectorKind.Vector, NumberType.R4, false); + } + + return new SchemaShape(result.Values); + } + } + } From 524d5452caac07e26fd0faaec946cf0ccbae47f2 Mon Sep 17 00:00:00 
2001 From: Shahab Moradi Date: Thu, 18 Oct 2018 15:22:43 -0700 Subject: [PATCH 08/28] PcaWorkout test passes --- src/Microsoft.ML.PCA/PcaTransformer.cs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/Microsoft.ML.PCA/PcaTransformer.cs b/src/Microsoft.ML.PCA/PcaTransformer.cs index 677eee1657..33e57a4bc0 100644 --- a/src/Microsoft.ML.PCA/PcaTransformer.cs +++ b/src/Microsoft.ML.PCA/PcaTransformer.cs @@ -213,13 +213,13 @@ internal void ProjectMean(Float[] mean) internal const string Summary = "PCA is a dimensionality-reduction transform which computes the projection of a numeric vector onto a low-rank subspace."; internal const string UserName = "Principal Component Analysis Transform"; - internal const string ShortName = "Pca"; + internal const string ShortName = "Pca2"; public const string LoaderSignature = "PcaTransformer"; private static VersionInfo GetVersionInfo() { return new VersionInfo( - modelSignature: "PCA FUNC", + modelSignature: "PCA FUN2", verWrittenCur: 0x00010001, // Initial verReadableCur: 0x00010001, verWeCanReadBack: 0x00010001, @@ -235,7 +235,7 @@ private static VersionInfo GetVersionInfo() private readonly ColumnType[] _inputColumnTypes; private readonly int _numColumns; - private const string RegistrationName = "Pca"; + private const string RegistrationName = "Pca2"; /// /// Public constructor corresponding to SignatureDataTransform. 
From 79af00f476662042638c1bd2b0a7cdb1d578ec2f Mon Sep 17 00:00:00 2001 From: Shahab Moradi Date: Thu, 18 Oct 2018 16:25:19 -0700 Subject: [PATCH 09/28] Added pigsty api --- src/Microsoft.ML.PCA/PcaTransformer.cs | 116 ++++++++++++++++++++----- 1 file changed, 92 insertions(+), 24 deletions(-) diff --git a/src/Microsoft.ML.PCA/PcaTransformer.cs b/src/Microsoft.ML.PCA/PcaTransformer.cs index 33e57a4bc0..dfc7bb3469 100644 --- a/src/Microsoft.ML.PCA/PcaTransformer.cs +++ b/src/Microsoft.ML.PCA/PcaTransformer.cs @@ -16,6 +16,9 @@ using Microsoft.ML.Runtime.Model; using Microsoft.ML.Runtime.Numeric; using Microsoft.ML.Core.Data; +using Microsoft.ML.StaticPipe; +using Microsoft.ML.StaticPipe.Runtime; +using System.Collections.Generic; [assembly: LoadableClass(PcaTransformer.Summary, typeof(IDataTransform), typeof(PcaTransformer), typeof(PcaTransformer.Arguments), typeof(SignatureDataTransform), PcaTransformer.UserName, PcaTransformer.LoaderSignature, PcaTransformer.ShortName)] @@ -307,25 +310,21 @@ private static IDataTransform Create(IHostEnvironment env, Arguments args, IData Contracts.CheckValue(env, nameof(env)); env.CheckValue(args, nameof(args)); env.CheckValue(input, nameof(input)); - env.CheckValue(args.Column, nameof(args.Column)); - var cols = new ColumnInfo[args.Column.Length]; - using (var ch = env.Start("ValidateArgs")) - { + var cols = ArgumentsToColumnInfos(args); + return new PcaTransformer(env, input, cols).MakeDataTransform(input); + } - for (int i = 0; i < cols.Length; i++) - { - var item = args.Column[i]; - cols[i] = new ColumnInfo(item.Source, + internal static ColumnInfo[] ArgumentsToColumnInfos(Arguments args) + { + return args.Column.Select(item => new ColumnInfo( + item.Source, item.Name, item.WeightColumn, item.Rank ?? args.Rank, item.Oversampling ?? args.Oversampling, item.Center ?? args.Center, - item.Seed ?? args.Seed); - }; - } - return new PcaTransformer(env, input, cols).MakeDataTransform(input); + item.Seed ?? 
args.Seed)).ToArray(); } // Factory method for SignatureLoadModel. @@ -680,31 +679,48 @@ private static void TransformFeatures(IExceptionContext ectx, ref VBuffer public sealed class PcaEstimator2 : IEstimator { - public static class Defaults - { - public const int NewDim = 1000; - public const bool UseSin = false; - } - private readonly IHost _host; private readonly PcaTransformer.ColumnInfo[] _columns; /// /// Convinence constructor for simple one column case /// + /// + /// The environment. + /// Input column to apply PCA on. + /// Output column. Null means is replaced. + /// The number of components in the PCA. + /// A delegate to apply all the advanced arguments to the algorithm. public PcaEstimator2(IHostEnvironment env, string inputColumn, string outputColumn = null, - string weightColumn = PcaTransformer.Defaults.WeightColumn, int rank = PcaTransformer.Defaults.Rank, - int overSampling = PcaTransformer.Defaults.Oversampling, bool center = PcaTransformer.Defaults.Center, - int? seed = null) - : this(env, new PcaTransformer.ColumnInfo(inputColumn, outputColumn ?? inputColumn, weightColumn, rank, overSampling, center, seed)) + int rank = PcaTransformer.Defaults.Rank, + Action advancedSettings = null) + : this(env, new[] { (inputColumn, outputColumn ?? inputColumn) }, rank, advancedSettings) { } - public PcaEstimator2(IHostEnvironment env, params PcaTransformer.ColumnInfo[] columns) + /// + /// The environment. + /// Pairs of columns to run the PCA on. + /// The number of components in the PCA. + /// A delegate to apply all the advanced arguments to the algorithm. 
+ public PcaEstimator2(IHostEnvironment env, (string input, string output)[] columns, + int rank = PcaTransformer.Defaults.Rank, + Action advancedSettings = null) { Contracts.CheckValue(env, nameof(env)); _host = env.Register(nameof(PcaEstimator2)); - _columns = columns; + + foreach (var (input, output) in columns) + { + _host.CheckUserArg(Utils.Size(input) > 0, nameof(input)); + _host.CheckValue(output, nameof(output)); + } + + var args = new PcaTransformer.Arguments(); + args.Column = columns.Select(x => new PcaTransformer.Column { Source = x.input, Name = x.output }).ToArray(); + args.Rank = rank; + advancedSettings?.Invoke(args); + _columns = PcaTransformer.ArgumentsToColumnInfos(args); } public PcaTransformer Fit(IDataView input) => new PcaTransformer(_host, input, _columns); @@ -728,4 +744,56 @@ public SchemaShape GetOutputSchema(SchemaShape inputSchema) } } + /// + /// Extensions for statically typed . + /// + public static class PcaEstimatorExtensions2 + { + private sealed class OutPipelineColumn : Vector + { + public readonly Vector Input; + + public OutPipelineColumn(Vector input, int rank, Action advancedSettings) + : base(new Reconciler(null, rank, advancedSettings), input) + { + Input = input; + } + } + + private sealed class Reconciler : EstimatorReconciler + { + private readonly int _rank; + private readonly Action _advancedSettings; + + public Reconciler(PipelineColumn weightColumn, int rank, Action advancedSettings) + { + _rank = rank; + _advancedSettings = advancedSettings; + } + + public override IEstimator Reconcile(IHostEnvironment env, + PipelineColumn[] toOutput, + IReadOnlyDictionary inputNames, + IReadOnlyDictionary outputNames, + IReadOnlyCollection usedNames) + { + Contracts.Assert(toOutput.Length == 1); + + var pairs = new List<(string input, string output)>(); + foreach (var outCol in toOutput) + pairs.Add((inputNames[((OutPipelineColumn)outCol).Input], outputNames[outCol])); + + return new PcaEstimator2(env, pairs.ToArray(), _rank, 
_advancedSettings); + } + } + + /// Replace current vector with its principal components. Can significantly reduce size of vector. + /// + /// The column to apply PCA to. + /// The number of components in the PCA. + /// A delegate to apply all the advanced arguments to the algorithm. + public static Vector ToPrincipalComponents(this Vector input, + int rank = PcaTransformer.Defaults.Rank, + Action advancedSettings = null) => new OutPipelineColumn(input, rank, advancedSettings); + } } From f58b5e7192049fb7b752c067d93048524eefc368 Mon Sep 17 00:00:00 2001 From: Shahab Moradi Date: Thu, 18 Oct 2018 16:37:13 -0700 Subject: [PATCH 10/28] Fixed EntryPoint --- src/Microsoft.ML.PCA/PcaTransformer.cs | 32 +++++++++++++------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/src/Microsoft.ML.PCA/PcaTransformer.cs b/src/Microsoft.ML.PCA/PcaTransformer.cs index dfc7bb3469..c388644e04 100644 --- a/src/Microsoft.ML.PCA/PcaTransformer.cs +++ b/src/Microsoft.ML.PCA/PcaTransformer.cs @@ -659,22 +659,22 @@ private static void TransformFeatures(IExceptionContext ectx, ref VBuffer } - //[TlcModule.EntryPoint(Name = "Transforms.PcaCalculator2", - // Desc = Summary, - // UserName = UserName, - // ShortName = ShortName, - // XmlInclude = new[] { @"", - // @""})] - //public static CommonOutputs.TransformOutput Calculate(IHostEnvironment env, Arguments input) - //{ - // var h = EntryPointUtils.CheckArgsAndCreateHost(env, "Pca", input); - // var view = new PcaTransformer(h, input, input.Data); - // return new CommonOutputs.TransformOutput() - // { - // Model = new TransformModel(h, view, input.Data), - // OutputData = view - // }; - //} + [TlcModule.EntryPoint(Name = "Transforms.PcaCalculator2", + Desc = Summary, + UserName = UserName, + ShortName = ShortName, + XmlInclude = new[] { @"", + @""})] + public static CommonOutputs.TransformOutput Calculate(IHostEnvironment env, Arguments input) + { + var h = EntryPointUtils.CheckArgsAndCreateHost(env, "Pca", input); + var 
view = PcaTransformer.Create(h, input, input.Data); + return new CommonOutputs.TransformOutput() + { + Model = new TransformModel(h, view, input.Data), + OutputData = view + }; + } } public sealed class PcaEstimator2 : IEstimator From 9a29c983043c70c9734871ec449e6bd7dcc2a84a Mon Sep 17 00:00:00 2001 From: Shahab Moradi Date: Fri, 19 Oct 2018 09:11:33 -0700 Subject: [PATCH 11/28] Fixed the arguments --- src/Microsoft.ML.PCA/PcaTransformer.cs | 140 ++++++++++-------- .../Transformers/PcaTests.cs | 9 +- 2 files changed, 87 insertions(+), 62 deletions(-) diff --git a/src/Microsoft.ML.PCA/PcaTransformer.cs b/src/Microsoft.ML.PCA/PcaTransformer.cs index c388644e04..6bc624bf03 100644 --- a/src/Microsoft.ML.PCA/PcaTransformer.cs +++ b/src/Microsoft.ML.PCA/PcaTransformer.cs @@ -121,7 +121,13 @@ public sealed class ColumnInfo /// /// Describes how the transformer handles one column pair. /// - public ColumnInfo(string input, string output, string weightColumn, int rank, int overSampling, bool center, int? seed = null) + public ColumnInfo(string input, + string output, + string weightColumn = PcaTransformer.Defaults.WeightColumn, + int rank = PcaTransformer.Defaults.Rank, + int overSampling = PcaTransformer.Defaults.Oversampling, + bool center = PcaTransformer.Defaults.Center, + int? 
seed = null) { Input = input; Output = output; @@ -311,13 +317,7 @@ private static IDataTransform Create(IHostEnvironment env, Arguments args, IData env.CheckValue(args, nameof(args)); env.CheckValue(input, nameof(input)); env.CheckValue(args.Column, nameof(args.Column)); - var cols = ArgumentsToColumnInfos(args); - return new PcaTransformer(env, input, cols).MakeDataTransform(input); - } - - internal static ColumnInfo[] ArgumentsToColumnInfos(Arguments args) - { - return args.Column.Select(item => new ColumnInfo( + var cols = args.Column.Select(item => new ColumnInfo( item.Source, item.Name, item.WeightColumn, @@ -325,6 +325,7 @@ internal static ColumnInfo[] ArgumentsToColumnInfos(Arguments args) item.Oversampling ?? args.Oversampling, item.Center ?? args.Center, item.Seed ?? args.Seed)).ToArray(); + return new PcaTransformer(env, input, cols).MakeDataTransform(input); } // Factory method for SignatureLoadModel. @@ -682,47 +683,63 @@ public sealed class PcaEstimator2 : IEstimator private readonly IHost _host; private readonly PcaTransformer.ColumnInfo[] _columns; - /// - /// Convinence constructor for simple one column case - /// - /// - /// The environment. - /// Input column to apply PCA on. - /// Output column. Null means is replaced. - /// The number of components in the PCA. - /// A delegate to apply all the advanced arguments to the algorithm. public PcaEstimator2(IHostEnvironment env, string inputColumn, string outputColumn = null, - int rank = PcaTransformer.Defaults.Rank, - Action advancedSettings = null) - : this(env, new[] { (inputColumn, outputColumn ?? inputColumn) }, rank, advancedSettings) + string weightColumn = PcaTransformer.Defaults.WeightColumn, int rank = PcaTransformer.Defaults.Rank, + int overSampling = PcaTransformer.Defaults.Oversampling, bool center = PcaTransformer.Defaults.Center, + int? seed = null) + : this(env, new PcaTransformer.ColumnInfo(inputColumn, outputColumn ?? 
inputColumn, weightColumn, rank, overSampling, center, seed)) { } - /// - /// The environment. - /// Pairs of columns to run the PCA on. - /// The number of components in the PCA. - /// A delegate to apply all the advanced arguments to the algorithm. - public PcaEstimator2(IHostEnvironment env, (string input, string output)[] columns, - int rank = PcaTransformer.Defaults.Rank, - Action advancedSettings = null) + public PcaEstimator2(IHostEnvironment env, params PcaTransformer.ColumnInfo[] columns) { Contracts.CheckValue(env, nameof(env)); _host = env.Register(nameof(PcaEstimator2)); - - foreach (var (input, output) in columns) - { - _host.CheckUserArg(Utils.Size(input) > 0, nameof(input)); - _host.CheckValue(output, nameof(output)); - } - - var args = new PcaTransformer.Arguments(); - args.Column = columns.Select(x => new PcaTransformer.Column { Source = x.input, Name = x.output }).ToArray(); - args.Rank = rank; - advancedSettings?.Invoke(args); - _columns = PcaTransformer.ArgumentsToColumnInfos(args); + _columns = columns; } + //TODO: move the dosctrings above + ///// + ///// Convinence constructor for simple one column case + ///// + ///// + ///// The environment. + ///// Input column to apply PCA on. + ///// Output column. Null means is replaced. + ///// The number of components in the PCA. + ///// A delegate to apply all the advanced arguments to the algorithm. + //public PcaEstimator2(IHostEnvironment env, string inputColumn, string outputColumn = null, + // int rank = PcaTransformer.Defaults.Rank, + // Action advancedSettings = null) + // : this(env, new[] { (inputColumn, outputColumn ?? inputColumn) }, rank, advancedSettings) + //{ + //} + + ///// + ///// The environment. + ///// Pairs of columns to run the PCA on. + ///// The number of components in the PCA. + ///// A delegate to apply all the advanced arguments to the algorithm. 
+ //public PcaEstimator2(IHostEnvironment env, (string input, string output)[] columns, + // int rank = PcaTransformer.Defaults.Rank, + // Action advancedSettings = null) + //{ + // Contracts.CheckValue(env, nameof(env)); + // _host = env.Register(nameof(PcaEstimator2)); + + // foreach (var (input, output) in columns) + // { + // _host.CheckUserArg(Utils.Size(input) > 0, nameof(input)); + // _host.CheckValue(output, nameof(output)); + // } + + // var args = new PcaTransformer.Arguments(); + // args.Column = columns.Select(x => new PcaTransformer.Column { Source = x.input, Name = x.output }).ToArray(); + // args.Rank = rank; + // advancedSettings?.Invoke(args); + // _columns = PcaTransformer.ArgumentsToColumnInfos(args); + //} + public PcaTransformer Fit(IDataView input) => new PcaTransformer(_host, input, _columns); public SchemaShape GetOutputSchema(SchemaShape inputSchema) @@ -753,8 +770,9 @@ private sealed class OutPipelineColumn : Vector { public readonly Vector Input; - public OutPipelineColumn(Vector input, int rank, Action advancedSettings) - : base(new Reconciler(null, rank, advancedSettings), input) + public OutPipelineColumn(Vector input, string weightColumn, int rank, + int overSampling, bool center, int? seed = null) + : base(new Reconciler(weightColumn, rank, overSampling, center, seed)) { Input = input; } @@ -762,13 +780,12 @@ public OutPipelineColumn(Vector input, int rank, Action _advancedSettings; + private readonly PcaTransformer.ColumnInfo _colInfo; - public Reconciler(PipelineColumn weightColumn, int rank, Action advancedSettings) + public Reconciler(string weightColumn, int rank, int overSampling, bool center, int? 
seed = null) { - _rank = rank; - _advancedSettings = advancedSettings; + _colInfo = new PcaTransformer.ColumnInfo( + null, null, weightColumn, rank, overSampling, center, seed); } public override IEstimator Reconcile(IHostEnvironment env, @@ -777,23 +794,28 @@ public override IEstimator Reconcile(IHostEnvironment env, IReadOnlyDictionary outputNames, IReadOnlyCollection usedNames) { + // Only one column is allowed. Contracts.Assert(toOutput.Length == 1); - - var pairs = new List<(string input, string output)>(); - foreach (var outCol in toOutput) - pairs.Add((inputNames[((OutPipelineColumn)outCol).Input], outputNames[outCol])); - - return new PcaEstimator2(env, pairs.ToArray(), _rank, _advancedSettings); + var outCol = (OutPipelineColumn)toOutput[0]; + var inputColName = inputNames[outCol.Input]; + var outputColName = outputNames[outCol]; + return new PcaEstimator2(env, inputColName, outputColName, + _colInfo.WeightColumn, _colInfo.Rank, _colInfo.Oversampling, + _colInfo.Center, _colInfo.Seed); } } - /// Replace current vector with its principal components. Can significantly reduce size of vector. - /// - /// The column to apply PCA to. - /// The number of components in the PCA. - /// A delegate to apply all the advanced arguments to the algorithm. + // TODO: fix docstrings + // /// Replace current vector with its principal components. Can significantly reduce size of vector. + // /// + // /// The column to apply PCA to. + // /// The number of components in the PCA. + // /// A delegate to apply all the advanced arguments to the algorithm. public static Vector ToPrincipalComponents(this Vector input, + string weightColumn = PcaTransformer.Defaults.WeightColumn, int rank = PcaTransformer.Defaults.Rank, - Action advancedSettings = null) => new OutPipelineColumn(input, rank, advancedSettings); + int overSampling = PcaTransformer.Defaults.Oversampling, + bool center = PcaTransformer.Defaults.Center, + int? 
seed = null) => new OutPipelineColumn(input, weightColumn, rank, overSampling, center, seed); } } diff --git a/test/Microsoft.ML.Tests/Transformers/PcaTests.cs b/test/Microsoft.ML.Tests/Transformers/PcaTests.cs index f661d3c0af..454f029130 100644 --- a/test/Microsoft.ML.Tests/Transformers/PcaTests.cs +++ b/test/Microsoft.ML.Tests/Transformers/PcaTests.cs @@ -34,9 +34,12 @@ public void PcaWorkout() separator: ';', hasHeader: true) .Read(new MultiFileSource(dataSource)); - var est = new PcaEstimator(env, "features", "pca", rank: 5, advancedSettings: s => { - s.Seed = 1; - }); + //var est = new PcaEstimator(env, "features", "pca", rank: 5, advancedSettings: s => + //{ + // s.Seed = 1; + //}); + + var est = new PcaEstimator2(env, "features", "pca", rank: 5, seed: 1); // The following call fails because of the following issue // https://github.com/dotnet/machinelearning/issues/969 From 2c3282f11ee70e4b54fc728cf244afae7771cbf0 Mon Sep 17 00:00:00 2001 From: Shahab Moradi Date: Fri, 19 Oct 2018 14:38:11 -0700 Subject: [PATCH 12/28] Fixed tests and added pigsty test --- src/Microsoft.ML.PCA/PcaTransformer.cs | 36 ++++++--- .../Transformers/PcaTests.cs | 73 +++++++++++++------ 2 files changed, 76 insertions(+), 33 deletions(-) diff --git a/src/Microsoft.ML.PCA/PcaTransformer.cs b/src/Microsoft.ML.PCA/PcaTransformer.cs index 6bc624bf03..0b33265869 100644 --- a/src/Microsoft.ML.PCA/PcaTransformer.cs +++ b/src/Microsoft.ML.PCA/PcaTransformer.cs @@ -263,11 +263,10 @@ public PcaTransformer(IHostEnvironment env, IDataView input, ColumnInfo[] column for (int i = 0; i < _numColumns; i++) { var col = columns[i]; - if (!input.Schema.TryGetColumnIndex(col.Input, out _inputColumnIndices[i])) - throw Host.ExceptSchemaMismatch(nameof(col.Input), "input", col.Input); + // Base class has checked existence of input columns + input.Schema.TryGetColumnIndex(col.Input, out _inputColumnIndices[i]); _inputColumnTypes[i] = input.Schema[_inputColumnIndices[i]].Type; - 
Host.Check(_inputColumnTypes[i].IsKnownSizeVector && _inputColumnTypes[i].VectorSize > 1, - "Pca transform can only be applied to columns with known dimensionality greater than 1"); + ValidatePcaInput(Host, col.Input, _inputColumnTypes[i]); _transformInfos[i] = new TransformInfo(col.Rank, _inputColumnTypes[i].ValueCount); Host.CheckUserArg(col.Oversampling >= 0, nameof(col.Oversampling), "Oversampling must be non-negative"); _weightColumnIndices[i] = -1; @@ -575,6 +574,24 @@ private ColumnType[] InitColumnTypes() protected override IRowMapper MakeRowMapper(ISchema schema) => new Mapper(this, Schema.Create(schema)); + protected override void CheckInputColumn(ISchema inputSchema, int col, int srcCol) + { + ValidatePcaInput(Host, inputSchema.GetColumnName(srcCol), inputSchema.GetColumnType(srcCol)); + } + + internal static void ValidatePcaInput(IHost host, string name, ColumnType type) + { + if (!type.IsVector) + throw host.Except($"Pca transform can only be applied to vector columns. Column ${name} is of type ${type}"); + + if (!(type.IsKnownSizeVector && type.VectorSize > 1)) + throw host.Except($"Pca transform can only be applied to vector columns. Column ${name} is of size ${type.VectorSize}"); + + var itemType = type.ItemType; + if (!itemType.IsNumber) + throw host.Except($"Pca transform can only be applied to vector of numeric items. Column ${name} contains type ${itemType}"); + } + private sealed class Mapper : MapperBase { private readonly ColumnType[] _outputColumnTypes; @@ -652,12 +669,6 @@ private static void TransformFeatures(IExceptionContext ectx, ref VBuffer dst = new VBuffer(transformInfo.Rank, values, dst.Indices); } - - //protected virtual void CheckInputColumn(ISchema inputSchema, int col, int srcCol) - //{ - // // By default, there are no extra checks. 
- //} - } [TlcModule.EntryPoint(Name = "Transforms.PcaCalculator2", @@ -750,7 +761,8 @@ public SchemaShape GetOutputSchema(SchemaShape inputSchema) { if (!inputSchema.TryFindColumn(colInfo.Input, out var col)) throw _host.ExceptSchemaMismatch(nameof(inputSchema), "input", colInfo.Input); - if (col.ItemType.RawKind != DataKind.R4 || col.Kind != SchemaShape.Column.VectorKind.Vector) + + if (!(col.Kind == SchemaShape.Column.VectorKind.Vector && col.ItemType.IsNumber)) throw _host.ExceptSchemaMismatch(nameof(inputSchema), "input", colInfo.Input); result[colInfo.Output] = new SchemaShape.Column(colInfo.Output, @@ -772,7 +784,7 @@ private sealed class OutPipelineColumn : Vector public OutPipelineColumn(Vector input, string weightColumn, int rank, int overSampling, bool center, int? seed = null) - : base(new Reconciler(weightColumn, rank, overSampling, center, seed)) + : base(new Reconciler(weightColumn, rank, overSampling, center, seed), input) { Input = input; } diff --git a/test/Microsoft.ML.Tests/Transformers/PcaTests.cs b/test/Microsoft.ML.Tests/Transformers/PcaTests.cs index 454f029130..87f7244566 100644 --- a/test/Microsoft.ML.Tests/Transformers/PcaTests.cs +++ b/test/Microsoft.ML.Tests/Transformers/PcaTests.cs @@ -5,7 +5,7 @@ using Microsoft.ML.Runtime.Data; using Microsoft.ML.Runtime.Data.IO; using Microsoft.ML.Runtime.RunTests; -using Microsoft.ML.Transforms; +using Microsoft.ML.StaticPipe; using System.IO; using Xunit; using Xunit.Abstractions; @@ -14,50 +14,81 @@ namespace Microsoft.ML.Tests.Transformers { public sealed class PcaTests : TestDataPipeBase { + private readonly ConsoleEnvironment _env; + private readonly string _dataSource; + private readonly TextSaver _saver; + public PcaTests(ITestOutputHelper helper) : base(helper) { + _env = new ConsoleEnvironment(seed: 1, conc: 1); + _dataSource = GetDataPath("generated_regression_dataset.csv"); + _saver = new TextSaver(_env, new TextSaver.Arguments { Silent = true, OutputHeader = false }); } [Fact] public 
void PcaWorkout() { - var env = new ConsoleEnvironment(seed: 1, conc: 1); - string dataSource = GetDataPath("generated_regression_dataset.csv"); - var data = TextLoader.CreateReader(env, + var data = TextLoader.CreateReader(_env, c => (label: c.LoadFloat(11), features: c.LoadFloat(0, 10)), separator: ';', hasHeader: true) - .Read(new MultiFileSource(dataSource)); + .Read(new MultiFileSource(_dataSource)); - var invalidData = TextLoader.CreateReader(env, + var invalidData = TextLoader.CreateReader(_env, c => (label: c.LoadFloat(11), features: c.LoadText(0, 10)), separator: ';', hasHeader: true) - .Read(new MultiFileSource(dataSource)); - - //var est = new PcaEstimator(env, "features", "pca", rank: 5, advancedSettings: s => - //{ - // s.Seed = 1; - //}); + .Read(new MultiFileSource(_dataSource)); - var est = new PcaEstimator2(env, "features", "pca", rank: 5, seed: 1); + var est = new PcaEstimator2(_env, "features", "pca", rank: 4, seed: 10); + TestEstimatorCore(est, data.AsDynamic, invalidInput: invalidData.AsDynamic); + Done(); + } - // The following call fails because of the following issue - // https://github.com/dotnet/machinelearning/issues/969 - // TestEstimatorCore(est, data.AsDynamic, invalidInput: invalidData.AsDynamic); + [Fact] + public void TestPcaEstimator() + { + var data = TextLoader.CreateReader(_env, + c => (label: c.LoadFloat(11), features: c.LoadFloat(0, 10)), + separator: ';', hasHeader: true) + .Read(new MultiFileSource(_dataSource)); + var est = new PcaEstimator2(_env, "features", "pca", rank: 5, seed: 1); var outputPath = GetOutputPath("PCA", "pca.tsv"); - using (var ch = env.Start("save")) + using (var ch = _env.Start("save")) { - var saver = new TextSaver(env, new TextSaver.Arguments { Silent = true, OutputHeader = false }); - IDataView savedData = TakeFilter.Create(env, est.Fit(data.AsDynamic).Transform(data.AsDynamic), 4); - savedData = new ChooseColumnsTransform(env, savedData, "pca"); + IDataView savedData = TakeFilter.Create(_env, 
est.Fit(data.AsDynamic).Transform(data.AsDynamic), 4); + savedData = new ChooseColumnsTransform(_env, savedData, "pca"); using (var fs = File.Create(outputPath)) - DataSaverUtils.SaveDataView(ch, saver, savedData, fs, keepHidden: true); + DataSaverUtils.SaveDataView(ch, _saver, savedData, fs, keepHidden: true); } CheckEquality("PCA", "pca.tsv"); Done(); } + + [Fact] + public void TestPcaPigsty() + { + var reader = TextLoader.CreateReader(_env, + c => (label: c.LoadFloat(11), features1: c.LoadFloat(0, 10)), + separator: ';', hasHeader: true); + var data = reader.Read(new MultiFileSource(_dataSource)); + var pipeline = reader.MakeNewEstimator() + .Append(r => (r.label, pca: r.features1.ToPrincipalComponents(rank: 5, seed: 1))); + + var outputPath = GetOutputPath("PCA", "pca.tsv"); + using (var ch = _env.Start("save")) + { + IDataView savedData = TakeFilter.Create(_env, pipeline.Fit(data).Transform(data).AsDynamic, 4); + savedData = new ChooseColumnsTransform(_env, savedData, "pca"); + + using (var fs = File.Create(outputPath)) + DataSaverUtils.SaveDataView(ch, _saver, savedData, fs, keepHidden: true); + } + + CheckEquality("PCA", "pca.tsv", digitsOfPrecision: 5); + Done(); + } } } From 99d82915888c70425170f3e1818ef73e23677141 Mon Sep 17 00:00:00 2001 From: Shahab Moradi Date: Fri, 19 Oct 2018 14:47:13 -0700 Subject: [PATCH 13/28] Deleted Wrapped PCA transform --- src/Microsoft.ML.PCA/PcaTransform.cs | 510 ++++++++--- src/Microsoft.ML.PCA/PcaTransformer.cs | 833 ------------------ src/Microsoft.ML.PCA/WrappedPcaTransform.cs | 116 --- .../Transformers/PcaTests.cs | 4 +- 4 files changed, 391 insertions(+), 1072 deletions(-) delete mode 100644 src/Microsoft.ML.PCA/PcaTransformer.cs delete mode 100644 src/Microsoft.ML.PCA/WrappedPcaTransform.cs diff --git a/src/Microsoft.ML.PCA/PcaTransform.cs b/src/Microsoft.ML.PCA/PcaTransform.cs index 30670cd96e..c8c1fef501 100644 --- a/src/Microsoft.ML.PCA/PcaTransform.cs +++ b/src/Microsoft.ML.PCA/PcaTransform.cs @@ -1,4 +1,4 @@ -// 
Licensed to the .NET Foundation under one or more agreements. +// Licensed to the .NET Foundation under one or more agreements. // The .NET Foundation licenses this file to you under the MIT license. // See the LICENSE file in the project root for more information. @@ -15,11 +15,21 @@ using Microsoft.ML.Runtime.Internal.Utilities; using Microsoft.ML.Runtime.Model; using Microsoft.ML.Runtime.Numeric; +using Microsoft.ML.Core.Data; +using Microsoft.ML.StaticPipe; +using Microsoft.ML.StaticPipe.Runtime; +using System.Collections.Generic; -[assembly: LoadableClass(PcaTransform.Summary, typeof(PcaTransform), typeof(PcaTransform.Arguments), typeof(SignatureDataTransform), +[assembly: LoadableClass(PcaTransform.Summary, typeof(IDataTransform), typeof(PcaTransform), typeof(PcaTransform.Arguments), typeof(SignatureDataTransform), PcaTransform.UserName, PcaTransform.LoaderSignature, PcaTransform.ShortName)] -[assembly: LoadableClass(PcaTransform.Summary, typeof(PcaTransform), null, typeof(SignatureLoadDataTransform), +[assembly: LoadableClass(PcaTransform.Summary, typeof(IDataTransform), typeof(PcaTransform), null, typeof(SignatureLoadDataTransform), + PcaTransform.UserName, PcaTransform.LoaderSignature)] + +[assembly: LoadableClass(PcaTransform.Summary, typeof(PcaTransform), null, typeof(SignatureLoadModel), + PcaTransform.UserName, PcaTransform.LoaderSignature)] + +[assembly: LoadableClass(typeof(IRowMapper), typeof(PcaTransform), null, typeof(SignatureLoadRowMapper), PcaTransform.UserName, PcaTransform.LoaderSignature)] [assembly: LoadableClass(typeof(void), typeof(PcaTransform), null, typeof(SignatureEntryPointModule), PcaTransform.LoaderSignature)] @@ -27,7 +37,7 @@ namespace Microsoft.ML.Runtime.Data { /// - public sealed class PcaTransform : OneToOneTransformBase + public sealed class PcaTransform : OneToOneTransformerBase { internal static class Defaults { @@ -98,6 +108,37 @@ public bool TryUnparse(StringBuilder sb) } } + public sealed class ColumnInfo + { + public 
readonly string Input; + public readonly string Output; + public readonly string WeightColumn; + public readonly int Rank; + public readonly int Oversampling; + public readonly bool Center; + public readonly int? Seed; + + /// + /// Describes how the transformer handles one column pair. + /// + public ColumnInfo(string input, + string output, + string weightColumn = PcaTransform.Defaults.WeightColumn, + int rank = PcaTransform.Defaults.Rank, + int overSampling = PcaTransform.Defaults.Oversampling, + bool center = PcaTransform.Defaults.Center, + int? seed = null) + { + Input = input; + Output = output; + WeightColumn = weightColumn; + Rank = rank; + Oversampling = overSampling; + Center = center; + Seed = seed; + } + } + private sealed class TransformInfo { public readonly int Dimension; @@ -106,14 +147,14 @@ private sealed class TransformInfo public Float[][] Eigenvectors; public Float[] MeanProjected; - public TransformInfo(Column item, Arguments args, int d) + public TransformInfo(int rank, int dim) { - Dimension = d; - Rank = item.Rank ?? args.Rank; - Contracts.CheckUserArg(0 < Rank && Rank <= Dimension, nameof(item.Rank), "Rank must be positive, and at most the dimension of untransformed data"); + Dimension = dim; + Rank = rank; + Contracts.CheckUserArg(0 < Rank && Rank <= Dimension, nameof(Rank), "Rank must be positive, and at most the dimension of untransformed data"); } - public TransformInfo(ModelLoadContext ctx, int colValueCount) + public TransformInfo(ModelLoadContext ctx) { Contracts.AssertValue(ctx); @@ -126,8 +167,6 @@ public TransformInfo(ModelLoadContext ctx, int colValueCount) // Float[]: MeanProjected Dimension = ctx.Reader.ReadInt32(); - Contracts.CheckDecode(Dimension == colValueCount); - Rank = ctx.Reader.ReadInt32(); Contracts.CheckDecode(0 < Rank && Rank <= Dimension); @@ -198,54 +237,55 @@ private static VersionInfo GetVersionInfo() } // These are parallel to Infos. 
- private readonly ColumnType[] _types; + private readonly ColumnType[] _outputColumnTypes; private readonly TransformInfo[] _transformInfos; - - private readonly int[] _oversampling; - private readonly bool[] _center; - private readonly int[] _weightColumnIndex; + private readonly int[] _weightColumnIndices; + private readonly int[] _inputColumnIndices; + private readonly ColumnType[] _inputColumnTypes; + private readonly int _numColumns; private const string RegistrationName = "Pca"; /// /// Public constructor corresponding to SignatureDataTransform. /// - public PcaTransform(IHostEnvironment env, Arguments args, IDataView input) - : base(env, RegistrationName, Contracts.CheckRef(args, nameof(args)).Column, - input, TestIsFloatItem) + public PcaTransform(IHostEnvironment env, IDataView input, ColumnInfo[] columns) + : base(Contracts.CheckRef(env, nameof(env)).Register(nameof(PcaTransform)), GetColumnPairs(columns)) { - Host.AssertNonEmpty(Infos); - Host.Assert(Infos.Length == Utils.Size(args.Column)); + Host.AssertNonEmpty(ColumnPairs); - _transformInfos = new TransformInfo[args.Column.Length]; - _oversampling = new int[args.Column.Length]; - _center = new bool[args.Column.Length]; - _weightColumnIndex = new int[args.Column.Length]; - for (int i = 0; i < _transformInfos.Length; i++) + _numColumns = ColumnPairs.Length; + _transformInfos = new TransformInfo[_numColumns]; + _weightColumnIndices = new int[_numColumns]; + _inputColumnIndices = new int[_numColumns]; + _inputColumnTypes = new ColumnType[_numColumns]; + + for (int i = 0; i < _numColumns; i++) { - Host.Check(Infos[i].TypeSrc.VectorSize > 1, "Pca transform can only be applied to columns with known dimensionality greater than 1"); - _transformInfos[i] = new TransformInfo(args.Column[i], args, Infos[i].TypeSrc.ValueCount); - _center[i] = args.Column[i].Center ?? args.Center; - _oversampling[i] = args.Column[i].Oversampling ?? 
args.Oversampling; - Host.CheckUserArg(_oversampling[i] >= 0, nameof(args.Oversampling), "Oversampling must be non-negative"); - _weightColumnIndex[i] = -1; - var weightColumn = args.Column[i].WeightColumn ?? args.WeightColumn; + var col = columns[i]; + // Base class has checked existence of input columns + input.Schema.TryGetColumnIndex(col.Input, out _inputColumnIndices[i]); + _inputColumnTypes[i] = input.Schema[_inputColumnIndices[i]].Type; + ValidatePcaInput(Host, col.Input, _inputColumnTypes[i]); + _transformInfos[i] = new TransformInfo(col.Rank, _inputColumnTypes[i].ValueCount); + Host.CheckUserArg(col.Oversampling >= 0, nameof(col.Oversampling), "Oversampling must be non-negative"); + _weightColumnIndices[i] = -1; + var weightColumn = col.WeightColumn; if (weightColumn != null) { - if (!Source.Schema.TryGetColumnIndex(weightColumn, out _weightColumnIndex[i])) + if (!input.Schema.TryGetColumnIndex(weightColumn, out _weightColumnIndices[i])) throw Host.Except("weight column '{0}' does not exist", weightColumn); - var type = Source.Schema.GetColumnType(_weightColumnIndex[i]); - Host.CheckUserArg(type == NumberType.Float, nameof(args.WeightColumn)); + var type = input.Schema.GetColumnType(_weightColumnIndices[i]); + Host.CheckUserArg(type == NumberType.Float, nameof(weightColumn)); } } - Train(args, _transformInfos, input); - - _types = InitColumnTypes(); + Train(columns, _transformInfos, input); + _outputColumnTypes = InitColumnTypes(); } - private PcaTransform(IHost host, ModelLoadContext ctx, IDataView input) - : base(host, ctx, input, TestIsFloatItem) + private PcaTransform(IHost host, ModelLoadContext ctx) + : base(host, ctx) { Host.AssertValue(ctx); @@ -253,27 +293,54 @@ private PcaTransform(IHost host, ModelLoadContext ctx, IDataView input) // // // transformInfos - Host.AssertNonEmpty(Infos); - _transformInfos = new TransformInfo[Infos.Length]; - for (int i = 0; i < Infos.Length; i++) - _transformInfos[i] = new TransformInfo(ctx, 
Infos[i].TypeSrc.ValueCount); - _types = InitColumnTypes(); + Host.AssertNonEmpty(ColumnPairs); + _numColumns = ColumnPairs.Length; + _transformInfos = new TransformInfo[_numColumns]; + for (int i = 0; i < _numColumns; i++) + _transformInfos[i] = new TransformInfo(ctx); + _outputColumnTypes = InitColumnTypes(); } - public static PcaTransform Create(IHostEnvironment env, ModelLoadContext ctx, IDataView input) + // Factory method for SignatureLoadDataTransform. + private static IDataTransform Create(IHostEnvironment env, ModelLoadContext ctx, IDataView input) + => Create(env, ctx).MakeDataTransform(input); + + // Factory method for SignatureLoadRowMapper. + private static IRowMapper Create(IHostEnvironment env, ModelLoadContext ctx, ISchema inputSchema) + => Create(env, ctx).MakeRowMapper(inputSchema); + + // Factory method for SignatureDataTransform. + private static IDataTransform Create(IHostEnvironment env, Arguments args, IDataView input) { Contracts.CheckValue(env, nameof(env)); - var h = env.Register(RegistrationName); - h.CheckValue(ctx, nameof(ctx)); - h.CheckValue(input, nameof(input)); - ctx.CheckAtModel(GetVersionInfo()); + env.CheckValue(args, nameof(args)); + env.CheckValue(input, nameof(input)); + env.CheckValue(args.Column, nameof(args.Column)); + var cols = args.Column.Select(item => new ColumnInfo( + item.Source, + item.Name, + item.WeightColumn, + item.Rank ?? args.Rank, + item.Oversampling ?? args.Oversampling, + item.Center ?? args.Center, + item.Seed ?? args.Seed)).ToArray(); + return new PcaTransform(env, input, cols).MakeDataTransform(input); + } - // *** Binary format *** - // int: sizeof(Float) - // - int cbFloat = ctx.Reader.ReadInt32(); - h.CheckDecode(cbFloat == sizeof(Float)); - return h.Apply("Loading Model", ch => new PcaTransform(h, ctx, input)); + // Factory method for SignatureLoadModel. 
+ private static PcaTransform Create(IHostEnvironment env, ModelLoadContext ctx) + { + Contracts.CheckValue(env, nameof(env)); + var host = env.Register(nameof(PcaTransform)); + + host.CheckValue(ctx, nameof(ctx)); + ctx.CheckAtModel(GetVersionInfo()); + if (ctx.Header.ModelVerWritten == 0x00010001) + { + int cbFloat = ctx.Reader.ReadInt32(); + env.CheckDecode(cbFloat == sizeof(float)); + } + return new PcaTransform(host, ctx); } public override void Save(ModelSaveContext ctx) @@ -287,23 +354,28 @@ public override void Save(ModelSaveContext ctx) // // transformInfos ctx.Writer.Write(sizeof(Float)); - SaveBase(ctx); + SaveColumns(ctx); for (int i = 0; i < _transformInfos.Length; i++) _transformInfos[i].Save(ctx); } - private void Train(Arguments args, TransformInfo[] transformInfos, IDataView trainingData) + private static (string input, string output)[] GetColumnPairs(ColumnInfo[] columns) { - var y = new Float[transformInfos.Length][][]; - var omega = new Float[transformInfos.Length][][]; - var mean = new Float[transformInfos.Length][]; + //Contracts.CheckValue(columns, nameof(columns)); + return columns.Select(x => (x.Input, x.Output)).ToArray(); + } - var oversampledRank = new int[transformInfos.Length]; + private void Train(ColumnInfo[] columns, TransformInfo[] transformInfos, IDataView trainingData) + { + var y = new Float[_numColumns][][]; + var omega = new Float[_numColumns][][]; + var mean = new Float[_numColumns][]; + var oversampledRank = new int[_numColumns]; var rnd = Host.Rand; Double totalMemoryUsageEstimate = 0; - for (int iinfo = 0; iinfo < transformInfos.Length; iinfo++) + for (int iinfo = 0; iinfo < _numColumns; iinfo++) { - oversampledRank[iinfo] = Math.Min(transformInfos[iinfo].Rank + _oversampling[iinfo], transformInfos[iinfo].Dimension); + oversampledRank[iinfo] = Math.Min(transformInfos[iinfo].Rank + columns[iinfo].Oversampling, transformInfos[iinfo].Dimension); //exact: (size of the 2 big matrices + other minor allocations) / (2^30) Double 
colMemoryUsageEstimate = 2.0 * transformInfos[iinfo].Dimension * oversampledRank[iinfo] * sizeof(Float) / 1e9; @@ -313,7 +385,7 @@ private void Train(Arguments args, TransformInfo[] transformInfos, IDataView tra using (var ch = Host.Start("Memory usage")) { ch.Info("Estimate memory usage for transforming column {1}: {0:G2} GB. If running out of memory, reduce rank and oversampling factor.", - colMemoryUsageEstimate, Infos[iinfo].Name); + colMemoryUsageEstimate, ColumnPairs[iinfo].input); } } @@ -329,7 +401,7 @@ private void Train(Arguments args, TransformInfo[] transformInfos, IDataView tra } } - if (_center[iinfo]) + if (columns[iinfo].Center) mean[iinfo] = new Float[transformInfos[iinfo].Dimension]; } if (totalMemoryUsageEstimate > 2) @@ -386,7 +458,7 @@ private void Train(Arguments args, TransformInfo[] transformInfos, IDataView tra //Note that the covariance matrix is not computed explicitly private void Project(IDataView trainingData, Float[][] mean, Float[][][] omega, Float[][][] y, TransformInfo[] transformInfos) { - Host.Assert(mean.Length == omega.Length && omega.Length == y.Length && y.Length == Infos.Length); + Host.Assert(mean.Length == omega.Length && omega.Length == y.Length && y.Length == _numColumns); for (int i = 0; i < omega.Length; i++) Contracts.Assert(omega[i].Length == y[i].Length); @@ -399,37 +471,37 @@ private void Project(IDataView trainingData, Float[][] mean, Float[][][] omega, bool[] center = Enumerable.Range(0, mean.Length).Select(i => mean[i] != null).ToArray(); - Double[] totalColWeight = new Double[Infos.Length]; + Double[] totalColWeight = new Double[_numColumns]; - bool[] activeColumns = new bool[Source.Schema.ColumnCount]; - for (int iinfo = 0; iinfo < Infos.Length; iinfo++) + bool[] activeColumns = new bool[trainingData.Schema.ColumnCount]; + for (int iinfo = 0; iinfo < _numColumns; iinfo++) { - activeColumns[Infos[iinfo].Source] = true; - if (_weightColumnIndex[iinfo] >= 0) - activeColumns[_weightColumnIndex[iinfo]] = true; + 
activeColumns[_inputColumnIndices[iinfo]] = true; + if (_weightColumnIndices[iinfo] >= 0) + activeColumns[_weightColumnIndices[iinfo]] = true; } + using (var cursor = trainingData.GetRowCursor(col => activeColumns[col])) { - var weightGetters = new ValueGetter[Infos.Length]; - var columnGetters = new ValueGetter>[Infos.Length]; - for (int iinfo = 0; iinfo < Infos.Length; iinfo++) + var weightGetters = new ValueGetter[_numColumns]; + var columnGetters = new ValueGetter>[_numColumns]; + for (int iinfo = 0; iinfo < _numColumns; iinfo++) { - if (_weightColumnIndex[iinfo] >= 0) - weightGetters[iinfo] = cursor.GetGetter(_weightColumnIndex[iinfo]); - columnGetters[iinfo] = cursor.GetGetter>(Infos[iinfo].Source); + if (_weightColumnIndices[iinfo] >= 0) + weightGetters[iinfo] = cursor.GetGetter(_weightColumnIndices[iinfo]); + columnGetters[iinfo] = cursor.GetGetter>(_inputColumnIndices[iinfo]); } var features = default(VBuffer); while (cursor.MoveNext()) { - for (int iinfo = 0; iinfo < Infos.Length; iinfo++) + for (int iinfo = 0; iinfo < _numColumns; iinfo++) { - Contracts.Check(Infos[iinfo].TypeSrc.IsVector && Infos[iinfo].TypeSrc.ItemType.IsNumber, + Contracts.Check(_inputColumnTypes[iinfo].IsVector && _inputColumnTypes[iinfo].ItemType.IsNumber, "PCA transform can only be performed on numeric columns of dimension > 1"); Float weight = 1; - if (weightGetters[iinfo] != null) - weightGetters[iinfo](ref weight); + weightGetters[iinfo]?.Invoke(ref weight); columnGetters[iinfo](ref features); if (FloatUtils.IsFinite(weight) && weight >= 0 && (features.Count == 0 || FloatUtils.IsFinite(features.Values, features.Count))) @@ -445,13 +517,13 @@ private void Project(IDataView trainingData, Float[][] mean, Float[][][] omega, } } - for (int iinfo = 0; iinfo < Infos.Length; iinfo++) + for (int iinfo = 0; iinfo < _numColumns; iinfo++) { if (totalColWeight[iinfo] <= 0) - throw Host.Except("Empty data in column '{0}'", Source.Schema.GetColumnName(Infos[iinfo].Source)); + throw 
Host.Except("Empty data in column '{0}'", ColumnPairs[iinfo].input); } - for (int iinfo = 0; iinfo < Infos.Length; iinfo++) + for (int iinfo = 0; iinfo < _numColumns; iinfo++) { var invn = (Float)(1 / totalColWeight[iinfo]); @@ -495,54 +567,108 @@ private Float[][] PostProcess(Float[][] y, Float[] sigma, Float[] z, int d, int private ColumnType[] InitColumnTypes() { - Host.Assert(Infos.Length == _transformInfos.Length); - var types = new ColumnType[Infos.Length]; - for (int i = 0; i < _transformInfos.Length; i++) - types[i] = new VectorType(NumberType.Float, _transformInfos[i].Rank); - Metadata.Seal(); + Host.Assert(ColumnPairs.Length == _transformInfos.Length); + var types = _transformInfos.Select(tInfo => new VectorType(NumberType.Float, tInfo.Rank)).ToArray(); return types; } - protected override ColumnType GetColumnTypeCore(int iinfo) + protected override IRowMapper MakeRowMapper(ISchema schema) => new Mapper(this, Schema.Create(schema)); + + protected override void CheckInputColumn(ISchema inputSchema, int col, int srcCol) { - Host.Check(0 <= iinfo & iinfo < Utils.Size(_types)); - return _types[iinfo]; + ValidatePcaInput(Host, inputSchema.GetColumnName(srcCol), inputSchema.GetColumnType(srcCol)); } - protected override Delegate GetGetterCore(IChannel ch, IRow input, int iinfo, out Action disposer) + internal static void ValidatePcaInput(IHost host, string name, ColumnType type) { - Host.AssertValueOrNull(ch); - Host.AssertValue(input); - Host.Assert(0 <= iinfo && iinfo < Infos.Length); - disposer = null; - - var getSrc = GetSrcGetter>(input, iinfo); - var src = default(VBuffer); - var trInfo = _transformInfos[iinfo]; - ValueGetter> del = - (ref VBuffer dst) => - { - getSrc(ref src); - TransformFeatures(Host, ref src, ref dst, trInfo); - }; - return del; + if (!type.IsVector) + throw host.Except($"Pca transform can only be applied to vector columns. 
Column ${name} is of type ${type}"); + + if (!(type.IsKnownSizeVector && type.VectorSize > 1)) + throw host.Except($"Pca transform can only be applied to vector columns. Column ${name} is of size ${type.VectorSize}"); + + var itemType = type.ItemType; + if (!itemType.IsNumber) + throw host.Except($"Pca transform can only be applied to vector of numeric items. Column ${name} contains type ${itemType}"); } - private static void TransformFeatures(IExceptionContext ectx, ref VBuffer src, ref VBuffer dst, TransformInfo transformInfo) + private sealed class Mapper : MapperBase { - ectx.Check(src.Length == transformInfo.Dimension); + private readonly ColumnType[] _outputColumnTypes; + // Todo: replace with ColMapNewToOld + private readonly ColumnType[] _inputColumnTypes; + private readonly int[] _inputColumnIndices; + private readonly PcaTransform _parent; + private readonly int _numColumns; + + public Mapper(PcaTransform parent, Schema inputSchema) + : base(parent.Host.Register(nameof(Mapper)), parent, inputSchema) + { + _parent = parent; + _numColumns = parent._numColumns; + _outputColumnTypes = parent.InitColumnTypes(); + _inputColumnTypes = new ColumnType[_numColumns]; + _inputColumnIndices = new int[_numColumns]; + for (int i = 0; i < _numColumns; i++) + { + var inputColName = _parent.ColumnPairs[i].input; + if (!inputSchema.TryGetColumnIndex(inputColName, out _inputColumnIndices[i])) + throw Host.ExceptSchemaMismatch(nameof(inputColName), "input", inputColName); + _inputColumnTypes[i] = inputSchema[_inputColumnIndices[i]].Type; + Host.Check(_inputColumnTypes[i].IsKnownSizeVector && _inputColumnTypes[i].VectorSize > 1, + "Pca transform can only be applied to columns with known dimensionality greater than 1"); + if (_inputColumnTypes[i].VectorSize != _parent._transformInfos[i].Dimension) + { + var msg = $"Dimension of column ${inputColName} is ${_inputColumnTypes[i].VectorSize}, which doesn't match the expected size ${_parent._transformInfos[i].Dimension}"; + throw 
Host.Except(msg); + } + } + // Ivan't comment: + //var getSrc = input.GetGetter>(ColMapNewToOld[iinfo]); + } - var values = dst.Values; - if (Utils.Size(values) < transformInfo.Rank) - values = new Float[transformInfo.Rank]; + public override Schema.Column[] GetOutputColumns() + { + var result = new Schema.Column[_numColumns]; + for (int i = 0; i < _numColumns; i++) + result[i] = new Schema.Column(_parent.ColumnPairs[i].output, _outputColumnTypes[i], null); + return result; + } - for (int i = 0; i < transformInfo.Rank; i++) + protected override Delegate MakeGetter(IRow input, int iinfo, out Action disposer) { - values[i] = VectorUtils.DotProductWithOffset(transformInfo.Eigenvectors[i], 0, ref src) - - (transformInfo.MeanProjected == null ? 0 : transformInfo.MeanProjected[i]); + Contracts.AssertValue(input); + Contracts.Assert(0 <= iinfo && iinfo < _numColumns); + disposer = null; + + var srcGetter = input.GetGetter>(_inputColumnIndices[iinfo]); + var src = default(VBuffer); + + ValueGetter> dstGetter = (ref VBuffer dst) => + { + srcGetter(ref src); + TransformFeatures(Host, ref src, ref dst, _parent._transformInfos[iinfo]); + }; + + return dstGetter; } - dst = new VBuffer(transformInfo.Rank, values, dst.Indices); + private static void TransformFeatures(IExceptionContext ectx, ref VBuffer src, ref VBuffer dst, TransformInfo transformInfo) + { + ectx.Check(src.Length == transformInfo.Dimension); + + var values = dst.Values; + if (Utils.Size(values) < transformInfo.Rank) + values = new Float[transformInfo.Rank]; + + for (int i = 0; i < transformInfo.Rank; i++) + { + values[i] = VectorUtils.DotProductWithOffset(transformInfo.Eigenvectors[i], 0, ref src) - + (transformInfo.MeanProjected == null ? 
0 : transformInfo.MeanProjected[i]); + } + + dst = new VBuffer(transformInfo.Rank, values, dst.Indices); + } } [TlcModule.EntryPoint(Name = "Transforms.PcaCalculator", @@ -554,7 +680,7 @@ private static void TransformFeatures(IExceptionContext ectx, ref VBuffer public static CommonOutputs.TransformOutput Calculate(IHostEnvironment env, Arguments input) { var h = EntryPointUtils.CheckArgsAndCreateHost(env, "Pca", input); - var view = new PcaTransform(h, input, input.Data); + var view = PcaTransform.Create(h, input, input.Data); return new CommonOutputs.TransformOutput() { Model = new TransformModel(h, view, input.Data), @@ -562,4 +688,146 @@ public static CommonOutputs.TransformOutput Calculate(IHostEnvironment env, Argu }; } } + + public sealed class PcaEstimator : IEstimator + { + private readonly IHost _host; + private readonly PcaTransform.ColumnInfo[] _columns; + + public PcaEstimator(IHostEnvironment env, string inputColumn, string outputColumn = null, + string weightColumn = PcaTransform.Defaults.WeightColumn, int rank = PcaTransform.Defaults.Rank, + int overSampling = PcaTransform.Defaults.Oversampling, bool center = PcaTransform.Defaults.Center, + int? seed = null) + : this(env, new PcaTransform.ColumnInfo(inputColumn, outputColumn ?? inputColumn, weightColumn, rank, overSampling, center, seed)) + { + } + + public PcaEstimator(IHostEnvironment env, params PcaTransform.ColumnInfo[] columns) + { + Contracts.CheckValue(env, nameof(env)); + _host = env.Register(nameof(PcaEstimator)); + _columns = columns; + } + + //TODO: move the dosctrings above + ///// + ///// Convinence constructor for simple one column case + ///// + ///// + ///// The environment. + ///// Input column to apply PCA on. + ///// Output column. Null means is replaced. + ///// The number of components in the PCA. + ///// A delegate to apply all the advanced arguments to the algorithm. 
+ //public PcaEstimator(IHostEnvironment env, string inputColumn, string outputColumn = null, + // int rank = PcaTransform.Defaults.Rank, + // Action advancedSettings = null) + // : this(env, new[] { (inputColumn, outputColumn ?? inputColumn) }, rank, advancedSettings) + //{ + //} + + ///// + ///// The environment. + ///// Pairs of columns to run the PCA on. + ///// The number of components in the PCA. + ///// A delegate to apply all the advanced arguments to the algorithm. + //public PcaEstimator(IHostEnvironment env, (string input, string output)[] columns, + // int rank = PcaTransform.Defaults.Rank, + // Action advancedSettings = null) + //{ + // Contracts.CheckValue(env, nameof(env)); + // _host = env.Register(nameof(PcaEstimator)); + + // foreach (var (input, output) in columns) + // { + // _host.CheckUserArg(Utils.Size(input) > 0, nameof(input)); + // _host.CheckValue(output, nameof(output)); + // } + + // var args = new PcaTransform.Arguments(); + // args.Column = columns.Select(x => new PcaTransform.Column { Source = x.input, Name = x.output }).ToArray(); + // args.Rank = rank; + // advancedSettings?.Invoke(args); + // _columns = PcaTransform.ArgumentsToColumnInfos(args); + //} + + public PcaTransform Fit(IDataView input) => new PcaTransform(_host, input, _columns); + + public SchemaShape GetOutputSchema(SchemaShape inputSchema) + { + _host.CheckValue(inputSchema, nameof(inputSchema)); + var result = inputSchema.Columns.ToDictionary(x => x.Name); + foreach (var colInfo in _columns) + { + if (!inputSchema.TryFindColumn(colInfo.Input, out var col)) + throw _host.ExceptSchemaMismatch(nameof(inputSchema), "input", colInfo.Input); + + if (!(col.Kind == SchemaShape.Column.VectorKind.Vector && col.ItemType.IsNumber)) + throw _host.ExceptSchemaMismatch(nameof(inputSchema), "input", colInfo.Input); + + result[colInfo.Output] = new SchemaShape.Column(colInfo.Output, + SchemaShape.Column.VectorKind.Vector, NumberType.R4, false); + } + + return new 
SchemaShape(result.Values); + } + } + + /// + /// Extensions for statically typed . + /// + public static class PcaEstimatorExtensions + { + private sealed class OutPipelineColumn : Vector + { + public readonly Vector Input; + + public OutPipelineColumn(Vector input, string weightColumn, int rank, + int overSampling, bool center, int? seed = null) + : base(new Reconciler(weightColumn, rank, overSampling, center, seed), input) + { + Input = input; + } + } + + private sealed class Reconciler : EstimatorReconciler + { + private readonly PcaTransform.ColumnInfo _colInfo; + + public Reconciler(string weightColumn, int rank, int overSampling, bool center, int? seed = null) + { + _colInfo = new PcaTransform.ColumnInfo( + null, null, weightColumn, rank, overSampling, center, seed); + } + + public override IEstimator Reconcile(IHostEnvironment env, + PipelineColumn[] toOutput, + IReadOnlyDictionary inputNames, + IReadOnlyDictionary outputNames, + IReadOnlyCollection usedNames) + { + // Only one column is allowed. + Contracts.Assert(toOutput.Length == 1); + var outCol = (OutPipelineColumn)toOutput[0]; + var inputColName = inputNames[outCol.Input]; + var outputColName = outputNames[outCol]; + return new PcaEstimator(env, inputColName, outputColName, + _colInfo.WeightColumn, _colInfo.Rank, _colInfo.Oversampling, + _colInfo.Center, _colInfo.Seed); + } + } + + // TODO: fix docstrings + // /// Replace current vector with its principal components. Can significantly reduce size of vector. + // /// + // /// The column to apply PCA to. + // /// The number of components in the PCA. + // /// A delegate to apply all the advanced arguments to the algorithm. + public static Vector ToPrincipalComponents(this Vector input, + string weightColumn = PcaTransform.Defaults.WeightColumn, + int rank = PcaTransform.Defaults.Rank, + int overSampling = PcaTransform.Defaults.Oversampling, + bool center = PcaTransform.Defaults.Center, + int? 
seed = null) => new OutPipelineColumn(input, weightColumn, rank, overSampling, center, seed); + } } diff --git a/src/Microsoft.ML.PCA/PcaTransformer.cs b/src/Microsoft.ML.PCA/PcaTransformer.cs deleted file mode 100644 index 0b33265869..0000000000 --- a/src/Microsoft.ML.PCA/PcaTransformer.cs +++ /dev/null @@ -1,833 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. -// See the LICENSE file in the project root for more information. - -using Float = System.Single; - -using System; -using System.Linq; -using System.Text; -using Microsoft.ML.Runtime; -using Microsoft.ML.Runtime.CommandLine; -using Microsoft.ML.Runtime.Data; -using Microsoft.ML.Runtime.EntryPoints; -using Microsoft.ML.Runtime.Internal.CpuMath; -using Microsoft.ML.Runtime.Internal.Utilities; -using Microsoft.ML.Runtime.Model; -using Microsoft.ML.Runtime.Numeric; -using Microsoft.ML.Core.Data; -using Microsoft.ML.StaticPipe; -using Microsoft.ML.StaticPipe.Runtime; -using System.Collections.Generic; - -[assembly: LoadableClass(PcaTransformer.Summary, typeof(IDataTransform), typeof(PcaTransformer), typeof(PcaTransformer.Arguments), typeof(SignatureDataTransform), - PcaTransformer.UserName, PcaTransformer.LoaderSignature, PcaTransformer.ShortName)] - -[assembly: LoadableClass(PcaTransformer.Summary, typeof(IDataTransform), typeof(PcaTransformer), null, typeof(SignatureLoadDataTransform), - PcaTransformer.UserName, PcaTransformer.LoaderSignature)] - -[assembly: LoadableClass(PcaTransformer.Summary, typeof(PcaTransformer), null, typeof(SignatureLoadModel), - PcaTransformer.UserName, PcaTransformer.LoaderSignature)] - -[assembly: LoadableClass(typeof(IRowMapper), typeof(PcaTransformer), null, typeof(SignatureLoadRowMapper), - PcaTransformer.UserName, PcaTransformer.LoaderSignature)] - -[assembly: LoadableClass(typeof(void), typeof(PcaTransformer), null, typeof(SignatureEntryPointModule), 
PcaTransformer.LoaderSignature)] - -namespace Microsoft.ML.Runtime.Data -{ - /// - public sealed class PcaTransformer : OneToOneTransformerBase - { - internal static class Defaults - { - public const string WeightColumn = null; - public const int Rank = 20; - public const int Oversampling = 20; - public const bool Center = true; - public const int Seed = 0; - } - - public sealed class Arguments : TransformInputBase - { - [Argument(ArgumentType.Multiple | ArgumentType.Required, HelpText = "New column definition(s) (optional form: name:src)", ShortName = "col", SortOrder = 1)] - public Column[] Column; - - [Argument(ArgumentType.Multiple, HelpText = "The name of the weight column", ShortName = "weight", Purpose = SpecialPurpose.ColumnName)] - public string WeightColumn = Defaults.WeightColumn; - - [Argument(ArgumentType.AtMostOnce, HelpText = "The number of components in the PCA", ShortName = "k")] - public int Rank = Defaults.Rank; - - [Argument(ArgumentType.AtMostOnce, HelpText = "Oversampling parameter for randomized PCA training", ShortName = "over")] - public int Oversampling = Defaults.Oversampling; - - [Argument(ArgumentType.AtMostOnce, HelpText = "If enabled, data is centered to be zero mean")] - public bool Center = Defaults.Center; - - [Argument(ArgumentType.AtMostOnce, HelpText = "The seed for random number generation")] - public int Seed = Defaults.Seed; - } - - public class Column : OneToOneColumn - { - [Argument(ArgumentType.Multiple, HelpText = "The name of the weight column", ShortName = "weight")] - public string WeightColumn; - - [Argument(ArgumentType.AtMostOnce, HelpText = "The number of components in the PCA", ShortName = "k")] - public int? Rank; - - [Argument(ArgumentType.AtMostOnce, HelpText = "Oversampling parameter for randomized PCA training", ShortName = "over")] - public int? Oversampling; - - [Argument(ArgumentType.AtMostOnce, HelpText = "If enabled, data is centered to be zero mean", ShortName = "center")] - public bool? 
Center; - - [Argument(ArgumentType.AtMostOnce, HelpText = "The seed for random number generation", ShortName = "seed")] - public int? Seed; - - public static Column Parse(string str) - { - Contracts.AssertNonEmpty(str); - - var res = new Column(); - if (res.TryParse(str)) - return res; - return null; - } - - public bool TryUnparse(StringBuilder sb) - { - Contracts.AssertValue(sb); - if (!string.IsNullOrEmpty(WeightColumn) || Rank != null || Oversampling != null || - Center != null || Seed != null) - { - return false; - } - return TryUnparseCore(sb); - } - } - - public sealed class ColumnInfo - { - public readonly string Input; - public readonly string Output; - public readonly string WeightColumn; - public readonly int Rank; - public readonly int Oversampling; - public readonly bool Center; - public readonly int? Seed; - - /// - /// Describes how the transformer handles one column pair. - /// - public ColumnInfo(string input, - string output, - string weightColumn = PcaTransformer.Defaults.WeightColumn, - int rank = PcaTransformer.Defaults.Rank, - int overSampling = PcaTransformer.Defaults.Oversampling, - bool center = PcaTransformer.Defaults.Center, - int? 
seed = null) - { - Input = input; - Output = output; - WeightColumn = weightColumn; - Rank = rank; - Oversampling = overSampling; - Center = center; - Seed = seed; - } - } - - private sealed class TransformInfo - { - public readonly int Dimension; - public readonly int Rank; - - public Float[][] Eigenvectors; - public Float[] MeanProjected; - - public TransformInfo(int rank, int dim) - { - Dimension = dim; - Rank = rank; - Contracts.CheckUserArg(0 < Rank && Rank <= Dimension, nameof(Rank), "Rank must be positive, and at most the dimension of untransformed data"); - } - - public TransformInfo(ModelLoadContext ctx) - { - Contracts.AssertValue(ctx); - - // *** Binary format *** - // int: Dimension - // int: Rank - // for i=0,..,Rank-1: - // Float[]: the i'th eigenvector - // int: the size of MeanProjected (0 if it is null) - // Float[]: MeanProjected - - Dimension = ctx.Reader.ReadInt32(); - Rank = ctx.Reader.ReadInt32(); - Contracts.CheckDecode(0 < Rank && Rank <= Dimension); - - Eigenvectors = new Float[Rank][]; - for (int i = 0; i < Rank; i++) - { - Eigenvectors[i] = ctx.Reader.ReadFloatArray(Dimension); - Contracts.CheckDecode(FloatUtils.IsFinite(Eigenvectors[i], Eigenvectors[i].Length)); - } - - MeanProjected = ctx.Reader.ReadFloatArray(); - Contracts.CheckDecode(MeanProjected == null || (MeanProjected.Length == Rank && FloatUtils.IsFinite(MeanProjected, MeanProjected.Length))); - } - - public void Save(ModelSaveContext ctx) - { - Contracts.AssertValue(ctx); - - // *** Binary format *** - // int: Dimension - // int: Rank - // for i=0,..,Rank-1: - // Float[]: the i'th eigenvector - // int: the size of MeanProjected (0 if it is null) - // Float[]: MeanProjected - - Contracts.Assert(0 < Rank && Rank <= Dimension); - ctx.Writer.Write(Dimension); - ctx.Writer.Write(Rank); - for (int i = 0; i < Rank; i++) - { - Contracts.Assert(FloatUtils.IsFinite(Eigenvectors[i], Eigenvectors[i].Length)); - ctx.Writer.WriteFloatsNoCount(Eigenvectors[i], Dimension); - } - 
Contracts.Assert(MeanProjected == null || (MeanProjected.Length == Rank && FloatUtils.IsFinite(MeanProjected, Rank))); - ctx.Writer.WriteFloatArray(MeanProjected); - } - - internal void ProjectMean(Float[] mean) - { - Contracts.AssertValue(Eigenvectors); - if (mean == null) - { - MeanProjected = null; - return; - } - - MeanProjected = new Float[Rank]; - for (var i = 0; i < Rank; ++i) - MeanProjected[i] = VectorUtils.DotProduct(Eigenvectors[i], mean); - } - } - - internal const string Summary = "PCA is a dimensionality-reduction transform which computes the projection of a numeric vector onto a low-rank subspace."; - internal const string UserName = "Principal Component Analysis Transform"; - internal const string ShortName = "Pca2"; - - public const string LoaderSignature = "PcaTransformer"; - private static VersionInfo GetVersionInfo() - { - return new VersionInfo( - modelSignature: "PCA FUN2", - verWrittenCur: 0x00010001, // Initial - verReadableCur: 0x00010001, - verWeCanReadBack: 0x00010001, - loaderSignature: LoaderSignature, - loaderAssemblyName: typeof(PcaTransformer).Assembly.FullName); - } - - // These are parallel to Infos. - private readonly ColumnType[] _outputColumnTypes; - private readonly TransformInfo[] _transformInfos; - private readonly int[] _weightColumnIndices; - private readonly int[] _inputColumnIndices; - private readonly ColumnType[] _inputColumnTypes; - private readonly int _numColumns; - - private const string RegistrationName = "Pca2"; - - /// - /// Public constructor corresponding to SignatureDataTransform. 
- /// - public PcaTransformer(IHostEnvironment env, IDataView input, ColumnInfo[] columns) - : base(Contracts.CheckRef(env, nameof(env)).Register(nameof(PcaTransformer)), GetColumnPairs(columns)) - { - Host.AssertNonEmpty(ColumnPairs); - - _numColumns = ColumnPairs.Length; - _transformInfos = new TransformInfo[_numColumns]; - _weightColumnIndices = new int[_numColumns]; - _inputColumnIndices = new int[_numColumns]; - _inputColumnTypes = new ColumnType[_numColumns]; - - for (int i = 0; i < _numColumns; i++) - { - var col = columns[i]; - // Base class has checked existence of input columns - input.Schema.TryGetColumnIndex(col.Input, out _inputColumnIndices[i]); - _inputColumnTypes[i] = input.Schema[_inputColumnIndices[i]].Type; - ValidatePcaInput(Host, col.Input, _inputColumnTypes[i]); - _transformInfos[i] = new TransformInfo(col.Rank, _inputColumnTypes[i].ValueCount); - Host.CheckUserArg(col.Oversampling >= 0, nameof(col.Oversampling), "Oversampling must be non-negative"); - _weightColumnIndices[i] = -1; - var weightColumn = col.WeightColumn; - if (weightColumn != null) - { - if (!input.Schema.TryGetColumnIndex(weightColumn, out _weightColumnIndices[i])) - throw Host.Except("weight column '{0}' does not exist", weightColumn); - var type = input.Schema.GetColumnType(_weightColumnIndices[i]); - Host.CheckUserArg(type == NumberType.Float, nameof(weightColumn)); - } - } - - Train(columns, _transformInfos, input); - _outputColumnTypes = InitColumnTypes(); - } - - private PcaTransformer(IHost host, ModelLoadContext ctx) - : base(host, ctx) - { - Host.AssertValue(ctx); - - // *** Binary format *** - // - // - // transformInfos - Host.AssertNonEmpty(ColumnPairs); - _numColumns = ColumnPairs.Length; - _transformInfos = new TransformInfo[_numColumns]; - for (int i = 0; i < _numColumns; i++) - _transformInfos[i] = new TransformInfo(ctx); - _outputColumnTypes = InitColumnTypes(); - } - - // Factory method for SignatureLoadDataTransform. 
- private static IDataTransform Create(IHostEnvironment env, ModelLoadContext ctx, IDataView input) - => Create(env, ctx).MakeDataTransform(input); - - // Factory method for SignatureLoadRowMapper. - private static IRowMapper Create(IHostEnvironment env, ModelLoadContext ctx, ISchema inputSchema) - => Create(env, ctx).MakeRowMapper(inputSchema); - - // Factory method for SignatureDataTransform. - private static IDataTransform Create(IHostEnvironment env, Arguments args, IDataView input) - { - Contracts.CheckValue(env, nameof(env)); - env.CheckValue(args, nameof(args)); - env.CheckValue(input, nameof(input)); - env.CheckValue(args.Column, nameof(args.Column)); - var cols = args.Column.Select(item => new ColumnInfo( - item.Source, - item.Name, - item.WeightColumn, - item.Rank ?? args.Rank, - item.Oversampling ?? args.Oversampling, - item.Center ?? args.Center, - item.Seed ?? args.Seed)).ToArray(); - return new PcaTransformer(env, input, cols).MakeDataTransform(input); - } - - // Factory method for SignatureLoadModel. 
- private static PcaTransformer Create(IHostEnvironment env, ModelLoadContext ctx) - { - Contracts.CheckValue(env, nameof(env)); - var host = env.Register(nameof(PcaTransformer)); - - host.CheckValue(ctx, nameof(ctx)); - ctx.CheckAtModel(GetVersionInfo()); - if (ctx.Header.ModelVerWritten == 0x00010001) - { - int cbFloat = ctx.Reader.ReadInt32(); - env.CheckDecode(cbFloat == sizeof(float)); - } - return new PcaTransformer(host, ctx); - } - - public override void Save(ModelSaveContext ctx) - { - Host.CheckValue(ctx, nameof(ctx)); - ctx.CheckAtModel(); - ctx.SetVersionInfo(GetVersionInfo()); - - // *** Binary format *** - // int: sizeof(Float) - // - // transformInfos - ctx.Writer.Write(sizeof(Float)); - SaveColumns(ctx); - for (int i = 0; i < _transformInfos.Length; i++) - _transformInfos[i].Save(ctx); - } - - private static (string input, string output)[] GetColumnPairs(ColumnInfo[] columns) - { - //Contracts.CheckValue(columns, nameof(columns)); - return columns.Select(x => (x.Input, x.Output)).ToArray(); - } - - private void Train(ColumnInfo[] columns, TransformInfo[] transformInfos, IDataView trainingData) - { - var y = new Float[_numColumns][][]; - var omega = new Float[_numColumns][][]; - var mean = new Float[_numColumns][]; - var oversampledRank = new int[_numColumns]; - var rnd = Host.Rand; - Double totalMemoryUsageEstimate = 0; - for (int iinfo = 0; iinfo < _numColumns; iinfo++) - { - oversampledRank[iinfo] = Math.Min(transformInfos[iinfo].Rank + columns[iinfo].Oversampling, transformInfos[iinfo].Dimension); - - //exact: (size of the 2 big matrices + other minor allocations) / (2^30) - Double colMemoryUsageEstimate = 2.0 * transformInfos[iinfo].Dimension * oversampledRank[iinfo] * sizeof(Float) / 1e9; - totalMemoryUsageEstimate += colMemoryUsageEstimate; - if (colMemoryUsageEstimate > 2) - { - using (var ch = Host.Start("Memory usage")) - { - ch.Info("Estimate memory usage for transforming column {1}: {0:G2} GB. 
If running out of memory, reduce rank and oversampling factor.", - colMemoryUsageEstimate, ColumnPairs[iinfo].input); - } - } - - y[iinfo] = new Float[oversampledRank[iinfo]][]; - omega[iinfo] = new Float[oversampledRank[iinfo]][]; - for (int i = 0; i < oversampledRank[iinfo]; i++) - { - y[iinfo][i] = new Float[transformInfos[iinfo].Dimension]; - omega[iinfo][i] = new Float[transformInfos[iinfo].Dimension]; - for (int j = 0; j < transformInfos[iinfo].Dimension; j++) - { - omega[iinfo][i][j] = (Float)Stats.SampleFromGaussian(rnd); - } - } - - if (columns[iinfo].Center) - mean[iinfo] = new Float[transformInfos[iinfo].Dimension]; - } - if (totalMemoryUsageEstimate > 2) - { - using (var ch = Host.Start("Memory usage")) - { - ch.Info("Estimate memory usage for all PCA transforms: {0:G2} GB. If running out of memory, reduce ranks and oversampling factors.", - totalMemoryUsageEstimate); - } - } - - Project(trainingData, mean, omega, y, transformInfos); - - for (int iinfo = 0; iinfo < transformInfos.Length; iinfo++) - { - //Orthonormalize Y in-place using stabilized Gram Schmidt algorithm - //Ref: https://en.wikipedia.org/wiki/Gram-Schmidt#Algorithm - for (var i = 0; i < oversampledRank[iinfo]; ++i) - { - var v = y[iinfo][i]; - VectorUtils.ScaleBy(v, 1 / VectorUtils.Norm(y[iinfo][i])); // normalize - - // Make the next vectors in the queue orthogonal to the orthonormalized vectors - for (var j = i + 1; j < oversampledRank[iinfo]; ++j) - VectorUtils.AddMult(v, y[iinfo][j], -VectorUtils.DotProduct(v, y[iinfo][j])); //subtract the projection of y[j] on v - } - } - var q = y; // q in QR decomposition - - var b = omega; // reuse the memory allocated by Omega - Project(trainingData, mean, q, b, transformInfos); - - for (int iinfo = 0; iinfo < transformInfos.Length; iinfo++) - { - //Compute B2 = B' * B - var b2 = new Float[oversampledRank[iinfo] * oversampledRank[iinfo]]; - for (var i = 0; i < oversampledRank[iinfo]; ++i) - { - for (var j = i; j < oversampledRank[iinfo]; ++j) - 
b2[i * oversampledRank[iinfo] + j] = b2[j * oversampledRank[iinfo] + i] = VectorUtils.DotProduct(b[iinfo][i], b[iinfo][j]); - } - - Float[] smallEigenvalues; // eigenvectors and eigenvalues of the small matrix B2. - Float[] smallEigenvectors; - - EigenUtils.EigenDecomposition(b2, out smallEigenvalues, out smallEigenvectors); - transformInfos[iinfo].Eigenvectors = PostProcess(b[iinfo], smallEigenvalues, smallEigenvectors, transformInfos[iinfo].Dimension, oversampledRank[iinfo]); - transformInfos[iinfo].ProjectMean(mean[iinfo]); - } - } - - //Project the covariance matrix A on to Omega: Y <- A * Omega - //A = X' * X / n, where X = data - mean - //Note that the covariance matrix is not computed explicitly - private void Project(IDataView trainingData, Float[][] mean, Float[][][] omega, Float[][][] y, TransformInfo[] transformInfos) - { - Host.Assert(mean.Length == omega.Length && omega.Length == y.Length && y.Length == _numColumns); - for (int i = 0; i < omega.Length; i++) - Contracts.Assert(omega[i].Length == y[i].Length); - - // set y to be all zeros - for (int iinfo = 0; iinfo < y.Length; iinfo++) - { - for (int i = 0; i < y[iinfo].Length; i++) - Array.Clear(y[iinfo][i], 0, y[iinfo][i].Length); - } - - bool[] center = Enumerable.Range(0, mean.Length).Select(i => mean[i] != null).ToArray(); - - Double[] totalColWeight = new Double[_numColumns]; - - bool[] activeColumns = new bool[trainingData.Schema.ColumnCount]; - for (int iinfo = 0; iinfo < _numColumns; iinfo++) - { - activeColumns[_inputColumnIndices[iinfo]] = true; - if (_weightColumnIndices[iinfo] >= 0) - activeColumns[_weightColumnIndices[iinfo]] = true; - } - - using (var cursor = trainingData.GetRowCursor(col => activeColumns[col])) - { - var weightGetters = new ValueGetter[_numColumns]; - var columnGetters = new ValueGetter>[_numColumns]; - for (int iinfo = 0; iinfo < _numColumns; iinfo++) - { - if (_weightColumnIndices[iinfo] >= 0) - weightGetters[iinfo] = cursor.GetGetter(_weightColumnIndices[iinfo]); - 
columnGetters[iinfo] = cursor.GetGetter>(_inputColumnIndices[iinfo]); - } - - var features = default(VBuffer); - while (cursor.MoveNext()) - { - for (int iinfo = 0; iinfo < _numColumns; iinfo++) - { - Contracts.Check(_inputColumnTypes[iinfo].IsVector && _inputColumnTypes[iinfo].ItemType.IsNumber, - "PCA transform can only be performed on numeric columns of dimension > 1"); - - Float weight = 1; - weightGetters[iinfo]?.Invoke(ref weight); - columnGetters[iinfo](ref features); - - if (FloatUtils.IsFinite(weight) && weight >= 0 && (features.Count == 0 || FloatUtils.IsFinite(features.Values, features.Count))) - { - totalColWeight[iinfo] += weight; - - if (center[iinfo]) - VectorUtils.AddMult(ref features, mean[iinfo], weight); - - for (int i = 0; i < omega[iinfo].Length; i++) - VectorUtils.AddMult(ref features, y[iinfo][i], weight * VectorUtils.DotProductWithOffset(omega[iinfo][i], 0, ref features)); - } - } - } - - for (int iinfo = 0; iinfo < _numColumns; iinfo++) - { - if (totalColWeight[iinfo] <= 0) - throw Host.Except("Empty data in column '{0}'", ColumnPairs[iinfo].input); - } - - for (int iinfo = 0; iinfo < _numColumns; iinfo++) - { - var invn = (Float)(1 / totalColWeight[iinfo]); - - for (var i = 0; i < omega[iinfo].Length; ++i) - VectorUtils.ScaleBy(y[iinfo][i], invn); - - if (center[iinfo]) - { - VectorUtils.ScaleBy(mean[iinfo], invn); - for (int i = 0; i < omega[iinfo].Length; i++) - VectorUtils.AddMult(mean[iinfo], y[iinfo][i], -VectorUtils.DotProduct(omega[iinfo][i], mean[iinfo])); - } - } - } - } - - //return Y * eigenvectors / eigenvalues - // REVIEW: improve - private Float[][] PostProcess(Float[][] y, Float[] sigma, Float[] z, int d, int k) - { - var pinv = new Float[k]; - var tmp = new Float[k]; - - for (int i = 0; i < k; i++) - pinv[i] = (Float)(1.0) / ((Float)(1e-6) + sigma[i]); - - for (int i = 0; i < d; i++) - { - for (int j = 0; j < k; j++) - { - tmp[j] = 0; - for (int l = 0; l < k; l++) - tmp[j] += y[l][i] * z[j * k + l]; - } - for (int j = 0; j 
< k; j++) - y[j][i] = pinv[j] * tmp[j]; - } - - return y; - } - - private ColumnType[] InitColumnTypes() - { - Host.Assert(ColumnPairs.Length == _transformInfos.Length); - var types = _transformInfos.Select(tInfo => new VectorType(NumberType.Float, tInfo.Rank)).ToArray(); - return types; - } - - protected override IRowMapper MakeRowMapper(ISchema schema) => new Mapper(this, Schema.Create(schema)); - - protected override void CheckInputColumn(ISchema inputSchema, int col, int srcCol) - { - ValidatePcaInput(Host, inputSchema.GetColumnName(srcCol), inputSchema.GetColumnType(srcCol)); - } - - internal static void ValidatePcaInput(IHost host, string name, ColumnType type) - { - if (!type.IsVector) - throw host.Except($"Pca transform can only be applied to vector columns. Column ${name} is of type ${type}"); - - if (!(type.IsKnownSizeVector && type.VectorSize > 1)) - throw host.Except($"Pca transform can only be applied to vector columns. Column ${name} is of size ${type.VectorSize}"); - - var itemType = type.ItemType; - if (!itemType.IsNumber) - throw host.Except($"Pca transform can only be applied to vector of numeric items. 
Column ${name} contains type ${itemType}"); - } - - private sealed class Mapper : MapperBase - { - private readonly ColumnType[] _outputColumnTypes; - // Todo: replace with ColMapNewToOld - private readonly ColumnType[] _inputColumnTypes; - private readonly int[] _inputColumnIndices; - private readonly PcaTransformer _parent; - private readonly int _numColumns; - - public Mapper(PcaTransformer parent, Schema inputSchema) - : base(parent.Host.Register(nameof(Mapper)), parent, inputSchema) - { - _parent = parent; - _numColumns = parent._numColumns; - _outputColumnTypes = parent.InitColumnTypes(); - _inputColumnTypes = new ColumnType[_numColumns]; - _inputColumnIndices = new int[_numColumns]; - for (int i = 0; i < _numColumns; i++) - { - var inputColName = _parent.ColumnPairs[i].input; - if (!inputSchema.TryGetColumnIndex(inputColName, out _inputColumnIndices[i])) - throw Host.ExceptSchemaMismatch(nameof(inputColName), "input", inputColName); - _inputColumnTypes[i] = inputSchema[_inputColumnIndices[i]].Type; - Host.Check(_inputColumnTypes[i].IsKnownSizeVector && _inputColumnTypes[i].VectorSize > 1, - "Pca transform can only be applied to columns with known dimensionality greater than 1"); - if (_inputColumnTypes[i].VectorSize != _parent._transformInfos[i].Dimension) - { - var msg = $"Dimension of column ${inputColName} is ${_inputColumnTypes[i].VectorSize}, which doesn't match the expected size ${_parent._transformInfos[i].Dimension}"; - throw Host.Except(msg); - } - } - // Ivan't comment: - //var getSrc = input.GetGetter>(ColMapNewToOld[iinfo]); - } - - public override Schema.Column[] GetOutputColumns() - { - var result = new Schema.Column[_numColumns]; - for (int i = 0; i < _numColumns; i++) - result[i] = new Schema.Column(_parent.ColumnPairs[i].output, _outputColumnTypes[i], null); - return result; - } - - protected override Delegate MakeGetter(IRow input, int iinfo, out Action disposer) - { - Contracts.AssertValue(input); - Contracts.Assert(0 <= iinfo && iinfo < 
_numColumns); - disposer = null; - - var srcGetter = input.GetGetter>(_inputColumnIndices[iinfo]); - var src = default(VBuffer); - - ValueGetter> dstGetter = (ref VBuffer dst) => - { - srcGetter(ref src); - TransformFeatures(Host, ref src, ref dst, _parent._transformInfos[iinfo]); - }; - - return dstGetter; - } - - private static void TransformFeatures(IExceptionContext ectx, ref VBuffer src, ref VBuffer dst, TransformInfo transformInfo) - { - ectx.Check(src.Length == transformInfo.Dimension); - - var values = dst.Values; - if (Utils.Size(values) < transformInfo.Rank) - values = new Float[transformInfo.Rank]; - - for (int i = 0; i < transformInfo.Rank; i++) - { - values[i] = VectorUtils.DotProductWithOffset(transformInfo.Eigenvectors[i], 0, ref src) - - (transformInfo.MeanProjected == null ? 0 : transformInfo.MeanProjected[i]); - } - - dst = new VBuffer(transformInfo.Rank, values, dst.Indices); - } - } - - [TlcModule.EntryPoint(Name = "Transforms.PcaCalculator2", - Desc = Summary, - UserName = UserName, - ShortName = ShortName, - XmlInclude = new[] { @"", - @""})] - public static CommonOutputs.TransformOutput Calculate(IHostEnvironment env, Arguments input) - { - var h = EntryPointUtils.CheckArgsAndCreateHost(env, "Pca", input); - var view = PcaTransformer.Create(h, input, input.Data); - return new CommonOutputs.TransformOutput() - { - Model = new TransformModel(h, view, input.Data), - OutputData = view - }; - } - } - - public sealed class PcaEstimator2 : IEstimator - { - private readonly IHost _host; - private readonly PcaTransformer.ColumnInfo[] _columns; - - public PcaEstimator2(IHostEnvironment env, string inputColumn, string outputColumn = null, - string weightColumn = PcaTransformer.Defaults.WeightColumn, int rank = PcaTransformer.Defaults.Rank, - int overSampling = PcaTransformer.Defaults.Oversampling, bool center = PcaTransformer.Defaults.Center, - int? seed = null) - : this(env, new PcaTransformer.ColumnInfo(inputColumn, outputColumn ?? 
inputColumn, weightColumn, rank, overSampling, center, seed)) - { - } - - public PcaEstimator2(IHostEnvironment env, params PcaTransformer.ColumnInfo[] columns) - { - Contracts.CheckValue(env, nameof(env)); - _host = env.Register(nameof(PcaEstimator2)); - _columns = columns; - } - - //TODO: move the dosctrings above - ///// - ///// Convinence constructor for simple one column case - ///// - ///// - ///// The environment. - ///// Input column to apply PCA on. - ///// Output column. Null means is replaced. - ///// The number of components in the PCA. - ///// A delegate to apply all the advanced arguments to the algorithm. - //public PcaEstimator2(IHostEnvironment env, string inputColumn, string outputColumn = null, - // int rank = PcaTransformer.Defaults.Rank, - // Action advancedSettings = null) - // : this(env, new[] { (inputColumn, outputColumn ?? inputColumn) }, rank, advancedSettings) - //{ - //} - - ///// - ///// The environment. - ///// Pairs of columns to run the PCA on. - ///// The number of components in the PCA. - ///// A delegate to apply all the advanced arguments to the algorithm. 
- //public PcaEstimator2(IHostEnvironment env, (string input, string output)[] columns, - // int rank = PcaTransformer.Defaults.Rank, - // Action advancedSettings = null) - //{ - // Contracts.CheckValue(env, nameof(env)); - // _host = env.Register(nameof(PcaEstimator2)); - - // foreach (var (input, output) in columns) - // { - // _host.CheckUserArg(Utils.Size(input) > 0, nameof(input)); - // _host.CheckValue(output, nameof(output)); - // } - - // var args = new PcaTransformer.Arguments(); - // args.Column = columns.Select(x => new PcaTransformer.Column { Source = x.input, Name = x.output }).ToArray(); - // args.Rank = rank; - // advancedSettings?.Invoke(args); - // _columns = PcaTransformer.ArgumentsToColumnInfos(args); - //} - - public PcaTransformer Fit(IDataView input) => new PcaTransformer(_host, input, _columns); - - public SchemaShape GetOutputSchema(SchemaShape inputSchema) - { - _host.CheckValue(inputSchema, nameof(inputSchema)); - var result = inputSchema.Columns.ToDictionary(x => x.Name); - foreach (var colInfo in _columns) - { - if (!inputSchema.TryFindColumn(colInfo.Input, out var col)) - throw _host.ExceptSchemaMismatch(nameof(inputSchema), "input", colInfo.Input); - - if (!(col.Kind == SchemaShape.Column.VectorKind.Vector && col.ItemType.IsNumber)) - throw _host.ExceptSchemaMismatch(nameof(inputSchema), "input", colInfo.Input); - - result[colInfo.Output] = new SchemaShape.Column(colInfo.Output, - SchemaShape.Column.VectorKind.Vector, NumberType.R4, false); - } - - return new SchemaShape(result.Values); - } - } - - /// - /// Extensions for statically typed . - /// - public static class PcaEstimatorExtensions2 - { - private sealed class OutPipelineColumn : Vector - { - public readonly Vector Input; - - public OutPipelineColumn(Vector input, string weightColumn, int rank, - int overSampling, bool center, int? 
seed = null) - : base(new Reconciler(weightColumn, rank, overSampling, center, seed), input) - { - Input = input; - } - } - - private sealed class Reconciler : EstimatorReconciler - { - private readonly PcaTransformer.ColumnInfo _colInfo; - - public Reconciler(string weightColumn, int rank, int overSampling, bool center, int? seed = null) - { - _colInfo = new PcaTransformer.ColumnInfo( - null, null, weightColumn, rank, overSampling, center, seed); - } - - public override IEstimator Reconcile(IHostEnvironment env, - PipelineColumn[] toOutput, - IReadOnlyDictionary inputNames, - IReadOnlyDictionary outputNames, - IReadOnlyCollection usedNames) - { - // Only one column is allowed. - Contracts.Assert(toOutput.Length == 1); - var outCol = (OutPipelineColumn)toOutput[0]; - var inputColName = inputNames[outCol.Input]; - var outputColName = outputNames[outCol]; - return new PcaEstimator2(env, inputColName, outputColName, - _colInfo.WeightColumn, _colInfo.Rank, _colInfo.Oversampling, - _colInfo.Center, _colInfo.Seed); - } - } - - // TODO: fix docstrings - // /// Replace current vector with its principal components. Can significantly reduce size of vector. - // /// - // /// The column to apply PCA to. - // /// The number of components in the PCA. - // /// A delegate to apply all the advanced arguments to the algorithm. - public static Vector ToPrincipalComponents(this Vector input, - string weightColumn = PcaTransformer.Defaults.WeightColumn, - int rank = PcaTransformer.Defaults.Rank, - int overSampling = PcaTransformer.Defaults.Oversampling, - bool center = PcaTransformer.Defaults.Center, - int? 
seed = null) => new OutPipelineColumn(input, weightColumn, rank, overSampling, center, seed); - } -} diff --git a/src/Microsoft.ML.PCA/WrappedPcaTransform.cs b/src/Microsoft.ML.PCA/WrappedPcaTransform.cs deleted file mode 100644 index 1f082e193e..0000000000 --- a/src/Microsoft.ML.PCA/WrappedPcaTransform.cs +++ /dev/null @@ -1,116 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. -// See the LICENSE file in the project root for more information. - -using Microsoft.ML.Core.Data; -using Microsoft.ML.Runtime.Internal.Utilities; -using Microsoft.ML.StaticPipe; -using Microsoft.ML.StaticPipe.Runtime; -using System; -using System.Collections.Generic; -using System.Linq; - -namespace Microsoft.ML.Runtime.Data -{ - /// - public sealed class PcaEstimator : TrainedWrapperEstimatorBase - { - private readonly PcaTransform.Arguments _args; - - /// - /// The environment. - /// Input column to apply PCA on. - /// Output column. Null means is replaced. - /// The number of components in the PCA. - /// A delegate to apply all the advanced arguments to the algorithm. - public PcaEstimator(IHostEnvironment env, - string inputColumn, - string outputColumn = null, - int rank = PcaTransform.Defaults.Rank, - Action advancedSettings = null) - : this(env, new[] { (inputColumn, outputColumn ?? inputColumn) }, rank, advancedSettings) - { - } - - /// - /// The environment. - /// Pairs of columns to run the PCA on. - /// The number of components in the PCA. - /// A delegate to apply all the advanced arguments to the algorithm. 
- public PcaEstimator(IHostEnvironment env, (string input, string output)[] columns, - int rank = PcaTransform.Defaults.Rank, - Action advancedSettings = null) - : base(Contracts.CheckRef(env, nameof(env)).Register(nameof(PcaEstimator))) - { - foreach (var (input, output) in columns) - { - Host.CheckUserArg(Utils.Size(input) > 0, nameof(input)); - Host.CheckValue(output, nameof(input)); - } - - _args = new PcaTransform.Arguments(); - _args.Column = columns.Select(x => new PcaTransform.Column { Source = x.input, Name = x.output }).ToArray(); - _args.Rank = rank; - - advancedSettings?.Invoke(_args); - } - - public override TransformWrapper Fit(IDataView input) - { - return new TransformWrapper(Host, new PcaTransform(Host, _args, input)); - } - } - - /// - /// Extensions for statically typed . - /// - public static class PcaEstimatorExtensions - { - private sealed class OutPipelineColumn : Vector - { - public readonly Vector Input; - - public OutPipelineColumn(Vector input, int rank, Action advancedSettings) - : base(new Reconciler(null, rank, advancedSettings), input) - { - Input = input; - } - } - - private sealed class Reconciler : EstimatorReconciler - { - private readonly int _rank; - private readonly Action _advancedSettings; - - public Reconciler(PipelineColumn weightColumn, int rank, Action advancedSettings) - { - _rank = rank; - _advancedSettings = advancedSettings; - } - - public override IEstimator Reconcile(IHostEnvironment env, - PipelineColumn[] toOutput, - IReadOnlyDictionary inputNames, - IReadOnlyDictionary outputNames, - IReadOnlyCollection usedNames) - { - Contracts.Assert(toOutput.Length == 1); - - var pairs = new List<(string input, string output)>(); - foreach (var outCol in toOutput) - pairs.Add((inputNames[((OutPipelineColumn)outCol).Input], outputNames[outCol])); - - return new PcaEstimator(env, pairs.ToArray(), _rank, _advancedSettings); - } - } - - /// Replace current vector with its principal components. 
Can significantly reduce size of vector. - /// - /// The column to apply PCA to. - /// The number of components in the PCA. - /// A delegate to apply all the advanced arguments to the algorithm. - public static Vector ToPrincipalComponents(this Vector input, - int rank = PcaTransform.Defaults.Rank, - Action advancedSettings = null) => new OutPipelineColumn(input, rank, advancedSettings); - } -} diff --git a/test/Microsoft.ML.Tests/Transformers/PcaTests.cs b/test/Microsoft.ML.Tests/Transformers/PcaTests.cs index 87f7244566..f0e86f46ba 100644 --- a/test/Microsoft.ML.Tests/Transformers/PcaTests.cs +++ b/test/Microsoft.ML.Tests/Transformers/PcaTests.cs @@ -39,7 +39,7 @@ public void PcaWorkout() separator: ';', hasHeader: true) .Read(new MultiFileSource(_dataSource)); - var est = new PcaEstimator2(_env, "features", "pca", rank: 4, seed: 10); + var est = new PcaEstimator(_env, "features", "pca", rank: 4, seed: 10); TestEstimatorCore(est, data.AsDynamic, invalidInput: invalidData.AsDynamic); Done(); } @@ -52,7 +52,7 @@ public void TestPcaEstimator() separator: ';', hasHeader: true) .Read(new MultiFileSource(_dataSource)); - var est = new PcaEstimator2(_env, "features", "pca", rank: 5, seed: 1); + var est = new PcaEstimator(_env, "features", "pca", rank: 5, seed: 1); var outputPath = GetOutputPath("PCA", "pca.tsv"); using (var ch = _env.Start("save")) { From 78da5940f327b535be0bd24c086626f3530d3b5d Mon Sep 17 00:00:00 2001 From: Shahab Moradi Date: Fri, 19 Oct 2018 14:58:06 -0700 Subject: [PATCH 14/28] Float -> float --- src/Microsoft.ML.PCA/PcaTransform.cs | 81 ++++++++++++++-------------- 1 file changed, 39 insertions(+), 42 deletions(-) diff --git a/src/Microsoft.ML.PCA/PcaTransform.cs b/src/Microsoft.ML.PCA/PcaTransform.cs index c8c1fef501..967d7badc0 100644 --- a/src/Microsoft.ML.PCA/PcaTransform.cs +++ b/src/Microsoft.ML.PCA/PcaTransform.cs @@ -2,8 +2,6 @@ // The .NET Foundation licenses this file to you under the MIT license. 
// See the LICENSE file in the project root for more information. -using Float = System.Single; - using System; using System.Linq; using System.Text; @@ -144,8 +142,8 @@ private sealed class TransformInfo public readonly int Dimension; public readonly int Rank; - public Float[][] Eigenvectors; - public Float[] MeanProjected; + public float[][] Eigenvectors; + public float[] MeanProjected; public TransformInfo(int rank, int dim) { @@ -162,15 +160,15 @@ public TransformInfo(ModelLoadContext ctx) // int: Dimension // int: Rank // for i=0,..,Rank-1: - // Float[]: the i'th eigenvector + // float[]: the i'th eigenvector // int: the size of MeanProjected (0 if it is null) - // Float[]: MeanProjected + // float[]: MeanProjected Dimension = ctx.Reader.ReadInt32(); Rank = ctx.Reader.ReadInt32(); Contracts.CheckDecode(0 < Rank && Rank <= Dimension); - Eigenvectors = new Float[Rank][]; + Eigenvectors = new float[Rank][]; for (int i = 0; i < Rank; i++) { Eigenvectors[i] = ctx.Reader.ReadFloatArray(Dimension); @@ -189,9 +187,9 @@ public void Save(ModelSaveContext ctx) // int: Dimension // int: Rank // for i=0,..,Rank-1: - // Float[]: the i'th eigenvector + // float[]: the i'th eigenvector // int: the size of MeanProjected (0 if it is null) - // Float[]: MeanProjected + // float[]: MeanProjected Contracts.Assert(0 < Rank && Rank <= Dimension); ctx.Writer.Write(Dimension); @@ -205,7 +203,7 @@ public void Save(ModelSaveContext ctx) ctx.Writer.WriteFloatArray(MeanProjected); } - internal void ProjectMean(Float[] mean) + internal void ProjectMean(float[] mean) { Contracts.AssertValue(Eigenvectors); if (mean == null) @@ -214,7 +212,7 @@ internal void ProjectMean(Float[] mean) return; } - MeanProjected = new Float[Rank]; + MeanProjected = new float[Rank]; for (var i = 0; i < Rank; ++i) MeanProjected[i] = VectorUtils.DotProduct(Eigenvectors[i], mean); } @@ -229,7 +227,8 @@ private static VersionInfo GetVersionInfo() { return new VersionInfo( modelSignature: "PCA FUNC", - verWrittenCur: 
0x00010001, // Initial + //verWrittenCur: 0x00010001, // Initial + verWrittenCur: 0x00010002, // Get rid of writing float size in model context verReadableCur: 0x00010001, verWeCanReadBack: 0x00010001, loaderSignature: LoaderSignature, @@ -350,10 +349,8 @@ public override void Save(ModelSaveContext ctx) ctx.SetVersionInfo(GetVersionInfo()); // *** Binary format *** - // int: sizeof(Float) // // transformInfos - ctx.Writer.Write(sizeof(Float)); SaveColumns(ctx); for (int i = 0; i < _transformInfos.Length; i++) _transformInfos[i].Save(ctx); @@ -367,9 +364,9 @@ private static (string input, string output)[] GetColumnPairs(ColumnInfo[] colum private void Train(ColumnInfo[] columns, TransformInfo[] transformInfos, IDataView trainingData) { - var y = new Float[_numColumns][][]; - var omega = new Float[_numColumns][][]; - var mean = new Float[_numColumns][]; + var y = new float[_numColumns][][]; + var omega = new float[_numColumns][][]; + var mean = new float[_numColumns][]; var oversampledRank = new int[_numColumns]; var rnd = Host.Rand; Double totalMemoryUsageEstimate = 0; @@ -378,7 +375,7 @@ private void Train(ColumnInfo[] columns, TransformInfo[] transformInfos, IDataVi oversampledRank[iinfo] = Math.Min(transformInfos[iinfo].Rank + columns[iinfo].Oversampling, transformInfos[iinfo].Dimension); //exact: (size of the 2 big matrices + other minor allocations) / (2^30) - Double colMemoryUsageEstimate = 2.0 * transformInfos[iinfo].Dimension * oversampledRank[iinfo] * sizeof(Float) / 1e9; + Double colMemoryUsageEstimate = 2.0 * transformInfos[iinfo].Dimension * oversampledRank[iinfo] * sizeof(float) / 1e9; totalMemoryUsageEstimate += colMemoryUsageEstimate; if (colMemoryUsageEstimate > 2) { @@ -389,20 +386,20 @@ private void Train(ColumnInfo[] columns, TransformInfo[] transformInfos, IDataVi } } - y[iinfo] = new Float[oversampledRank[iinfo]][]; - omega[iinfo] = new Float[oversampledRank[iinfo]][]; + y[iinfo] = new float[oversampledRank[iinfo]][]; + omega[iinfo] = new 
float[oversampledRank[iinfo]][]; for (int i = 0; i < oversampledRank[iinfo]; i++) { - y[iinfo][i] = new Float[transformInfos[iinfo].Dimension]; - omega[iinfo][i] = new Float[transformInfos[iinfo].Dimension]; + y[iinfo][i] = new float[transformInfos[iinfo].Dimension]; + omega[iinfo][i] = new float[transformInfos[iinfo].Dimension]; for (int j = 0; j < transformInfos[iinfo].Dimension; j++) { - omega[iinfo][i][j] = (Float)Stats.SampleFromGaussian(rnd); + omega[iinfo][i][j] = (float)Stats.SampleFromGaussian(rnd); } } if (columns[iinfo].Center) - mean[iinfo] = new Float[transformInfos[iinfo].Dimension]; + mean[iinfo] = new float[transformInfos[iinfo].Dimension]; } if (totalMemoryUsageEstimate > 2) { @@ -437,15 +434,15 @@ private void Train(ColumnInfo[] columns, TransformInfo[] transformInfos, IDataVi for (int iinfo = 0; iinfo < transformInfos.Length; iinfo++) { //Compute B2 = B' * B - var b2 = new Float[oversampledRank[iinfo] * oversampledRank[iinfo]]; + var b2 = new float[oversampledRank[iinfo] * oversampledRank[iinfo]]; for (var i = 0; i < oversampledRank[iinfo]; ++i) { for (var j = i; j < oversampledRank[iinfo]; ++j) b2[i * oversampledRank[iinfo] + j] = b2[j * oversampledRank[iinfo] + i] = VectorUtils.DotProduct(b[iinfo][i], b[iinfo][j]); } - Float[] smallEigenvalues; // eigenvectors and eigenvalues of the small matrix B2. - Float[] smallEigenvectors; + float[] smallEigenvalues; // eigenvectors and eigenvalues of the small matrix B2. 
+ float[] smallEigenvectors; EigenUtils.EigenDecomposition(b2, out smallEigenvalues, out smallEigenvectors); transformInfos[iinfo].Eigenvectors = PostProcess(b[iinfo], smallEigenvalues, smallEigenvectors, transformInfos[iinfo].Dimension, oversampledRank[iinfo]); @@ -456,7 +453,7 @@ private void Train(ColumnInfo[] columns, TransformInfo[] transformInfos, IDataVi //Project the covariance matrix A on to Omega: Y <- A * Omega //A = X' * X / n, where X = data - mean //Note that the covariance matrix is not computed explicitly - private void Project(IDataView trainingData, Float[][] mean, Float[][][] omega, Float[][][] y, TransformInfo[] transformInfos) + private void Project(IDataView trainingData, float[][] mean, float[][][] omega, float[][][] y, TransformInfo[] transformInfos) { Host.Assert(mean.Length == omega.Length && omega.Length == y.Length && y.Length == _numColumns); for (int i = 0; i < omega.Length; i++) @@ -483,16 +480,16 @@ private void Project(IDataView trainingData, Float[][] mean, Float[][][] omega, using (var cursor = trainingData.GetRowCursor(col => activeColumns[col])) { - var weightGetters = new ValueGetter[_numColumns]; - var columnGetters = new ValueGetter>[_numColumns]; + var weightGetters = new ValueGetter[_numColumns]; + var columnGetters = new ValueGetter>[_numColumns]; for (int iinfo = 0; iinfo < _numColumns; iinfo++) { if (_weightColumnIndices[iinfo] >= 0) - weightGetters[iinfo] = cursor.GetGetter(_weightColumnIndices[iinfo]); - columnGetters[iinfo] = cursor.GetGetter>(_inputColumnIndices[iinfo]); + weightGetters[iinfo] = cursor.GetGetter(_weightColumnIndices[iinfo]); + columnGetters[iinfo] = cursor.GetGetter>(_inputColumnIndices[iinfo]); } - var features = default(VBuffer); + var features = default(VBuffer); while (cursor.MoveNext()) { for (int iinfo = 0; iinfo < _numColumns; iinfo++) @@ -500,7 +497,7 @@ private void Project(IDataView trainingData, Float[][] mean, Float[][][] omega, Contracts.Check(_inputColumnTypes[iinfo].IsVector && 
_inputColumnTypes[iinfo].ItemType.IsNumber, "PCA transform can only be performed on numeric columns of dimension > 1"); - Float weight = 1; + float weight = 1; weightGetters[iinfo]?.Invoke(ref weight); columnGetters[iinfo](ref features); @@ -525,7 +522,7 @@ private void Project(IDataView trainingData, Float[][] mean, Float[][][] omega, for (int iinfo = 0; iinfo < _numColumns; iinfo++) { - var invn = (Float)(1 / totalColWeight[iinfo]); + var invn = (float)(1 / totalColWeight[iinfo]); for (var i = 0; i < omega[iinfo].Length; ++i) VectorUtils.ScaleBy(y[iinfo][i], invn); @@ -542,13 +539,13 @@ private void Project(IDataView trainingData, Float[][] mean, Float[][][] omega, //return Y * eigenvectors / eigenvalues // REVIEW: improve - private Float[][] PostProcess(Float[][] y, Float[] sigma, Float[] z, int d, int k) + private float[][] PostProcess(float[][] y, float[] sigma, float[] z, int d, int k) { - var pinv = new Float[k]; - var tmp = new Float[k]; + var pinv = new float[k]; + var tmp = new float[k]; for (int i = 0; i < k; i++) - pinv[i] = (Float)(1.0) / ((Float)(1e-6) + sigma[i]); + pinv[i] = (float)(1.0) / ((float)(1e-6) + sigma[i]); for (int i = 0; i < d; i++) { @@ -653,13 +650,13 @@ protected override Delegate MakeGetter(IRow input, int iinfo, out Action dispose return dstGetter; } - private static void TransformFeatures(IExceptionContext ectx, ref VBuffer src, ref VBuffer dst, TransformInfo transformInfo) + private static void TransformFeatures(IExceptionContext ectx, ref VBuffer src, ref VBuffer dst, TransformInfo transformInfo) { ectx.Check(src.Length == transformInfo.Dimension); var values = dst.Values; if (Utils.Size(values) < transformInfo.Rank) - values = new Float[transformInfo.Rank]; + values = new float[transformInfo.Rank]; for (int i = 0; i < transformInfo.Rank; i++) { @@ -667,7 +664,7 @@ private static void TransformFeatures(IExceptionContext ectx, ref VBuffer (transformInfo.MeanProjected == null ? 
0 : transformInfo.MeanProjected[i]); } - dst = new VBuffer(transformInfo.Rank, values, dst.Indices); + dst = new VBuffer(transformInfo.Rank, values, dst.Indices); } } From 220433da4231bb0536bd6e4df6f0acab4cab7712 Mon Sep 17 00:00:00 2001 From: Shahab Moradi Date: Fri, 19 Oct 2018 15:36:41 -0700 Subject: [PATCH 15/28] Cleaned docstrings --- src/Microsoft.ML.PCA/PcaTransform.cs | 70 +++++-------------- .../Transformers/PcaTests.cs | 1 - 2 files changed, 19 insertions(+), 52 deletions(-) diff --git a/src/Microsoft.ML.PCA/PcaTransform.cs b/src/Microsoft.ML.PCA/PcaTransform.cs index 967d7badc0..2f95cf8a75 100644 --- a/src/Microsoft.ML.PCA/PcaTransform.cs +++ b/src/Microsoft.ML.PCA/PcaTransform.cs @@ -691,6 +691,16 @@ public sealed class PcaEstimator : IEstimator private readonly IHost _host; private readonly PcaTransform.ColumnInfo[] _columns; + /// Convinence constructor for simple one column case. + /// + /// The environment. + /// Input column to apply PCA on. + /// Output column. Null means is replaced. + /// The name of the weight column. + /// The number of components in the PCA. + /// Oversampling parameter for randomized PCA training. + /// If enabled, data is centered to be zero mean. + /// The seed for random number generation public PcaEstimator(IHostEnvironment env, string inputColumn, string outputColumn = null, string weightColumn = PcaTransform.Defaults.WeightColumn, int rank = PcaTransform.Defaults.Rank, int overSampling = PcaTransform.Defaults.Oversampling, bool center = PcaTransform.Defaults.Center, @@ -706,48 +716,6 @@ public PcaEstimator(IHostEnvironment env, params PcaTransform.ColumnInfo[] colum _columns = columns; } - //TODO: move the dosctrings above - ///// - ///// Convinence constructor for simple one column case - ///// - ///// - ///// The environment. - ///// Input column to apply PCA on. - ///// Output column. Null means is replaced. - ///// The number of components in the PCA. 
- ///// A delegate to apply all the advanced arguments to the algorithm. - //public PcaEstimator(IHostEnvironment env, string inputColumn, string outputColumn = null, - // int rank = PcaTransform.Defaults.Rank, - // Action advancedSettings = null) - // : this(env, new[] { (inputColumn, outputColumn ?? inputColumn) }, rank, advancedSettings) - //{ - //} - - ///// - ///// The environment. - ///// Pairs of columns to run the PCA on. - ///// The number of components in the PCA. - ///// A delegate to apply all the advanced arguments to the algorithm. - //public PcaEstimator(IHostEnvironment env, (string input, string output)[] columns, - // int rank = PcaTransform.Defaults.Rank, - // Action advancedSettings = null) - //{ - // Contracts.CheckValue(env, nameof(env)); - // _host = env.Register(nameof(PcaEstimator)); - - // foreach (var (input, output) in columns) - // { - // _host.CheckUserArg(Utils.Size(input) > 0, nameof(input)); - // _host.CheckValue(output, nameof(output)); - // } - - // var args = new PcaTransform.Arguments(); - // args.Column = columns.Select(x => new PcaTransform.Column { Source = x.input, Name = x.output }).ToArray(); - // args.Rank = rank; - // advancedSettings?.Invoke(args); - // _columns = PcaTransform.ArgumentsToColumnInfos(args); - //} - public PcaTransform Fit(IDataView input) => new PcaTransform(_host, input, _columns); public SchemaShape GetOutputSchema(SchemaShape inputSchema) @@ -770,9 +738,6 @@ public SchemaShape GetOutputSchema(SchemaShape inputSchema) } } - /// - /// Extensions for statically typed . - /// public static class PcaEstimatorExtensions { private sealed class OutPipelineColumn : Vector @@ -814,12 +779,15 @@ public override IEstimator Reconcile(IHostEnvironment env, } } - // TODO: fix docstrings - // /// Replace current vector with its principal components. Can significantly reduce size of vector. - // /// - // /// The column to apply PCA to. - // /// The number of components in the PCA. 
- // /// A delegate to apply all the advanced arguments to the algorithm. + /// Compute the principal components of the input column. Can significantly reduce size of vector. + /// + /// The column to apply PCA to. + /// The name of the weight column. + /// The number of components in the PCA. + /// Oversampling parameter for randomized PCA training. + /// If enabled, data is centered to be zero mean. + /// The seed for random number generation + /// Vector containing the principal components. public static Vector ToPrincipalComponents(this Vector input, string weightColumn = PcaTransform.Defaults.WeightColumn, int rank = PcaTransform.Defaults.Rank, diff --git a/test/Microsoft.ML.Tests/Transformers/PcaTests.cs b/test/Microsoft.ML.Tests/Transformers/PcaTests.cs index f0e86f46ba..fdab26c05a 100644 --- a/test/Microsoft.ML.Tests/Transformers/PcaTests.cs +++ b/test/Microsoft.ML.Tests/Transformers/PcaTests.cs @@ -5,7 +5,6 @@ using Microsoft.ML.Runtime.Data; using Microsoft.ML.Runtime.Data.IO; using Microsoft.ML.Runtime.RunTests; -using Microsoft.ML.StaticPipe; using System.IO; using Xunit; using Xunit.Abstractions; From 1a7121a5c9094f07a49a1b520aec4cf0e970dd25 Mon Sep 17 00:00:00 2001 From: Shahab Moradi Date: Fri, 19 Oct 2018 15:57:13 -0700 Subject: [PATCH 16/28] Removed some unnecessary checks --- src/Microsoft.ML.PCA/PcaTransform.cs | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/Microsoft.ML.PCA/PcaTransform.cs b/src/Microsoft.ML.PCA/PcaTransform.cs index 2f95cf8a75..722e7d8b7f 100644 --- a/src/Microsoft.ML.PCA/PcaTransform.cs +++ b/src/Microsoft.ML.PCA/PcaTransform.cs @@ -609,11 +609,9 @@ public Mapper(PcaTransform parent, Schema inputSchema) for (int i = 0; i < _numColumns; i++) { var inputColName = _parent.ColumnPairs[i].input; - if (!inputSchema.TryGetColumnIndex(inputColName, out _inputColumnIndices[i])) - throw Host.ExceptSchemaMismatch(nameof(inputColName), "input", inputColName); + inputSchema.TryGetColumnIndex(inputColName, out 
_inputColumnIndices[i]); _inputColumnTypes[i] = inputSchema[_inputColumnIndices[i]].Type; - Host.Check(_inputColumnTypes[i].IsKnownSizeVector && _inputColumnTypes[i].VectorSize > 1, - "Pca transform can only be applied to columns with known dimensionality greater than 1"); + ValidatePcaInput(Host, inputColName, _inputColumnTypes[i]); if (_inputColumnTypes[i].VectorSize != _parent._transformInfos[i].Dimension) { var msg = $"Dimension of column ${inputColName} is ${_inputColumnTypes[i].VectorSize}, which doesn't match the expected size ${_parent._transformInfos[i].Dimension}"; From b986eb0df6d3d8ea96efcb2ff100cc6ea1cfc9b6 Mon Sep 17 00:00:00 2001 From: Shahab Moradi Date: Fri, 19 Oct 2018 16:21:20 -0700 Subject: [PATCH 17/28] Simplified unnecessary code --- src/Microsoft.ML.PCA/PcaTransform.cs | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/src/Microsoft.ML.PCA/PcaTransform.cs b/src/Microsoft.ML.PCA/PcaTransform.cs index 722e7d8b7f..01c6fe3058 100644 --- a/src/Microsoft.ML.PCA/PcaTransform.cs +++ b/src/Microsoft.ML.PCA/PcaTransform.cs @@ -121,10 +121,10 @@ public sealed class ColumnInfo /// public ColumnInfo(string input, string output, - string weightColumn = PcaTransform.Defaults.WeightColumn, - int rank = PcaTransform.Defaults.Rank, - int overSampling = PcaTransform.Defaults.Oversampling, - bool center = PcaTransform.Defaults.Center, + string weightColumn = Defaults.WeightColumn, + int rank = Defaults.Rank, + int overSampling = Defaults.Oversampling, + bool center = Defaults.Center, int? 
seed = null) { Input = input; @@ -358,7 +358,7 @@ public override void Save(ModelSaveContext ctx) private static (string input, string output)[] GetColumnPairs(ColumnInfo[] columns) { - //Contracts.CheckValue(columns, nameof(columns)); + Contracts.CheckValue(columns, nameof(columns)); return columns.Select(x => (x.Input, x.Output)).ToArray(); } @@ -592,9 +592,7 @@ internal static void ValidatePcaInput(IHost host, string name, ColumnType type) private sealed class Mapper : MapperBase { private readonly ColumnType[] _outputColumnTypes; - // Todo: replace with ColMapNewToOld private readonly ColumnType[] _inputColumnTypes; - private readonly int[] _inputColumnIndices; private readonly PcaTransform _parent; private readonly int _numColumns; @@ -605,12 +603,11 @@ public Mapper(PcaTransform parent, Schema inputSchema) _numColumns = parent._numColumns; _outputColumnTypes = parent.InitColumnTypes(); _inputColumnTypes = new ColumnType[_numColumns]; - _inputColumnIndices = new int[_numColumns]; for (int i = 0; i < _numColumns; i++) { var inputColName = _parent.ColumnPairs[i].input; - inputSchema.TryGetColumnIndex(inputColName, out _inputColumnIndices[i]); - _inputColumnTypes[i] = inputSchema[_inputColumnIndices[i]].Type; + var inputColIndex = ColMapNewToOld[i]; + _inputColumnTypes[i] = inputSchema[inputColIndex].Type; ValidatePcaInput(Host, inputColName, _inputColumnTypes[i]); if (_inputColumnTypes[i].VectorSize != _parent._transformInfos[i].Dimension) { @@ -636,7 +633,7 @@ protected override Delegate MakeGetter(IRow input, int iinfo, out Action dispose Contracts.Assert(0 <= iinfo && iinfo < _numColumns); disposer = null; - var srcGetter = input.GetGetter>(_inputColumnIndices[iinfo]); + var srcGetter = input.GetGetter>(ColMapNewToOld[iinfo]); var src = default(VBuffer); ValueGetter> dstGetter = (ref VBuffer dst) => From 51aa6879d0db8f3ba76e120610c229f327059948 Mon Sep 17 00:00:00 2001 From: Shahab Moradi Date: Sun, 21 Oct 2018 10:25:29 -0700 Subject: [PATCH 18/28] Moved some 
fields to ColumnInfo for simplifications --- src/Microsoft.ML.PCA/PcaTransform.cs | 100 ++++++++++++++++----------- 1 file changed, 60 insertions(+), 40 deletions(-) diff --git a/src/Microsoft.ML.PCA/PcaTransform.cs b/src/Microsoft.ML.PCA/PcaTransform.cs index 01c6fe3058..2d202d30a7 100644 --- a/src/Microsoft.ML.PCA/PcaTransform.cs +++ b/src/Microsoft.ML.PCA/PcaTransform.cs @@ -135,6 +135,42 @@ public ColumnInfo(string input, Center = center; Seed = seed; } + + // The following functions and properties are all internal and used for simplifying the + // Transformer and Mapper code. + + internal ColumnInfo((string input, string output) columnPair) + { + Input = columnPair.input; + Output = columnPair.output; + } + + internal void SetSchema(Schema schema) + { + _schema = schema; + } + + private Schema _schema; + + internal int InputIndex + { + get + { + Contracts.AssertValue(_schema); + // Column names are already checked by PcaTransform + _schema.TryGetColumnIndex(Input, out int index); + return index; + } + } + + internal ColumnType InputType + { + get + { + Contracts.AssertValue(_schema); + return _schema[Input].Type; + } + } } private sealed class TransformInfo @@ -145,6 +181,8 @@ private sealed class TransformInfo public float[][] Eigenvectors; public float[] MeanProjected; + internal ColumnType OutputType => new VectorType(NumberType.Float, Rank); + public TransformInfo(int rank, int dim) { Dimension = dim; @@ -235,13 +273,10 @@ private static VersionInfo GetVersionInfo() loaderAssemblyName: typeof(PcaTransform).Assembly.FullName); } - // These are parallel to Infos. 
- private readonly ColumnType[] _outputColumnTypes; + private readonly int _numColumns; + private readonly ColumnInfo[] _columns; private readonly TransformInfo[] _transformInfos; private readonly int[] _weightColumnIndices; - private readonly int[] _inputColumnIndices; - private readonly ColumnType[] _inputColumnTypes; - private readonly int _numColumns; private const string RegistrationName = "Pca"; @@ -254,20 +289,19 @@ public PcaTransform(IHostEnvironment env, IDataView input, ColumnInfo[] columns) Host.AssertNonEmpty(ColumnPairs); _numColumns = ColumnPairs.Length; + _columns = columns; _transformInfos = new TransformInfo[_numColumns]; _weightColumnIndices = new int[_numColumns]; - _inputColumnIndices = new int[_numColumns]; - _inputColumnTypes = new ColumnType[_numColumns]; for (int i = 0; i < _numColumns; i++) { var col = columns[i]; - // Base class has checked existence of input columns - input.Schema.TryGetColumnIndex(col.Input, out _inputColumnIndices[i]); - _inputColumnTypes[i] = input.Schema[_inputColumnIndices[i]].Type; - ValidatePcaInput(Host, col.Input, _inputColumnTypes[i]); - _transformInfos[i] = new TransformInfo(col.Rank, _inputColumnTypes[i].ValueCount); + col.SetSchema(input.Schema); + + ValidatePcaInput(Host, col.Input, col.InputType); Host.CheckUserArg(col.Oversampling >= 0, nameof(col.Oversampling), "Oversampling must be non-negative"); + + _transformInfos[i] = new TransformInfo(col.Rank, col.InputType.ValueCount); _weightColumnIndices[i] = -1; var weightColumn = col.WeightColumn; if (weightColumn != null) @@ -280,7 +314,6 @@ public PcaTransform(IHostEnvironment env, IDataView input, ColumnInfo[] columns) } Train(columns, _transformInfos, input); - _outputColumnTypes = InitColumnTypes(); } private PcaTransform(IHost host, ModelLoadContext ctx) @@ -294,10 +327,13 @@ private PcaTransform(IHost host, ModelLoadContext ctx) // transformInfos Host.AssertNonEmpty(ColumnPairs); _numColumns = ColumnPairs.Length; + _columns = new 
ColumnInfo[_numColumns]; _transformInfos = new TransformInfo[_numColumns]; for (int i = 0; i < _numColumns; i++) + { + _columns[i] = new ColumnInfo(ColumnPairs[i]); _transformInfos[i] = new TransformInfo(ctx); - _outputColumnTypes = InitColumnTypes(); + } } // Factory method for SignatureLoadDataTransform. @@ -355,7 +391,6 @@ public override void Save(ModelSaveContext ctx) for (int i = 0; i < _transformInfos.Length; i++) _transformInfos[i].Save(ctx); } - private static (string input, string output)[] GetColumnPairs(ColumnInfo[] columns) { Contracts.CheckValue(columns, nameof(columns)); @@ -473,7 +508,7 @@ private void Project(IDataView trainingData, float[][] mean, float[][][] omega, bool[] activeColumns = new bool[trainingData.Schema.ColumnCount]; for (int iinfo = 0; iinfo < _numColumns; iinfo++) { - activeColumns[_inputColumnIndices[iinfo]] = true; + activeColumns[_columns[iinfo].InputIndex] = true; if (_weightColumnIndices[iinfo] >= 0) activeColumns[_weightColumnIndices[iinfo]] = true; } @@ -486,7 +521,7 @@ private void Project(IDataView trainingData, float[][] mean, float[][][] omega, { if (_weightColumnIndices[iinfo] >= 0) weightGetters[iinfo] = cursor.GetGetter(_weightColumnIndices[iinfo]); - columnGetters[iinfo] = cursor.GetGetter>(_inputColumnIndices[iinfo]); + columnGetters[iinfo] = cursor.GetGetter>(_columns[iinfo].InputIndex); } var features = default(VBuffer); @@ -494,9 +529,6 @@ private void Project(IDataView trainingData, float[][] mean, float[][][] omega, { for (int iinfo = 0; iinfo < _numColumns; iinfo++) { - Contracts.Check(_inputColumnTypes[iinfo].IsVector && _inputColumnTypes[iinfo].ItemType.IsNumber, - "PCA transform can only be performed on numeric columns of dimension > 1"); - float weight = 1; weightGetters[iinfo]?.Invoke(ref weight); columnGetters[iinfo](ref features); @@ -562,13 +594,6 @@ private float[][] PostProcess(float[][] y, float[] sigma, float[] z, int d, int return y; } - private ColumnType[] InitColumnTypes() - { - 
Host.Assert(ColumnPairs.Length == _transformInfos.Length); - var types = _transformInfos.Select(tInfo => new VectorType(NumberType.Float, tInfo.Rank)).ToArray(); - return types; - } - protected override IRowMapper MakeRowMapper(ISchema schema) => new Mapper(this, Schema.Create(schema)); protected override void CheckInputColumn(ISchema inputSchema, int col, int srcCol) @@ -591,8 +616,7 @@ internal static void ValidatePcaInput(IHost host, string name, ColumnType type) private sealed class Mapper : MapperBase { - private readonly ColumnType[] _outputColumnTypes; - private readonly ColumnType[] _inputColumnTypes; + private readonly ColumnInfo[] _columns; private readonly PcaTransform _parent; private readonly int _numColumns; @@ -601,29 +625,25 @@ public Mapper(PcaTransform parent, Schema inputSchema) { _parent = parent; _numColumns = parent._numColumns; - _outputColumnTypes = parent.InitColumnTypes(); - _inputColumnTypes = new ColumnType[_numColumns]; + _columns = new ColumnInfo[_numColumns]; for (int i = 0; i < _numColumns; i++) { - var inputColName = _parent.ColumnPairs[i].input; - var inputColIndex = ColMapNewToOld[i]; - _inputColumnTypes[i] = inputSchema[inputColIndex].Type; - ValidatePcaInput(Host, inputColName, _inputColumnTypes[i]); - if (_inputColumnTypes[i].VectorSize != _parent._transformInfos[i].Dimension) + var col = _columns[i] = new ColumnInfo(_parent.ColumnPairs[i]); + col.SetSchema(inputSchema); + ValidatePcaInput(Host, col.Input, col.InputType); + if (col.InputType.VectorSize != _parent._transformInfos[i].Dimension) { - var msg = $"Dimension of column ${inputColName} is ${_inputColumnTypes[i].VectorSize}, which doesn't match the expected size ${_parent._transformInfos[i].Dimension}"; + var msg = $"Dimension of column ${col.Input} is ${col.InputType.VectorSize}, which doesn't match the expected size ${_parent._transformInfos[i].Dimension}"; throw Host.Except(msg); } } - // Ivan't comment: - //var getSrc = input.GetGetter>(ColMapNewToOld[iinfo]); } 
public override Schema.Column[] GetOutputColumns() { var result = new Schema.Column[_numColumns]; for (int i = 0; i < _numColumns; i++) - result[i] = new Schema.Column(_parent.ColumnPairs[i].output, _outputColumnTypes[i], null); + result[i] = new Schema.Column(_columns[i].Output, _parent._transformInfos[i].OutputType, null); return result; } From 9e7f5719b55e98d0af38a0e7e0cecfd2471ce437 Mon Sep 17 00:00:00 2001 From: Shahab Moradi Date: Sun, 21 Oct 2018 10:46:06 -0700 Subject: [PATCH 19/28] Simplified weight columns --- src/Microsoft.ML.PCA/PcaTransform.cs | 48 ++++++++++--------- .../Transformers/PcaTests.cs | 8 +++- 2 files changed, 32 insertions(+), 24 deletions(-) diff --git a/src/Microsoft.ML.PCA/PcaTransform.cs b/src/Microsoft.ML.PCA/PcaTransform.cs index 2d202d30a7..0a60bd4db8 100644 --- a/src/Microsoft.ML.PCA/PcaTransform.cs +++ b/src/Microsoft.ML.PCA/PcaTransform.cs @@ -171,6 +171,22 @@ internal ColumnType InputType return _schema[Input].Type; } } + + internal int WeightColumnIndex + { + get + { + Contracts.AssertValue(_schema); + var index = -1; + if (WeightColumn != null) + { + if (!_schema.TryGetColumnIndex(WeightColumn, out index)) + throw Contracts.Except("Weight column '{0}' does not exist.", WeightColumn); + Contracts.CheckUserArg(_schema[index].Type == NumberType.Float, nameof(WeightColumn)); + } + return index; + } + } } private sealed class TransformInfo @@ -276,7 +292,6 @@ private static VersionInfo GetVersionInfo() private readonly int _numColumns; private readonly ColumnInfo[] _columns; private readonly TransformInfo[] _transformInfos; - private readonly int[] _weightColumnIndices; private const string RegistrationName = "Pca"; @@ -288,29 +303,17 @@ public PcaTransform(IHostEnvironment env, IDataView input, ColumnInfo[] columns) { Host.AssertNonEmpty(ColumnPairs); - _numColumns = ColumnPairs.Length; _columns = columns; + _numColumns = columns.Length; _transformInfos = new TransformInfo[_numColumns]; - _weightColumnIndices = new 
int[_numColumns]; for (int i = 0; i < _numColumns; i++) { var col = columns[i]; col.SetSchema(input.Schema); - ValidatePcaInput(Host, col.Input, col.InputType); Host.CheckUserArg(col.Oversampling >= 0, nameof(col.Oversampling), "Oversampling must be non-negative"); - _transformInfos[i] = new TransformInfo(col.Rank, col.InputType.ValueCount); - _weightColumnIndices[i] = -1; - var weightColumn = col.WeightColumn; - if (weightColumn != null) - { - if (!input.Schema.TryGetColumnIndex(weightColumn, out _weightColumnIndices[i])) - throw Host.Except("weight column '{0}' does not exist", weightColumn); - var type = input.Schema.GetColumnType(_weightColumnIndices[i]); - Host.CheckUserArg(type == NumberType.Float, nameof(weightColumn)); - } } Train(columns, _transformInfos, input); @@ -506,11 +509,11 @@ private void Project(IDataView trainingData, float[][] mean, float[][][] omega, Double[] totalColWeight = new Double[_numColumns]; bool[] activeColumns = new bool[trainingData.Schema.ColumnCount]; - for (int iinfo = 0; iinfo < _numColumns; iinfo++) + foreach (var col in _columns) { - activeColumns[_columns[iinfo].InputIndex] = true; - if (_weightColumnIndices[iinfo] >= 0) - activeColumns[_weightColumnIndices[iinfo]] = true; + activeColumns[col.InputIndex] = true; + if (col.WeightColumnIndex >= 0) + activeColumns[col.WeightColumnIndex] = true; } using (var cursor = trainingData.GetRowCursor(col => activeColumns[col])) @@ -519,9 +522,10 @@ private void Project(IDataView trainingData, float[][] mean, float[][][] omega, var columnGetters = new ValueGetter>[_numColumns]; for (int iinfo = 0; iinfo < _numColumns; iinfo++) { - if (_weightColumnIndices[iinfo] >= 0) - weightGetters[iinfo] = cursor.GetGetter(_weightColumnIndices[iinfo]); - columnGetters[iinfo] = cursor.GetGetter>(_columns[iinfo].InputIndex); + var col = _columns[iinfo]; + if (col.WeightColumnIndex >= 0) + weightGetters[iinfo] = cursor.GetGetter(col.WeightColumnIndex); + columnGetters[iinfo] = 
cursor.GetGetter>(col.InputIndex); } var features = default(VBuffer); @@ -549,7 +553,7 @@ private void Project(IDataView trainingData, float[][] mean, float[][][] omega, for (int iinfo = 0; iinfo < _numColumns; iinfo++) { if (totalColWeight[iinfo] <= 0) - throw Host.Except("Empty data in column '{0}'", ColumnPairs[iinfo].input); + throw Host.Except("Empty data in column '{0}'", _columns[iinfo].Input); } for (int iinfo = 0; iinfo < _numColumns; iinfo++) diff --git a/test/Microsoft.ML.Tests/Transformers/PcaTests.cs b/test/Microsoft.ML.Tests/Transformers/PcaTests.cs index fdab26c05a..ede3350b59 100644 --- a/test/Microsoft.ML.Tests/Transformers/PcaTests.cs +++ b/test/Microsoft.ML.Tests/Transformers/PcaTests.cs @@ -29,17 +29,21 @@ public PcaTests(ITestOutputHelper helper) public void PcaWorkout() { var data = TextLoader.CreateReader(_env, - c => (label: c.LoadFloat(11), features: c.LoadFloat(0, 10)), + c => (label: c.LoadFloat(11), weight: c.LoadFloat(0), features: c.LoadFloat(1, 10)), separator: ';', hasHeader: true) .Read(new MultiFileSource(_dataSource)); var invalidData = TextLoader.CreateReader(_env, - c => (label: c.LoadFloat(11), features: c.LoadText(0, 10)), + c => (label: c.LoadFloat(11), weight: c.LoadFloat(0), features: c.LoadText(1, 10)), separator: ';', hasHeader: true) .Read(new MultiFileSource(_dataSource)); var est = new PcaEstimator(_env, "features", "pca", rank: 4, seed: 10); TestEstimatorCore(est, data.AsDynamic, invalidInput: invalidData.AsDynamic); + + var est_non_default_args = new PcaEstimator(_env, "features", "pca", rank: 3, weightColumn: "weight", overSampling: 2, center: false); + TestEstimatorCore(est_non_default_args, data.AsDynamic, invalidInput: invalidData.AsDynamic); + Done(); } From db8d690063d3ec7f028644c96e5dba3c0c4cba9f Mon Sep 17 00:00:00 2001 From: Shahab Moradi Date: Mon, 22 Oct 2018 14:21:11 -0700 Subject: [PATCH 20/28] Address PR comments #1 --- src/Microsoft.ML.PCA/PcaTransform.cs | 4 ++-- 
test/Microsoft.ML.Tests/Transformers/PcaTests.cs | 10 +++++----- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/Microsoft.ML.PCA/PcaTransform.cs b/src/Microsoft.ML.PCA/PcaTransform.cs index 0a60bd4db8..ed83cf1e13 100644 --- a/src/Microsoft.ML.PCA/PcaTransform.cs +++ b/src/Microsoft.ML.PCA/PcaTransform.cs @@ -3,8 +3,10 @@ // See the LICENSE file in the project root for more information. using System; +using System.Collections.Generic; using System.Linq; using System.Text; +using Microsoft.ML.Core.Data; using Microsoft.ML.Runtime; using Microsoft.ML.Runtime.CommandLine; using Microsoft.ML.Runtime.Data; @@ -13,10 +15,8 @@ using Microsoft.ML.Runtime.Internal.Utilities; using Microsoft.ML.Runtime.Model; using Microsoft.ML.Runtime.Numeric; -using Microsoft.ML.Core.Data; using Microsoft.ML.StaticPipe; using Microsoft.ML.StaticPipe.Runtime; -using System.Collections.Generic; [assembly: LoadableClass(PcaTransform.Summary, typeof(IDataTransform), typeof(PcaTransform), typeof(PcaTransform.Arguments), typeof(SignatureDataTransform), PcaTransform.UserName, PcaTransform.LoaderSignature, PcaTransform.ShortName)] diff --git a/test/Microsoft.ML.Tests/Transformers/PcaTests.cs b/test/Microsoft.ML.Tests/Transformers/PcaTests.cs index ede3350b59..8202a2558b 100644 --- a/test/Microsoft.ML.Tests/Transformers/PcaTests.cs +++ b/test/Microsoft.ML.Tests/Transformers/PcaTests.cs @@ -20,7 +20,7 @@ public sealed class PcaTests : TestDataPipeBase public PcaTests(ITestOutputHelper helper) : base(helper) { - _env = new ConsoleEnvironment(seed: 1, conc: 1); + _env = new ConsoleEnvironment(seed: 1); _dataSource = GetDataPath("generated_regression_dataset.csv"); _saver = new TextSaver(_env, new TextSaver.Arguments { Silent = true, OutputHeader = false }); } @@ -31,12 +31,12 @@ public void PcaWorkout() var data = TextLoader.CreateReader(_env, c => (label: c.LoadFloat(11), weight: c.LoadFloat(0), features: c.LoadFloat(1, 10)), separator: ';', hasHeader: true) - .Read(new 
MultiFileSource(_dataSource)); + .Read(_dataSource); var invalidData = TextLoader.CreateReader(_env, c => (label: c.LoadFloat(11), weight: c.LoadFloat(0), features: c.LoadText(1, 10)), separator: ';', hasHeader: true) - .Read(new MultiFileSource(_dataSource)); + .Read(_dataSource); var est = new PcaEstimator(_env, "features", "pca", rank: 4, seed: 10); TestEstimatorCore(est, data.AsDynamic, invalidInput: invalidData.AsDynamic); @@ -53,7 +53,7 @@ public void TestPcaEstimator() var data = TextLoader.CreateReader(_env, c => (label: c.LoadFloat(11), features: c.LoadFloat(0, 10)), separator: ';', hasHeader: true) - .Read(new MultiFileSource(_dataSource)); + .Read(_dataSource); var est = new PcaEstimator(_env, "features", "pca", rank: 5, seed: 1); var outputPath = GetOutputPath("PCA", "pca.tsv"); @@ -76,7 +76,7 @@ public void TestPcaPigsty() var reader = TextLoader.CreateReader(_env, c => (label: c.LoadFloat(11), features1: c.LoadFloat(0, 10)), separator: ';', hasHeader: true); - var data = reader.Read(new MultiFileSource(_dataSource)); + var data = reader.Read(_dataSource); var pipeline = reader.MakeNewEstimator() .Append(r => (r.label, pca: r.features1.ToPrincipalComponents(rank: 5, seed: 1))); From 1b18db56a489469fa6eed1bd68b2da3543f0d1f6 Mon Sep 17 00:00:00 2001 From: Shahab Moradi Date: Mon, 22 Oct 2018 14:51:06 -0700 Subject: [PATCH 21/28] Addressed PR comments #2 --- src/Microsoft.ML.PCA/PcaTransform.cs | 59 ++++++++++--------- .../Transformers/PcaTests.cs | 3 +- 2 files changed, 32 insertions(+), 30 deletions(-) diff --git a/src/Microsoft.ML.PCA/PcaTransform.cs b/src/Microsoft.ML.PCA/PcaTransform.cs index ed83cf1e13..e541714555 100644 --- a/src/Microsoft.ML.PCA/PcaTransform.cs +++ b/src/Microsoft.ML.PCA/PcaTransform.cs @@ -17,6 +17,7 @@ using Microsoft.ML.Runtime.Numeric; using Microsoft.ML.StaticPipe; using Microsoft.ML.StaticPipe.Runtime; +using Microsoft.ML.Transforms; [assembly: LoadableClass(PcaTransform.Summary, typeof(IDataTransform), typeof(PcaTransform), 
typeof(PcaTransform.Arguments), typeof(SignatureDataTransform), PcaTransform.UserName, PcaTransform.LoaderSignature, PcaTransform.ShortName)] @@ -32,39 +33,30 @@ [assembly: LoadableClass(typeof(void), typeof(PcaTransform), null, typeof(SignatureEntryPointModule), PcaTransform.LoaderSignature)] -namespace Microsoft.ML.Runtime.Data +namespace Microsoft.ML.Transforms { /// public sealed class PcaTransform : OneToOneTransformerBase { - internal static class Defaults - { - public const string WeightColumn = null; - public const int Rank = 20; - public const int Oversampling = 20; - public const bool Center = true; - public const int Seed = 0; - } - public sealed class Arguments : TransformInputBase { [Argument(ArgumentType.Multiple | ArgumentType.Required, HelpText = "New column definition(s) (optional form: name:src)", ShortName = "col", SortOrder = 1)] public Column[] Column; [Argument(ArgumentType.Multiple, HelpText = "The name of the weight column", ShortName = "weight", Purpose = SpecialPurpose.ColumnName)] - public string WeightColumn = Defaults.WeightColumn; + public string WeightColumn = PcaEstimator.Defaults.WeightColumn; [Argument(ArgumentType.AtMostOnce, HelpText = "The number of components in the PCA", ShortName = "k")] - public int Rank = Defaults.Rank; + public int Rank = PcaEstimator.Defaults.Rank; [Argument(ArgumentType.AtMostOnce, HelpText = "Oversampling parameter for randomized PCA training", ShortName = "over")] - public int Oversampling = Defaults.Oversampling; + public int Oversampling = PcaEstimator.Defaults.Oversampling; [Argument(ArgumentType.AtMostOnce, HelpText = "If enabled, data is centered to be zero mean")] - public bool Center = Defaults.Center; + public bool Center = PcaEstimator.Defaults.Center; [Argument(ArgumentType.AtMostOnce, HelpText = "The seed for random number generation")] - public int Seed = Defaults.Seed; + public int Seed = PcaEstimator.Defaults.Seed; } public class Column : OneToOneColumn @@ -121,10 +113,10 @@ public sealed 
class ColumnInfo /// public ColumnInfo(string input, string output, - string weightColumn = Defaults.WeightColumn, - int rank = Defaults.Rank, - int overSampling = Defaults.Oversampling, - bool center = Defaults.Center, + string weightColumn = PcaEstimator.Defaults.WeightColumn, + int rank = PcaEstimator.Defaults.Rank, + int overSampling = PcaEstimator.Defaults.Oversampling, + bool center = PcaEstimator.Defaults.Center, int? seed = null) { Input = input; @@ -134,6 +126,7 @@ public ColumnInfo(string input, Oversampling = overSampling; Center = center; Seed = seed; + Contracts.CheckUserArg(Oversampling >= 0, nameof(Oversampling), "Oversampling must be non-negative."); } // The following functions and properties are all internal and used for simplifying the @@ -312,7 +305,6 @@ public PcaTransform(IHostEnvironment env, IDataView input, ColumnInfo[] columns) var col = columns[i]; col.SetSchema(input.Schema); ValidatePcaInput(Host, col.Input, col.InputType); - Host.CheckUserArg(col.Oversampling >= 0, nameof(col.Oversampling), "Oversampling must be non-negative"); _transformInfos[i] = new TransformInfo(col.Rank, col.InputType.ValueCount); } @@ -614,8 +606,8 @@ internal static void ValidatePcaInput(IHost host, string name, ColumnType type) throw host.Except($"Pca transform can only be applied to vector columns. Column ${name} is of size ${type.VectorSize}"); var itemType = type.ItemType; - if (!itemType.IsNumber) - throw host.Except($"Pca transform can only be applied to vector of numeric items. Column ${name} contains type ${itemType}"); + if (itemType.RawKind != DataKind.R4) + throw host.Except($"Pca transform can only be applied to vector of float items. 
Column ${name} contains type ${itemType}"); } private sealed class Mapper : MapperBase @@ -707,6 +699,15 @@ public static CommonOutputs.TransformOutput Calculate(IHostEnvironment env, Argu public sealed class PcaEstimator : IEstimator { + internal static class Defaults + { + public const string WeightColumn = null; + public const int Rank = 20; + public const int Oversampling = 20; + public const bool Center = true; + public const int Seed = 0; + } + private readonly IHost _host; private readonly PcaTransform.ColumnInfo[] _columns; @@ -721,8 +722,8 @@ public sealed class PcaEstimator : IEstimator /// If enabled, data is centered to be zero mean. /// The seed for random number generation public PcaEstimator(IHostEnvironment env, string inputColumn, string outputColumn = null, - string weightColumn = PcaTransform.Defaults.WeightColumn, int rank = PcaTransform.Defaults.Rank, - int overSampling = PcaTransform.Defaults.Oversampling, bool center = PcaTransform.Defaults.Center, + string weightColumn = Defaults.WeightColumn, int rank = Defaults.Rank, + int overSampling = Defaults.Oversampling, bool center = Defaults.Center, int? seed = null) : this(env, new PcaTransform.ColumnInfo(inputColumn, outputColumn ?? inputColumn, weightColumn, rank, overSampling, center, seed)) { @@ -746,7 +747,7 @@ public SchemaShape GetOutputSchema(SchemaShape inputSchema) if (!inputSchema.TryFindColumn(colInfo.Input, out var col)) throw _host.ExceptSchemaMismatch(nameof(inputSchema), "input", colInfo.Input); - if (!(col.Kind == SchemaShape.Column.VectorKind.Vector && col.ItemType.IsNumber)) + if (col.Kind != SchemaShape.Column.VectorKind.Vector || col.ItemType.RawKind != DataKind.R4) throw _host.ExceptSchemaMismatch(nameof(inputSchema), "input", colInfo.Input); result[colInfo.Output] = new SchemaShape.Column(colInfo.Output, @@ -808,10 +809,10 @@ public override IEstimator Reconcile(IHostEnvironment env, /// The seed for random number generation /// Vector containing the principal components. 
public static Vector ToPrincipalComponents(this Vector input, - string weightColumn = PcaTransform.Defaults.WeightColumn, - int rank = PcaTransform.Defaults.Rank, - int overSampling = PcaTransform.Defaults.Oversampling, - bool center = PcaTransform.Defaults.Center, + string weightColumn = PcaEstimator.Defaults.WeightColumn, + int rank = PcaEstimator.Defaults.Rank, + int overSampling = PcaEstimator.Defaults.Oversampling, + bool center = PcaEstimator.Defaults.Center, int? seed = null) => new OutPipelineColumn(input, weightColumn, rank, overSampling, center, seed); } } diff --git a/test/Microsoft.ML.Tests/Transformers/PcaTests.cs b/test/Microsoft.ML.Tests/Transformers/PcaTests.cs index 8202a2558b..2cf2fbedd6 100644 --- a/test/Microsoft.ML.Tests/Transformers/PcaTests.cs +++ b/test/Microsoft.ML.Tests/Transformers/PcaTests.cs @@ -2,10 +2,11 @@ // The .NET Foundation licenses this file to you under the MIT license. // See the LICENSE file in the project root for more information. +using System.IO; using Microsoft.ML.Runtime.Data; using Microsoft.ML.Runtime.Data.IO; using Microsoft.ML.Runtime.RunTests; -using System.IO; +using Microsoft.ML.Transforms; using Xunit; using Xunit.Abstractions; From bb2ad55cbead2df0a2c6023862805a310697e27c Mon Sep 17 00:00:00 2001 From: Shahab Moradi Date: Mon, 22 Oct 2018 15:07:38 -0700 Subject: [PATCH 22/28] Moved the static test --- .../StaticPipeTests.cs | 20 +++++++++++++ .../Transformers/PcaTests.cs | 28 ++----------------- 2 files changed, 22 insertions(+), 26 deletions(-) diff --git a/test/Microsoft.ML.StaticPipelineTesting/StaticPipeTests.cs b/test/Microsoft.ML.StaticPipelineTesting/StaticPipeTests.cs index 6d4cae0995..7234173e9f 100644 --- a/test/Microsoft.ML.StaticPipelineTesting/StaticPipeTests.cs +++ b/test/Microsoft.ML.StaticPipelineTesting/StaticPipeTests.cs @@ -856,5 +856,25 @@ public void TextNormalizeStatic() type = schema.GetColumnType(numbers); Assert.True(!type.IsVector && type.ItemType.IsText); } + + [Fact] + public void 
TestPcaStatic() + { + var env = new ConsoleEnvironment(seed: 1); + var dataSource = GetDataPath("generated_regression_dataset.csv"); + var reader = TextLoader.CreateReader(env, + c => (label: c.LoadFloat(11), features: c.LoadFloat(0, 10)), + separator: ';', hasHeader: true); + var data = reader.Read(dataSource); + var est = reader.MakeNewEstimator() + .Append(r => (r.label, pca: r.features.ToPrincipalComponents(rank: 5))); + var tdata = est.Fit(data).Transform(data); + var schema = tdata.AsDynamic.Schema; + + Assert.True(schema.TryGetColumnIndex("pca", out int pca)); + var type = schema[pca].Type; + Assert.True(type.IsVector && type.ItemType.RawKind == DataKind.R4); + Assert.True(type.VectorSize == 5); + } } } \ No newline at end of file diff --git a/test/Microsoft.ML.Tests/Transformers/PcaTests.cs b/test/Microsoft.ML.Tests/Transformers/PcaTests.cs index 2cf2fbedd6..8f1089e0dc 100644 --- a/test/Microsoft.ML.Tests/Transformers/PcaTests.cs +++ b/test/Microsoft.ML.Tests/Transformers/PcaTests.cs @@ -42,8 +42,8 @@ public void PcaWorkout() var est = new PcaEstimator(_env, "features", "pca", rank: 4, seed: 10); TestEstimatorCore(est, data.AsDynamic, invalidInput: invalidData.AsDynamic); - var est_non_default_args = new PcaEstimator(_env, "features", "pca", rank: 3, weightColumn: "weight", overSampling: 2, center: false); - TestEstimatorCore(est_non_default_args, data.AsDynamic, invalidInput: invalidData.AsDynamic); + var estNonDefaultArgs = new PcaEstimator(_env, "features", "pca", rank: 3, weightColumn: "weight", overSampling: 2, center: false); + TestEstimatorCore(estNonDefaultArgs, data.AsDynamic, invalidInput: invalidData.AsDynamic); Done(); } @@ -70,29 +70,5 @@ public void TestPcaEstimator() CheckEquality("PCA", "pca.tsv"); Done(); } - - [Fact] - public void TestPcaPigsty() - { - var reader = TextLoader.CreateReader(_env, - c => (label: c.LoadFloat(11), features1: c.LoadFloat(0, 10)), - separator: ';', hasHeader: true); - var data = reader.Read(_dataSource); - var 
pipeline = reader.MakeNewEstimator() - .Append(r => (r.label, pca: r.features1.ToPrincipalComponents(rank: 5, seed: 1))); - - var outputPath = GetOutputPath("PCA", "pca.tsv"); - using (var ch = _env.Start("save")) - { - IDataView savedData = TakeFilter.Create(_env, pipeline.Fit(data).Transform(data).AsDynamic, 4); - savedData = new ChooseColumnsTransform(_env, savedData, "pca"); - - using (var fs = File.Create(outputPath)) - DataSaverUtils.SaveDataView(ch, _saver, savedData, fs, keepHidden: true); - } - - CheckEquality("PCA", "pca.tsv", digitsOfPrecision: 5); - Done(); - } } } From b729c701278e0467c05bccfbca4c0fd1d058cb95 Mon Sep 17 00:00:00 2001 From: Shahab Moradi Date: Mon, 22 Oct 2018 16:04:03 -0700 Subject: [PATCH 23/28] PR comments #3 --- src/Microsoft.ML.PCA/PcaTransform.cs | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/src/Microsoft.ML.PCA/PcaTransform.cs b/src/Microsoft.ML.PCA/PcaTransform.cs index e541714555..f0277ac86b 100644 --- a/src/Microsoft.ML.PCA/PcaTransform.cs +++ b/src/Microsoft.ML.PCA/PcaTransform.cs @@ -190,7 +190,7 @@ private sealed class TransformInfo public float[][] Eigenvectors; public float[] MeanProjected; - internal ColumnType OutputType => new VectorType(NumberType.Float, Rank); + public ColumnType OutputType => new VectorType(NumberType.Float, Rank); public TransformInfo(int rank, int dim) { @@ -250,7 +250,7 @@ public void Save(ModelSaveContext ctx) ctx.Writer.WriteFloatArray(MeanProjected); } - internal void ProjectMean(float[] mean) + public void ProjectMean(float[] mean) { Contracts.AssertValue(Eigenvectors); if (mean == null) @@ -275,7 +275,7 @@ private static VersionInfo GetVersionInfo() return new VersionInfo( modelSignature: "PCA FUNC", //verWrittenCur: 0x00010001, // Initial - verWrittenCur: 0x00010002, // Get rid of writing float size in model context + verWrittenCur: 0x00010002, // Got rid of writing float size in model context verReadableCur: 0x00010001, verWeCanReadBack: 0x00010001, 
loaderSignature: LoaderSignature, @@ -288,10 +288,7 @@ private static VersionInfo GetVersionInfo() private const string RegistrationName = "Pca"; - /// - /// Public constructor corresponding to SignatureDataTransform. - /// - public PcaTransform(IHostEnvironment env, IDataView input, ColumnInfo[] columns) + internal PcaTransform(IHostEnvironment env, IDataView input, ColumnInfo[] columns) : base(Contracts.CheckRef(env, nameof(env)).Register(nameof(PcaTransform)), GetColumnPairs(columns)) { Host.AssertNonEmpty(ColumnPairs); From a3febeb1b57a5976c6c22cc1402b530524fdcc82 Mon Sep 17 00:00:00 2001 From: Shahab Moradi Date: Tue, 23 Oct 2018 09:37:29 -0700 Subject: [PATCH 24/28] Moved schema related information out of ColumnInfo and into Mapper.ColumnSchemaInfo. --- src/Microsoft.ML.PCA/PcaTransform.cs | 144 ++++++++++++--------------- 1 file changed, 64 insertions(+), 80 deletions(-) diff --git a/src/Microsoft.ML.PCA/PcaTransform.cs b/src/Microsoft.ML.PCA/PcaTransform.cs index f0277ac86b..40b04cd20b 100644 --- a/src/Microsoft.ML.PCA/PcaTransform.cs +++ b/src/Microsoft.ML.PCA/PcaTransform.cs @@ -128,58 +128,6 @@ public ColumnInfo(string input, Seed = seed; Contracts.CheckUserArg(Oversampling >= 0, nameof(Oversampling), "Oversampling must be non-negative."); } - - // The following functions and properties are all internal and used for simplifying the - // Transformer and Mapper code. 
- - internal ColumnInfo((string input, string output) columnPair) - { - Input = columnPair.input; - Output = columnPair.output; - } - - internal void SetSchema(Schema schema) - { - _schema = schema; - } - - private Schema _schema; - - internal int InputIndex - { - get - { - Contracts.AssertValue(_schema); - // Column names are already checked by PcaTransform - _schema.TryGetColumnIndex(Input, out int index); - return index; - } - } - - internal ColumnType InputType - { - get - { - Contracts.AssertValue(_schema); - return _schema[Input].Type; - } - } - - internal int WeightColumnIndex - { - get - { - Contracts.AssertValue(_schema); - var index = -1; - if (WeightColumn != null) - { - if (!_schema.TryGetColumnIndex(WeightColumn, out index)) - throw Contracts.Except("Weight column '{0}' does not exist.", WeightColumn); - Contracts.CheckUserArg(_schema[index].Type == NumberType.Float, nameof(WeightColumn)); - } - return index; - } - } } private sealed class TransformInfo @@ -283,7 +231,7 @@ private static VersionInfo GetVersionInfo() } private readonly int _numColumns; - private readonly ColumnInfo[] _columns; + private readonly Mapper.ColumnSchemaInfo[] _schemaInfos; private readonly TransformInfo[] _transformInfos; private const string RegistrationName = "Pca"; @@ -292,17 +240,16 @@ internal PcaTransform(IHostEnvironment env, IDataView input, ColumnInfo[] column : base(Contracts.CheckRef(env, nameof(env)).Register(nameof(PcaTransform)), GetColumnPairs(columns)) { Host.AssertNonEmpty(ColumnPairs); - - _columns = columns; _numColumns = columns.Length; _transformInfos = new TransformInfo[_numColumns]; + _schemaInfos = new Mapper.ColumnSchemaInfo[_numColumns]; for (int i = 0; i < _numColumns; i++) { - var col = columns[i]; - col.SetSchema(input.Schema); - ValidatePcaInput(Host, col.Input, col.InputType); - _transformInfos[i] = new TransformInfo(col.Rank, col.InputType.ValueCount); + var colInfo = columns[i]; + var sInfo = _schemaInfos[i] = new 
Mapper.ColumnSchemaInfo(ColumnPairs[i], input.Schema, colInfo.WeightColumn); + ValidatePcaInput(Host, colInfo.Input, sInfo.InputType); + _transformInfos[i] = new TransformInfo(colInfo.Rank, sInfo.InputType.ValueCount); } Train(columns, _transformInfos, input); @@ -319,13 +266,9 @@ private PcaTransform(IHost host, ModelLoadContext ctx) // transformInfos Host.AssertNonEmpty(ColumnPairs); _numColumns = ColumnPairs.Length; - _columns = new ColumnInfo[_numColumns]; _transformInfos = new TransformInfo[_numColumns]; for (int i = 0; i < _numColumns; i++) - { - _columns[i] = new ColumnInfo(ColumnPairs[i]); _transformInfos[i] = new TransformInfo(ctx); - } } // Factory method for SignatureLoadDataTransform. @@ -498,11 +441,11 @@ private void Project(IDataView trainingData, float[][] mean, float[][][] omega, Double[] totalColWeight = new Double[_numColumns]; bool[] activeColumns = new bool[trainingData.Schema.ColumnCount]; - foreach (var col in _columns) + foreach (var sInfo in _schemaInfos) { - activeColumns[col.InputIndex] = true; - if (col.WeightColumnIndex >= 0) - activeColumns[col.WeightColumnIndex] = true; + activeColumns[sInfo.InputIndex] = true; + if (sInfo.WeightColumnIndex >= 0) + activeColumns[sInfo.WeightColumnIndex] = true; } using (var cursor = trainingData.GetRowCursor(col => activeColumns[col])) @@ -511,10 +454,10 @@ private void Project(IDataView trainingData, float[][] mean, float[][][] omega, var columnGetters = new ValueGetter>[_numColumns]; for (int iinfo = 0; iinfo < _numColumns; iinfo++) { - var col = _columns[iinfo]; - if (col.WeightColumnIndex >= 0) - weightGetters[iinfo] = cursor.GetGetter(col.WeightColumnIndex); - columnGetters[iinfo] = cursor.GetGetter>(col.InputIndex); + var sInfo = _schemaInfos[iinfo]; + if (sInfo.WeightColumnIndex >= 0) + weightGetters[iinfo] = cursor.GetGetter(sInfo.WeightColumnIndex); + columnGetters[iinfo] = cursor.GetGetter>(sInfo.InputIndex); } var features = default(VBuffer); @@ -542,7 +485,7 @@ private void 
Project(IDataView trainingData, float[][] mean, float[][][] omega, for (int iinfo = 0; iinfo < _numColumns; iinfo++) { if (totalColWeight[iinfo] <= 0) - throw Host.Except("Empty data in column '{0}'", _columns[iinfo].Input); + throw Host.Except("Empty data in column '{0}'", ColumnPairs[iinfo].input); } for (int iinfo = 0; iinfo < _numColumns; iinfo++) @@ -609,7 +552,49 @@ internal static void ValidatePcaInput(IHost host, string name, ColumnType type) private sealed class Mapper : MapperBase { - private readonly ColumnInfo[] _columns; + public sealed class ColumnSchemaInfo + { + private readonly string _input; + private readonly string _output; + private readonly string _weightColumn; + private readonly Schema _schema; + + public ColumnSchemaInfo((string input, string output) columnPair, Schema schema, string weightColumn = null) + { + _input = columnPair.input; + _output = columnPair.output; + _weightColumn = weightColumn; + _schema = schema; + } + + public int InputIndex + { + get + { + // Column names are already checked by PcaTransform + _schema.TryGetColumnIndex(_input, out int index); + return index; + } + } + + public ColumnType InputType => _schema[_input].Type; + + public int WeightColumnIndex + { + get + { + var index = -1; + if (_weightColumn != null) + { + if (!_schema.TryGetColumnIndex(_weightColumn, out index)) + throw Contracts.Except("Weight column '{0}' does not exist.", _weightColumn); + Contracts.CheckUserArg(_schema[index].Type == NumberType.Float, nameof(_weightColumn)); + } + return index; + } + } + } + private readonly PcaTransform _parent; private readonly int _numColumns; @@ -618,15 +603,14 @@ public Mapper(PcaTransform parent, Schema inputSchema) { _parent = parent; _numColumns = parent._numColumns; - _columns = new ColumnInfo[_numColumns]; for (int i = 0; i < _numColumns; i++) { - var col = _columns[i] = new ColumnInfo(_parent.ColumnPairs[i]); - col.SetSchema(inputSchema); - ValidatePcaInput(Host, col.Input, col.InputType); - if 
(col.InputType.VectorSize != _parent._transformInfos[i].Dimension) + var colPair = _parent.ColumnPairs[i]; + var colSchemaInfo = new ColumnSchemaInfo(colPair, inputSchema); + ValidatePcaInput(Host, colPair.input, colSchemaInfo.InputType); + if (colSchemaInfo.InputType.VectorSize != _parent._transformInfos[i].Dimension) { - var msg = $"Dimension of column ${col.Input} is ${col.InputType.VectorSize}, which doesn't match the expected size ${_parent._transformInfos[i].Dimension}"; + var msg = $"Dimension of column ${colPair.input} is ${colSchemaInfo.InputType.VectorSize}, which doesn't match the expected size ${_parent._transformInfos[i].Dimension}"; throw Host.Except(msg); } } @@ -636,7 +620,7 @@ public override Schema.Column[] GetOutputColumns() { var result = new Schema.Column[_numColumns]; for (int i = 0; i < _numColumns; i++) - result[i] = new Schema.Column(_columns[i].Output, _parent._transformInfos[i].OutputType, null); + result[i] = new Schema.Column(_parent.ColumnPairs[i].output, _parent._transformInfos[i].OutputType, null); return result; } From 9c97e3b425cd58de3f0dfe03a31653e5b36b218a Mon Sep 17 00:00:00 2001 From: Shahab Moradi Date: Tue, 23 Oct 2018 19:56:59 -0700 Subject: [PATCH 25/28] PR comments --- src/Microsoft.ML.PCA/PcaTransform.cs | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/src/Microsoft.ML.PCA/PcaTransform.cs b/src/Microsoft.ML.PCA/PcaTransform.cs index 40b04cd20b..82ee63bd2f 100644 --- a/src/Microsoft.ML.PCA/PcaTransform.cs +++ b/src/Microsoft.ML.PCA/PcaTransform.cs @@ -111,6 +111,13 @@ public sealed class ColumnInfo /// /// Describes how the transformer handles one column pair. /// + /// The column to apply PCA to. + /// The output column that contains PCA values. + /// The name of the weight column. + /// The number of components in the PCA. + /// Oversampling parameter for randomized PCA training. + /// If enabled, data is centered to be zero mean. + /// The seed for random number generation. 
public ColumnInfo(string input, string output, string weightColumn = PcaEstimator.Defaults.WeightColumn, @@ -126,7 +133,7 @@ public ColumnInfo(string input, Oversampling = overSampling; Center = center; Seed = seed; - Contracts.CheckUserArg(Oversampling >= 0, nameof(Oversampling), "Oversampling must be non-negative."); + Contracts.CheckParam(Oversampling >= 0, nameof(Oversampling), "Oversampling must be non-negative."); } } @@ -144,7 +151,7 @@ public TransformInfo(int rank, int dim) { Dimension = dim; Rank = rank; - Contracts.CheckUserArg(0 < Rank && Rank <= Dimension, nameof(Rank), "Rank must be positive, and at most the dimension of untransformed data"); + Contracts.CheckParam(0 < Rank && Rank <= Dimension, nameof(Rank), "Rank must be positive, and at most the dimension of untransformed data"); } public TransformInfo(ModelLoadContext ctx) @@ -224,7 +231,7 @@ private static VersionInfo GetVersionInfo() modelSignature: "PCA FUNC", //verWrittenCur: 0x00010001, // Initial verWrittenCur: 0x00010002, // Got rid of writing float size in model context - verReadableCur: 0x00010001, + verReadableCur: 0x00010002, verWeCanReadBack: 0x00010001, loaderSignature: LoaderSignature, loaderAssemblyName: typeof(PcaTransform).Assembly.FullName); @@ -537,17 +544,17 @@ protected override void CheckInputColumn(ISchema inputSchema, int col, int srcCo ValidatePcaInput(Host, inputSchema.GetColumnName(srcCol), inputSchema.GetColumnType(srcCol)); } - internal static void ValidatePcaInput(IHost host, string name, ColumnType type) + internal static void ValidatePcaInput(IExceptionContext ectx, string name, ColumnType type) { if (!type.IsVector) - throw host.Except($"Pca transform can only be applied to vector columns. Column ${name} is of type ${type}"); + throw ectx.Except($"Pca transform can only be applied to vector columns. Column ${name} is of type ${type}"); if (!(type.IsKnownSizeVector && type.VectorSize > 1)) - throw host.Except($"Pca transform can only be applied to vector columns. 
Column ${name} is of size ${type.VectorSize}"); + throw ectx.Except($"Pca transform can only be applied to vector columns. Column ${name} is of size ${type.VectorSize}"); var itemType = type.ItemType; if (itemType.RawKind != DataKind.R4) - throw host.Except($"Pca transform can only be applied to vector of float items. Column ${name} contains type ${itemType}"); + throw ectx.Except($"Pca transform can only be applied to vector of float items. Column ${name} contains type ${itemType}"); } private sealed class Mapper : MapperBase @@ -701,7 +708,7 @@ internal static class Defaults /// The number of components in the PCA. /// Oversampling parameter for randomized PCA training. /// If enabled, data is centered to be zero mean. - /// The seed for random number generation + /// The seed for random number generation. public PcaEstimator(IHostEnvironment env, string inputColumn, string outputColumn = null, string weightColumn = Defaults.WeightColumn, int rank = Defaults.Rank, int overSampling = Defaults.Oversampling, bool center = Defaults.Center, @@ -780,7 +787,10 @@ public override IEstimator Reconcile(IHostEnvironment env, } } - /// Compute the principal components of the input column. Can significantly reduce size of vector. + /// + /// Replaces the input vector with its projection to the principal component subspace, + /// which can significantly reduce size of vector. + /// /// /// The column to apply PCA to. /// The name of the weight column. 
From 0fff9d2325244398bace27ab6e5c5babb227067a Mon Sep 17 00:00:00 2001 From: Shahab Moradi Date: Tue, 23 Oct 2018 20:30:12 -0700 Subject: [PATCH 26/28] PR comments --- src/Microsoft.ML.PCA/PcaTransform.cs | 51 ++++++++-------------------- 1 file changed, 15 insertions(+), 36 deletions(-) diff --git a/src/Microsoft.ML.PCA/PcaTransform.cs b/src/Microsoft.ML.PCA/PcaTransform.cs index 82ee63bd2f..0f86262f16 100644 --- a/src/Microsoft.ML.PCA/PcaTransform.cs +++ b/src/Microsoft.ML.PCA/PcaTransform.cs @@ -550,10 +550,10 @@ internal static void ValidatePcaInput(IExceptionContext ectx, string name, Colum throw ectx.Except($"Pca transform can only be applied to vector columns. Column ${name} is of type ${type}"); if (!(type.IsKnownSizeVector && type.VectorSize > 1)) - throw ectx.Except($"Pca transform can only be applied to vector columns. Column ${name} is of size ${type.VectorSize}"); + throw ectx.Except($"Pca transform can only be applied to vector columns with known size greater than 1. Column ${name} is of size ${type.VectorSize}"); var itemType = type.ItemType; - if (itemType.RawKind != DataKind.R4) + if (!itemType.Equals(NumberType.R4)) throw ectx.Except($"Pca transform can only be applied to vector of float items. 
Column ${name} contains type ${itemType}"); } @@ -561,44 +561,24 @@ private sealed class Mapper : MapperBase { public sealed class ColumnSchemaInfo { - private readonly string _input; - private readonly string _output; - private readonly string _weightColumn; - private readonly Schema _schema; + public ColumnType InputType { get; } + public int InputIndex { get; } + public int WeightColumnIndex { get; } public ColumnSchemaInfo((string input, string output) columnPair, Schema schema, string weightColumn = null) { - _input = columnPair.input; - _output = columnPair.output; - _weightColumn = weightColumn; - _schema = schema; - } - - public int InputIndex - { - get - { - // Column names are already checked by PcaTransform - _schema.TryGetColumnIndex(_input, out int index); - return index; - } - } - - public ColumnType InputType => _schema[_input].Type; + schema.TryGetColumnIndex(columnPair.input, out int inputIndex); + InputIndex = inputIndex; + InputType = schema[columnPair.input].Type; - public int WeightColumnIndex - { - get + var weightIndex = -1; + if (weightColumn != null) { - var index = -1; - if (_weightColumn != null) - { - if (!_schema.TryGetColumnIndex(_weightColumn, out index)) - throw Contracts.Except("Weight column '{0}' does not exist.", _weightColumn); - Contracts.CheckUserArg(_schema[index].Type == NumberType.Float, nameof(_weightColumn)); - } - return index; + if (!schema.TryGetColumnIndex(weightColumn, out weightIndex)) + throw Contracts.Except("Weight column '{0}' does not exist.", weightColumn); + Contracts.CheckParam(schema[weightIndex].Type == NumberType.Float, nameof(weightColumn)); } + WeightColumnIndex = weightIndex; } } @@ -735,7 +715,7 @@ public SchemaShape GetOutputSchema(SchemaShape inputSchema) if (!inputSchema.TryFindColumn(colInfo.Input, out var col)) throw _host.ExceptSchemaMismatch(nameof(inputSchema), "input", colInfo.Input); - if (col.Kind != SchemaShape.Column.VectorKind.Vector || col.ItemType.RawKind != DataKind.R4) + if (col.Kind 
!= SchemaShape.Column.VectorKind.Vector || !col.ItemType.Equals(NumberType.R4)) throw _host.ExceptSchemaMismatch(nameof(inputSchema), "input", colInfo.Input); result[colInfo.Output] = new SchemaShape.Column(colInfo.Output, @@ -776,7 +756,6 @@ public override IEstimator Reconcile(IHostEnvironment env, IReadOnlyDictionary outputNames, IReadOnlyCollection usedNames) { - // Only one column is allowed. Contracts.Assert(toOutput.Length == 1); var outCol = (OutPipelineColumn)toOutput[0]; var inputColName = inputNames[outCol.Input]; From decb36c8f29f25fc321e99e4b710aade5ef0b7b4 Mon Sep 17 00:00:00 2001 From: Shahab Moradi Date: Tue, 23 Oct 2018 20:33:15 -0700 Subject: [PATCH 27/28] Updated manifest for entrypoint PcaCalculator --- test/BaselineOutput/Common/EntryPoints/core_ep-list.tsv | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/BaselineOutput/Common/EntryPoints/core_ep-list.tsv b/test/BaselineOutput/Common/EntryPoints/core_ep-list.tsv index 7095eeaaed..bb0ca58a38 100644 --- a/test/BaselineOutput/Common/EntryPoints/core_ep-list.tsv +++ b/test/BaselineOutput/Common/EntryPoints/core_ep-list.tsv @@ -119,7 +119,7 @@ Transforms.ModelCombiner Combines a sequence of TransformModels into a single mo Transforms.NGramTranslator Produces a bag of counts of ngrams (sequences of consecutive values of length 1-n) in a given vector of keys. It does so by building a dictionary of ngrams and using the id in the dictionary as the index in the bag. Microsoft.ML.Runtime.Transforms.TextAnalytics NGramTransform Microsoft.ML.Runtime.Data.NgramTransform+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+TransformOutput Transforms.NoOperation Does nothing. Microsoft.ML.Runtime.Data.NopTransform Nop Microsoft.ML.Runtime.Data.NopTransform+NopInput Microsoft.ML.Runtime.EntryPoints.CommonOutputs+TransformOutput Transforms.OptionalColumnCreator If the source column does not exist after deserialization, create a column with the right type and default values. 
Microsoft.ML.Runtime.DataPipe.OptionalColumnTransform MakeOptional Microsoft.ML.Runtime.DataPipe.OptionalColumnTransform+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+TransformOutput -Transforms.PcaCalculator PCA is a dimensionality-reduction transform which computes the projection of a numeric vector onto a low-rank subspace. Microsoft.ML.Runtime.Data.PcaTransform Calculate Microsoft.ML.Runtime.Data.PcaTransform+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+TransformOutput +Transforms.PcaCalculator PCA is a dimensionality-reduction transform which computes the projection of a numeric vector onto a low-rank subspace. Microsoft.ML.Transforms.PcaTransform Calculate Microsoft.ML.Transforms.PcaTransform+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+TransformOutput Transforms.PredictedLabelColumnOriginalValueConverter Transforms a predicted label column to its original values, unless it is of type bool. Microsoft.ML.Runtime.EntryPoints.FeatureCombiner ConvertPredictedLabel Microsoft.ML.Runtime.EntryPoints.FeatureCombiner+PredictedLabelInput Microsoft.ML.Runtime.EntryPoints.CommonOutputs+TransformOutput Transforms.RandomNumberGenerator Adds a column with a generated number sequence. Microsoft.ML.Runtime.Data.RandomNumberGenerator Generate Microsoft.ML.Runtime.Data.GenerateNumberTransform+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+TransformOutput Transforms.RowRangeFilter Filters a dataview on a column of type Single, Double or Key (contiguous). Keeps the values that are in the specified min/max range. NaNs are always filtered out. If the input is a Key type, the min/max are considered percentages of the number of values. 
Microsoft.ML.Runtime.EntryPoints.SelectRows FilterByRange Microsoft.ML.Runtime.Data.RangeFilter+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+TransformOutput From b041d97e208552d597aad0df631e2de8b4d74c1c Mon Sep 17 00:00:00 2001 From: Shahab Moradi Date: Wed, 24 Oct 2018 14:24:10 -0700 Subject: [PATCH 28/28] Fixed schema exceptions --- src/Microsoft.ML.PCA/PcaTransform.cs | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/src/Microsoft.ML.PCA/PcaTransform.cs b/src/Microsoft.ML.PCA/PcaTransform.cs index 0f86262f16..d956532921 100644 --- a/src/Microsoft.ML.PCA/PcaTransform.cs +++ b/src/Microsoft.ML.PCA/PcaTransform.cs @@ -134,6 +134,7 @@ public ColumnInfo(string input, Center = center; Seed = seed; Contracts.CheckParam(Oversampling >= 0, nameof(Oversampling), "Oversampling must be non-negative."); + Contracts.CheckParam(Rank > 0, nameof(Rank), "Rank must be positive."); } } @@ -546,15 +547,10 @@ protected override void CheckInputColumn(ISchema inputSchema, int col, int srcCo internal static void ValidatePcaInput(IExceptionContext ectx, string name, ColumnType type) { - if (!type.IsVector) - throw ectx.Except($"Pca transform can only be applied to vector columns. Column ${name} is of type ${type}"); + string inputSchema; // just used for the exceptions - if (!(type.IsKnownSizeVector && type.VectorSize > 1)) - throw ectx.Except($"Pca transform can only be applied to vector columns with known size greater than 1. Column ${name} is of size ${type.VectorSize}"); - - var itemType = type.ItemType; - if (!itemType.Equals(NumberType.R4)) - throw ectx.Except($"Pca transform can only be applied to vector of float items. 
Column ${name} contains type ${itemType}"); + if (!(type.IsKnownSizeVector && type.VectorSize > 1 && type.ItemType.Equals(NumberType.R4))) + throw ectx.ExceptSchemaMismatch(nameof(inputSchema), "input", name, "vector of floats with fixed size greater than 1", type.ToString()); } private sealed class Mapper : MapperBase @@ -597,8 +593,8 @@ public Mapper(PcaTransform parent, Schema inputSchema) ValidatePcaInput(Host, colPair.input, colSchemaInfo.InputType); if (colSchemaInfo.InputType.VectorSize != _parent._transformInfos[i].Dimension) { - var msg = $"Dimension of column ${colPair.input} is ${colSchemaInfo.InputType.VectorSize}, which doesn't match the expected size ${_parent._transformInfos[i].Dimension}"; - throw Host.Except(msg); + throw Host.ExceptSchemaMismatch(nameof(inputSchema), "input", colPair.input, + new VectorType(NumberType.R4, _parent._transformInfos[i].Dimension).ToString(), colSchemaInfo.InputType.ToString()); } } }