From cb9effcd091c60fa291aad96cc18c14ddf841b6f Mon Sep 17 00:00:00 2001 From: Eric Erhardt Date: Thu, 15 Nov 2018 19:36:32 -0600 Subject: [PATCH] Introduce VBufferEditor and hide VBuffer.Count (#1580) * Implement VBuffer master plan WIP #1 * Getting everything to build and tests passing * Keep moving to the master plan of VBuffer. * Remove the rest of the VBuffer.Count usages in ML.Data * Remove the rest of the VBuffer.Count usages and make VBuffer.Count private. * Fix two failing tests. * Fix FastTreeBinaryClassificationCategoricalSplitTest by remembering the underlying arrays in the column buffer in Transposer. Also enable a Transposer test, since it passes. --- src/Microsoft.ML.Core/Data/MetadataUtils.cs | 19 +- src/Microsoft.ML.Core/Data/VBuffer.cs | 361 +++------- src/Microsoft.ML.Core/Data/VBufferEditor.cs | 160 +++++ src/Microsoft.ML.Core/Utilities/MathUtils.cs | 54 +- src/Microsoft.ML.Core/Utilities/Utils.cs | 60 +- .../Utilities/VBufferUtils.cs | 650 ++++++++++-------- src/Microsoft.ML.CpuMath/AlignedArray.cs | 2 +- .../CpuMathUtils.netcoreapp.cs | 2 +- .../CpuMathUtils.netstandard.cs | 2 +- src/Microsoft.ML.CpuMath/Sse.cs | 3 +- src/Microsoft.ML.CpuMath/SseIntrinsics.cs | 2 +- .../Commands/ShowSchemaCommand.cs | 2 +- src/Microsoft.ML.Data/Data/BufferBuilder.cs | 77 +-- .../DataLoadSave/Binary/Codecs.cs | 20 +- .../DataLoadSave/Text/TextLoaderParser.cs | 18 +- .../DataLoadSave/Text/TextSaver.cs | 23 +- .../DataView/CompositeSchema.cs | 2 +- src/Microsoft.ML.Data/DataView/Transposer.cs | 80 ++- .../Depricated/Vector/VBufferMathUtils.cs | 217 +++--- .../Depricated/Vector/VectorUtils.cs | 154 +++-- .../Evaluators/EvaluatorUtils.cs | 62 +- .../Evaluators/RankerEvaluator.cs | 4 +- .../Transforms/ConcatTransform.cs | 16 +- .../Transforms/DropSlotsTransform.cs | 2 +- .../Transforms/HashTransform.cs | 118 ++-- .../Transforms/InvertHashUtils.cs | 2 +- .../Transforms/KeyToValueTransform.cs | 24 +- .../Transforms/KeyToVectorTransform.cs | 28 +- .../Transforms/NormalizeColumnSng.cs | 71 +- .../Transforms/TermTransformImpl.cs | 50 +- .../Utilities/SlotDropper.cs | 42 +- src/Microsoft.ML.Ensemble/EnsembleUtils.cs | 38 +- .../OutputCombiners/BaseMultiAverager.cs | 11 +- .../OutputCombiners/BaseMultiCombiner.cs | 8 +- .../OutputCombiners/BaseScalarStacking.cs | 8 +- .../OutputCombiners/MultiMedian.cs | 8 +- .../OutputCombiners/MultiStacking.cs | 8 +- .../OutputCombiners/MultiVoting.cs | 16 +- .../BinFile/BinFinder.cs | 38 +- src/Microsoft.ML.FastTree/FastTree.cs | 16 +- .../TreeEnsemble/RegressionTree.cs | 9 +- .../OlsLinearRegression.cs | 21 +- .../SymSgdClassificationTrainer.cs | 12 +- .../ImagePixelExtractorTransform.cs | 26 +- .../KMeansPlusPlusTrainer.cs | 11 +- .../KMeansPredictor.cs | 14 +- .../Models/ConfusionMatrix.cs | 5 +- .../Runtime/EntryPoints/FeatureCombiner.cs | 5 +- .../OnnxTransform.cs | 9 +- src/Microsoft.ML.OnnxTransform/OnnxUtils.cs | 21 +- .../Optimizer/DifferentiableFunction.cs | 4 +- .../Optimizer/LineSearch.cs | 2 +- .../Optimizer/OptimizationMonitor.cs | 2 +- .../Optimizer/Optimizer.cs | 2 +- .../Optimizer/SgdOptimizer.cs | 58 +- .../Standard/LinearPredictorUtils.cs | 4 +- .../LogisticRegression/LbfgsPredictorBase.cs | 10 +- .../LogisticRegression/LogisticRegression.cs | 11 +- .../MulticlassLogisticRegression.cs | 34 +- .../Standard/ModelStatistics.cs | 2 +- .../MultiClass/MultiClassNaiveBayesTrainer.cs | 31 +- .../Standard/Online/LinearSvm.cs | 6 +- .../PoissonRegression/PoissonRegression.cs | 5 +- .../Standard/SdcaBinary.cs | 14 +- .../Standard/SdcaMultiClass.cs | 16 +- 
.../TensorFlow/TensorflowUtils.cs | 20 +- .../TensorflowTransform.cs | 20 +- src/Microsoft.ML.Transforms/GcnTransform.cs | 104 +-- .../KeyToBinaryVectorTransform.cs | 2 +- .../LearnerFeatureSelection.cs | 34 +- .../Microsoft.ML.Transforms.csproj | 1 + .../MissingValueDroppingTransformer.cs | 69 +- .../MissingValueIndicatorTransform.cs | 58 +- .../MissingValueIndicatorTransformer.cs | 8 +- .../MissingValueReplacing.cs | 17 +- .../MissingValueReplacingUtils.cs | 13 +- .../MutualInformationFeatureSelection.cs | 84 ++- .../RandomFourierFeaturizing.cs | 8 +- .../Text/CharTokenizeTransform.cs | 59 +- .../Text/LdaSingleBox.cs | 29 +- .../Text/LdaTransform.cs | 52 +- .../Text/NgramTransform.cs | 9 +- .../Text/NgramUtils.cs | 8 +- .../Text/StopWordsRemoverTransform.cs | 18 +- .../Text/TextNormalizerTransform.cs | 9 +- .../Text/WordEmbeddingsTransform.cs | 31 +- .../Text/WordTokenizeTransform.cs | 20 +- .../UngroupTransform.cs | 10 +- .../VectorWhitening.cs | 31 +- .../TestTransposer.cs | 2 +- 90 files changed, 1811 insertions(+), 1717 deletions(-) create mode 100644 src/Microsoft.ML.Core/Data/VBufferEditor.cs diff --git a/src/Microsoft.ML.Core/Data/MetadataUtils.cs b/src/Microsoft.ML.Core/Data/MetadataUtils.cs index ee70bc732a..734ce31b23 100644 --- a/src/Microsoft.ML.Core/Data/MetadataUtils.cs +++ b/src/Microsoft.ML.Core/Data/MetadataUtils.cs @@ -318,7 +318,9 @@ public static void GetSlotNames(RoleMappedSchema schema, RoleMappedSchema.Column IReadOnlyList list; if ((list = schema?.GetColumns(role)) == null || list.Count != 1 || !schema.Schema.HasSlotNames(list[0].Index, vectorSize)) - slotNames = new VBuffer>(vectorSize, 0, slotNames.Values, slotNames.Indices); + { + VBufferUtils.Resize(ref slotNames, vectorSize, 0); + } else schema.Schema.GetMetadata(Kinds.SlotNames, list[0].Index, ref slotNames); } @@ -447,21 +449,22 @@ public static bool TryGetCategoricalFeatureIndices(Schema schema, int colIndex, { int previousEndIndex = -1; isValid = true; - for (int i = 0; i < catIndices.Values.Length; i += 2) + var catIndicesValues = catIndices.GetValues(); + for (int i = 0; i < catIndicesValues.Length; i += 2) { - if (catIndices.Values[i] > catIndices.Values[i + 1] || - catIndices.Values[i] <= previousEndIndex || - catIndices.Values[i] >= columnSlotsCount || - catIndices.Values[i + 1] >= columnSlotsCount) + if (catIndicesValues[i] > catIndicesValues[i + 1] || + catIndicesValues[i] <= previousEndIndex || + catIndicesValues[i] >= columnSlotsCount || + catIndicesValues[i + 1] >= columnSlotsCount) { isValid = false; break; } - previousEndIndex = catIndices.Values[i + 1]; + previousEndIndex = catIndicesValues[i + 1]; } if (isValid) - categoricalFeatures = catIndices.Values.Select(val => val).ToArray(); + categoricalFeatures = catIndicesValues.ToArray(); } } diff --git a/src/Microsoft.ML.Core/Data/VBuffer.cs b/src/Microsoft.ML.Core/Data/VBuffer.cs index b5ef6de0ea..217afa746c 100644 --- a/src/Microsoft.ML.Core/Data/VBuffer.cs +++ b/src/Microsoft.ML.Core/Data/VBuffer.cs @@ -16,32 +16,35 @@ namespace Microsoft.ML.Runtime.Data /// public readonly struct VBuffer { - /// - /// The logical length of the buffer. - /// - public readonly int Length; + private readonly T[] _values; + private readonly int[] _indices; /// /// The number of items explicitly represented. This is == Length when the representation /// is dense and < Length when sparse. /// - public readonly int Count; + private readonly int _count; + + /// + /// The logical length of the buffer. + /// + public readonly int Length; /// /// The values. 
Only the first Count of these are valid. /// - public readonly T[] Values; + public T[] Values => _values; /// /// The indices. For a dense representation, this array is not used. For a sparse representation /// it is parallel to values and specifies the logical indices for the corresponding values. /// - public readonly int[] Indices; + public int[] Indices => _indices; /// /// The explicitly represented values. /// - public ReadOnlySpan GetValues() => Values.AsSpan(0, Count); + public ReadOnlySpan GetValues() => _values.AsSpan(0, _count); /// /// The indices. For a dense representation, this array is not used. For a sparse representation @@ -53,17 +56,18 @@ public readonly struct VBuffer /// - non-zeros values 98 and 76 respectively at the 4th and 6th coordinates /// - zeros at all other coordinates /// - public ReadOnlySpan GetIndices() => IsDense ? default : Indices.AsSpan(0, Count); + public ReadOnlySpan GetIndices() => IsDense ? default : _indices.AsSpan(0, _count); /// - /// Equivalent to Count == Length. + /// Gets a value indicating whether every logical element is explicitly + /// represented in the buffer. /// public bool IsDense { get { - Contracts.Assert(Count <= Length); - return Count == Length; + Contracts.Assert(_count <= Length); + return _count == Length; } } @@ -77,9 +81,9 @@ public VBuffer(int length, T[] values, int[] indices = null) Contracts.CheckValueOrNull(indices); Length = length; - Count = length; - Values = values; - Indices = indices; + _count = length; + _values = values; + _indices = indices; } /// @@ -109,9 +113,9 @@ public VBuffer(int length, int count, T[] values, int[] indices) #endif Length = length; - Count = count; - Values = values; - Indices = indices; + _count = count; + _values = values; + _indices = indices; } /// @@ -119,15 +123,14 @@ public VBuffer(int length, int count, T[] values, int[] indices) /// public void CopyToDense(ref VBuffer dst) { - var values = dst.Values; - if (Utils.Size(values) < Length) - values = new T[Length]; + // create a dense editor + var editor = VBufferEditor.Create(ref dst, Length); if (!IsDense) - CopyTo(values); + CopyTo(editor.Values); else if (Length > 0) - Array.Copy(Values, values, Length); - dst = new VBuffer(Length, values, dst.Indices); + _values.AsSpan(0, Length).CopyTo(editor.Values); + dst = editor.Commit(); } /// @@ -135,31 +138,24 @@ public void CopyToDense(ref VBuffer dst) /// public void CopyTo(ref VBuffer dst) { - var values = dst.Values; - var indices = dst.Indices; + var editor = VBufferEditor.Create(ref dst, Length, _count); if (IsDense) { if (Length > 0) { - if (Utils.Size(values) < Length) - values = new T[Length]; - Array.Copy(Values, values, Length); + _values.AsSpan(0, Length).CopyTo(editor.Values); } - dst = new VBuffer(Length, values, indices); + dst = editor.Commit(); Contracts.Assert(dst.IsDense); } else { - if (Count > 0) + if (_count > 0) { - if (Utils.Size(values) < Count) - values = new T[Count]; - if (Utils.Size(indices) < Count) - indices = new int[Count]; - Array.Copy(Values, values, Count); - Array.Copy(Indices, indices, Count); + _values.AsSpan(0, _count).CopyTo(editor.Values); + _indices.AsSpan(0, _count).CopyTo(editor.Indices); } - dst = new VBuffer(Length, Count, values, indices); + dst = editor.Commit(); } } @@ -170,255 +166,81 @@ public void CopyTo(ref VBuffer dst, int srcMin, int length) { Contracts.Check(0 <= srcMin && srcMin <= Length, "srcMin"); Contracts.Check(0 <= length && srcMin <= Length - length, "length"); - var values = dst.Values; - var indices = dst.Indices; + 
if (IsDense) { + var editor = VBufferEditor.Create(ref dst, length, length); if (length > 0) { - if (Utils.Size(values) < length) - values = new T[length]; - Array.Copy(Values, srcMin, values, 0, length); + _values.AsSpan(srcMin, length).CopyTo(editor.Values); } - dst = new VBuffer(length, values, indices); + dst = editor.Commit(); Contracts.Assert(dst.IsDense); } else { int copyCount = 0; - if (Count > 0) + if (_count > 0) { - int copyMin = Indices.FindIndexSorted(0, Count, srcMin); - int copyLim = Indices.FindIndexSorted(copyMin, Count, srcMin + length); + int copyMin = _indices.FindIndexSorted(0, _count, srcMin); + int copyLim = _indices.FindIndexSorted(copyMin, _count, srcMin + length); Contracts.Assert(copyMin <= copyLim); copyCount = copyLim - copyMin; + var editor = VBufferEditor.Create(ref dst, length, copyCount); if (copyCount > 0) { - if (Utils.Size(values) < copyCount) - values = new T[copyCount]; - Array.Copy(Values, copyMin, values, 0, copyCount); + _values.AsSpan(copyMin, copyCount).CopyTo(editor.Values); if (copyCount < length) { - if (Utils.Size(indices) < copyCount) - indices = new int[copyCount]; for (int i = 0; i < copyCount; ++i) - indices[i] = Indices[i + copyMin] - srcMin; - } - } - } - dst = new VBuffer(length, copyCount, values, indices); - } - } - - /// - /// Copy from this buffer to the given destination, making sure to explicitly include the - /// first count indices in indicesInclude. Note that indicesInclude should be sorted - /// with each index less than this.Length. Note that this can make the destination be - /// dense even if "this" is sparse. - /// - public void CopyTo(ref VBuffer dst, int[] indicesInclude, int count) - { - Contracts.CheckParam(count >= 0, nameof(count)); - Contracts.CheckParam(Utils.Size(indicesInclude) >= count, nameof(indicesInclude)); - Contracts.CheckParam(Utils.Size(indicesInclude) <= Length, nameof(indicesInclude)); - - // REVIEW: Ideally we should Check that indicesInclude is sorted and in range. Would that - // check be too expensive? -#if DEBUG - int prev = -1; - for (int i = 0; i < count; i++) - { - Contracts.Assert(prev < indicesInclude[i]); - prev = indicesInclude[i]; - } - Contracts.Assert(prev < Length); -#endif - - if (IsDense || count == 0) - { - CopyTo(ref dst); - return; - } - - if (count >= Length / 2 || Count >= Length / 2) - { - CopyToDense(ref dst); - return; - } - - var indices = dst.Indices; - var values = dst.Values; - if (Count == 0) - { - // No values in "this". - if (Utils.Size(indices) < count) - indices = new int[count]; - Array.Copy(indicesInclude, indices, count); - if (Utils.Size(values) < count) - values = new T[count]; - else - Array.Clear(values, 0, count); - dst = new VBuffer(Length, count, values, indices); - return; - } - - int size = 0; - int max = count + Count; - Contracts.Assert(max < Length); - int ii1; - int ii2; - if (max >= Length / 2 || Utils.Size(values) < max || Utils.Size(indices) < max) - { - // Compute the needed size. 
- ii1 = 0; - ii2 = 0; - for (; ; ) - { - Contracts.Assert(ii1 < Count); - Contracts.Assert(ii2 < count); - size++; - int diff = Indices[ii1] - indicesInclude[ii2]; - if (diff == 0) - { - ii1++; - ii2++; - if (ii1 >= Count) - { - size += count - ii2; - break; - } - if (ii2 >= count) - { - size += Count - ii1; - break; + editor.Indices[i] = _indices[i + copyMin] - srcMin; } } - else if (diff < 0) - { - if (++ii1 >= Count) - { - size += count - ii2; - break; - } - } - else - { - if (++ii2 >= count) - { - size += Count - ii1; - break; - } - } - } - Contracts.Assert(size >= count && size >= Count); - - if (size == Count) - { - CopyTo(ref dst); - return; - } - - if (size >= Length / 2) - { - CopyToDense(ref dst); - return; - } - - if (Utils.Size(values) < size) - values = new T[size]; - if (Utils.Size(indices) < size) - indices = new int[size]; - max = size; - } - - int ii = 0; - ii1 = 0; - ii2 = 0; - for (; ; ) - { - Contracts.Assert(ii < max); - Contracts.Assert(ii1 < Count); - Contracts.Assert(ii2 < count); - int i1 = Indices[ii1]; - int i2 = indicesInclude[ii2]; - if (i1 <= i2) - { - indices[ii] = i1; - values[ii] = Values[ii1]; - ii++; - if (i1 == i2) - ii2++; - if (++ii1 >= Count) - { - if (ii2 >= count) - break; - Array.Clear(values, ii, count - ii2); - Array.Copy(indicesInclude, ii2, indices, ii, count - ii2); - ii += count - ii2; - break; - } - if (ii2 >= count) - { - Array.Copy(Values, ii1, values, ii, Count - ii1); - Array.Copy(Indices, ii1, indices, ii, Count - ii1); - ii += Count - ii1; - break; - } + dst = editor.Commit(); } else { - indices[ii] = i2; - values[ii] = default(T); - ii++; - if (++ii2 >= count) - { - Array.Copy(Values, ii1, values, ii, Count - ii1); - Array.Copy(Indices, ii1, indices, ii, Count - ii1); - ii += Count - ii1; - break; - } + var editor = VBufferEditor.Create(ref dst, length, copyCount); + dst = editor.Commit(); } } - Contracts.Assert(size == ii || size == 0); - - dst = new VBuffer(Length, ii, values, indices); } /// /// Copy from this buffer to the given destination array. This "densifies". 
/// - public void CopyTo(T[] dst) + public void CopyTo(Span dst) { CopyTo(dst, 0); } - public void CopyTo(T[] dst, int ivDst, T defaultValue = default(T)) + public void CopyTo(Span dst, int ivDst, T defaultValue = default(T)) { - Contracts.CheckParam(0 <= ivDst && ivDst <= Utils.Size(dst) - Length, nameof(dst), "dst is not large enough"); + Contracts.CheckParam(0 <= ivDst && ivDst <= dst.Length - Length, nameof(dst), "dst is not large enough"); if (Length == 0) return; if (IsDense) { - Array.Copy(Values, 0, dst, ivDst, Length); + _values.AsSpan(0, Length).CopyTo(dst.Slice(ivDst)); return; } - if (Count == 0) + if (_count == 0) { - Array.Clear(dst, ivDst, Length); + dst.Slice(ivDst, Length).Clear(); return; } int iv = 0; - for (int islot = 0; islot < Count; islot++) + for (int islot = 0; islot < _count; islot++) { - int slot = Indices[islot]; + int slot = _indices[islot]; Contracts.Assert(slot >= iv); while (iv < slot) dst[ivDst + iv++] = defaultValue; Contracts.Assert(iv == slot); - dst[ivDst + iv++] = Values[islot]; + dst[ivDst + iv++] = _values[islot]; } while (iv < Length) dst[ivDst + iv++] = defaultValue; @@ -431,24 +253,22 @@ public static void Copy(T[] src, int srcIndex, ref VBuffer dst, int length) { Contracts.CheckParam(0 <= length && length <= Utils.Size(src), nameof(length)); Contracts.CheckParam(0 <= srcIndex && srcIndex <= Utils.Size(src) - length, nameof(srcIndex)); - var values = dst.Values; + var editor = VBufferEditor.Create(ref dst, length, length); if (length > 0) { - if (Utils.Size(values) < length) - values = new T[length]; - Array.Copy(src, srcIndex, values, 0, length); + src.AsSpan(srcIndex, length).CopyTo(editor.Values); } - dst = new VBuffer(length, values, dst.Indices); + dst = editor.Commit(); } public IEnumerable> Items(bool all = false) { - return VBufferUtils.Items(Values, Indices, Length, Count, all); + return VBufferUtils.Items(_values, _indices, Length, _count, all); } public IEnumerable DenseValues() { - return VBufferUtils.DenseValues(Values, Indices, Length, Count); + return VBufferUtils.DenseValues(_values, _indices, Length, _count); } public void GetItemOrDefault(int slot, ref T dst) @@ -457,9 +277,9 @@ public void GetItemOrDefault(int slot, ref T dst) int index; if (IsDense) - dst = Values[slot]; - else if (Count > 0 && Indices.TryFindIndexSorted(0, Count, slot, out index)) - dst = Values[index]; + dst = _values[slot]; + else if (_count > 0 && _indices.TryFindIndexSorted(0, _count, slot, out index)) + dst = _values[index]; else dst = default(T); } @@ -470,13 +290,56 @@ public T GetItemOrDefault(int slot) int index; if (IsDense) - return Values[slot]; - if (Count > 0 && Indices.TryFindIndexSorted(0, Count, slot, out index)) - return Values[index]; + return _values[slot]; + if (_count > 0 && _indices.TryFindIndexSorted(0, _count, slot, out index)) + return _values[index]; return default(T); } public override string ToString() - => IsDense ? $"Dense vector of size {Length}" : $"Sparse vector of size {Length}, {Count} explicit values"; + => IsDense ? $"Dense vector of size {Length}" : $"Sparse vector of size {Length}, {_count} explicit values"; + + internal VBufferEditor GetEditor() + { + return GetEditor(Length, _count); + } + + internal VBufferEditor GetEditor( + int newLogicalLength, + int? 
valuesCount, + int maxCapacity = Utils.ArrayMaxSize, + bool keepOldOnResize = false, + bool requireIndicesOnDense = false) + { + Contracts.CheckParam(newLogicalLength >= 0, nameof(newLogicalLength)); + Contracts.CheckParam(valuesCount == null || valuesCount.Value <= newLogicalLength, nameof(valuesCount)); + + valuesCount = valuesCount ?? newLogicalLength; + + T[] values = _values; + bool createdNewValues; + Utils.EnsureSize(ref values, valuesCount.Value, maxCapacity, keepOldOnResize, out createdNewValues); + + int[] indices = _indices; + bool isDense = newLogicalLength == valuesCount.Value; + bool createdNewIndices; + if (isDense && !requireIndicesOnDense) + { + createdNewIndices = false; + } + else + { + Utils.EnsureSize(ref indices, valuesCount.Value, maxCapacity, keepOldOnResize, out createdNewIndices); + } + + return new VBufferEditor( + newLogicalLength, + valuesCount.Value, + values, + indices, + requireIndicesOnDense, + createdNewValues, + createdNewIndices); + } } -} +} \ No newline at end of file diff --git a/src/Microsoft.ML.Core/Data/VBufferEditor.cs b/src/Microsoft.ML.Core/Data/VBufferEditor.cs new file mode 100644 index 0000000000..8da19b641f --- /dev/null +++ b/src/Microsoft.ML.Core/Data/VBufferEditor.cs @@ -0,0 +1,160 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System; + +namespace Microsoft.ML.Runtime.Data +{ + /// + /// Various methods for creating instances. + /// + public static class VBufferEditor + { + /// + /// Creates a with the same shape + /// (length and density) as the . + /// + public static VBufferEditor CreateFromBuffer( + ref VBuffer destination) + { + return destination.GetEditor(); + } + + /// + /// Creates a using + /// 's values and indices buffers. + /// + /// + /// The destination buffer. + /// + /// + /// The logical length of the new buffer being edited. + /// + /// + /// The optional number of physical values to be represented in the buffer. + /// The buffer will be dense if is omitted. + /// + /// + /// True means that the old buffer values and indices are preserved, if possible (Array.Resize is called). + /// False means that a new array will be allocated, if necessary. + /// + /// + /// True means to ensure the Indices buffer is available, even if the buffer will be dense. + /// + public static VBufferEditor Create( + ref VBuffer destination, + int newLogicalLength, + int? valuesCount = null, + bool keepOldOnResize = false, + bool requireIndicesOnDense = false) + { + return destination.GetEditor( + newLogicalLength, + valuesCount, + keepOldOnResize: keepOldOnResize, + requireIndicesOnDense: requireIndicesOnDense); + } + + internal static VBufferEditor Create( + ref VBuffer destination, + int newLogicalLength, + int valuesCount, + int maxValuesCapacity) + { + return destination.GetEditor( + newLogicalLength, + valuesCount, + maxValuesCapacity); + } + } + + /// + /// An object capable of editing a by filling out + /// (and if the buffer is not dense). + /// + public readonly ref struct VBufferEditor + { + private readonly int _logicalLength; + private readonly T[] _values; + private readonly int[] _indices; + + /// + /// The mutable span of values. + /// + public readonly Span Values; + + /// + /// The mutable span of indices. + /// + public readonly Span Indices; + + /// + /// Gets a value indicating whether a new Values array was allocated. 
+ /// + public bool CreatedNewValues { get; } + + /// + /// Gets a value indicating whether a new Indices array was allocated. + /// + public bool CreatedNewIndices { get; } + + internal VBufferEditor(int logicalLength, + int physicalValuesCount, + T[] values, + int[] indices, + bool requireIndicesOnDense, + bool createdNewValues, + bool createdNewIndices) + { + _logicalLength = logicalLength; + _values = values; + _indices = indices; + + bool isDense = logicalLength == physicalValuesCount; + + Values = _values.AsSpan(0, physicalValuesCount); + Indices = !isDense || requireIndicesOnDense ? _indices.AsSpan(0, physicalValuesCount) : default; + + CreatedNewValues = createdNewValues; + CreatedNewIndices = createdNewIndices; + } + + /// + /// Commits the edits and creates a new using + /// the current Values and Indices. + /// + /// + /// The newly created . + /// + public VBuffer Commit() + { + return new VBuffer(_logicalLength, Values.Length, _values, _indices); + } + + /// + /// Commits the edits and creates a new using + /// the current Values and Indices, while allowing to truncate the length + /// of Values and Indices. + /// + /// + /// The new number of physical values to be represented in the created buffer. + /// + /// + /// The newly created . + /// + /// + /// CommitTruncated allows to modify the length of the explicitly + /// defined values. + /// This is useful in sparse situations where the + /// was created with a larger physical value count than was needed + /// because the final value count was not known at creation time. + /// + public VBuffer CommitTruncated(int physicalValuesCount) + { + Contracts.CheckParam(physicalValuesCount <= Values.Length, nameof(physicalValuesCount), "Updating physicalValuesCount during CommitTruncated cannot be greater than the original physicalValuesCount value used in Create."); + + return new VBuffer(_logicalLength, physicalValuesCount, _values, _indices); + } + } +} diff --git a/src/Microsoft.ML.Core/Utilities/MathUtils.cs b/src/Microsoft.ML.Core/Utilities/MathUtils.cs index 7550a949c5..cb5028463e 100644 --- a/src/Microsoft.ML.Core/Utilities/MathUtils.cs +++ b/src/Microsoft.ML.Core/Utilities/MathUtils.cs @@ -133,40 +133,23 @@ public static Float Min(Float[] a) } /// - /// Finds the first index of the max element of the array. + /// Finds the first index of the max element of the span. /// NaNs are ignored. If all the elements to consider are NaNs, -1 is /// returned. The caller should distinguish in this case between two /// possibilities: /// 1) The number of the element to consider is zero. /// 2) All the elements to consider are NaNs. /// - /// an array - /// the first index of the max element - public static int ArgMax(Float[] a) - { - return ArgMax(a, Utils.Size(a)); - } - - /// - /// Finds the first index of the max element of the array. - /// NaNs are ignored. If all the elements to consider are NaNs, -1 is - /// returned. The caller should distinguish in this case between two - /// possibilities: - /// 1) The number of the element to consider is zero. - /// 2) All the elements to consider are NaNs. - /// - /// an array - /// number of the element in the array to consider + /// The span of floats. 
/// the first index of the max element - public static int ArgMax(Float[] a, int count) + public static int ArgMax(ReadOnlySpan a) { - Contracts.Assert(0 <= count && count <= Utils.Size(a)); - if (count == 0) + if (a.IsEmpty) return -1; int amax = -1; Float max = Float.NegativeInfinity; - for (int i = count - 1; i >= 0; i--) + for (int i = a.Length - 1; i >= 0; i--) { if (max <= a[i]) { @@ -179,40 +162,23 @@ public static int ArgMax(Float[] a, int count) } /// - /// Finds the first index of the minimum element of the array. + /// Finds the first index of the minimum element of the span. /// NaNs are ignored. If all the elements to consider are NaNs, -1 is /// returned. The caller should distinguish in this case between two /// possibilities: /// 1) The number of the element to consider is zero. /// 2) All the elements to consider are NaNs. /// - /// an array - /// the first index of the minimum element - public static int ArgMin(Float[] a) - { - return ArgMin(a, Utils.Size(a)); - } - - /// - /// Finds the first index of the minimum element of the array. - /// NaNs are ignored. If all the elements to consider are NaNs, -1 is - /// returned. The caller should distinguish in this case between two - /// possibilities: - /// 1) The number of the element to consider is zero. - /// 2) All the elements to consider are NaNs. - /// - /// an array - /// number of the element in the array to consider + /// The span of floats. /// the first index of the minimum element - public static int ArgMin(Float[] a, int count) + public static int ArgMin(ReadOnlySpan a) { - Contracts.Assert(0 <= count && count <= Utils.Size(a)); - if (count == 0) + if (a.IsEmpty) return -1; int amin = -1; Float min = Float.PositiveInfinity; - for (int i = count - 1; i >= 0; i--) + for (int i = a.Length - 1; i >= 0; i--) { if (min >= a[i]) { diff --git a/src/Microsoft.ML.Core/Utilities/Utils.cs b/src/Microsoft.ML.Core/Utilities/Utils.cs index a0b9019d14..d87b13fd1e 100644 --- a/src/Microsoft.ML.Core/Utilities/Utils.cs +++ b/src/Microsoft.ML.Core/Utilities/Utils.cs @@ -181,18 +181,6 @@ public static void Push(ref Stack stack, T item) stack.Push(item); } - /// - /// Assumes input is sorted and finds value using BinarySearch. - /// If value is not found, returns the logical index of 'value' in the sorted list i.e index of the first element greater than value. - /// In case of duplicates it returns the index of the first one. - /// It guarantees that items before the returned index are < value, while those at and after the returned index are >= value. - /// - public static int FindIndexSorted(this int[] input, int value) - { - Contracts.AssertValue(input); - return FindIndexSorted(input, 0, input.Length, value); - } - /// /// Assumes input is sorted and finds value using BinarySearch. /// If value is not found, returns the logical index of 'value' in the sorted list i.e index of the first element greater than value. @@ -240,6 +228,17 @@ public static bool TryFindIndexSorted(this int[] input, int min, int lim, int va return index < lim && input[index] == value; } + /// + /// Akin to FindIndexSorted, except stores the found index in the output + /// index parameter, and returns whether that index is a valid index + /// pointing to a value equal to the input parameter value. 
+ /// + public static bool TryFindIndexSorted(ReadOnlySpan input, int min, int lim, int value, out int index) + { + index = FindIndexSorted(input, min, lim, value); + return index < lim && input[index] == value; + } + /// /// Assumes input is sorted and finds value using BinarySearch. /// If value is not found, returns the logical index of 'value' in the sorted list i.e index of the first element greater than value. @@ -466,9 +465,8 @@ public static int[] GetIdentityPermutation(int size) return res; } - public static void FillIdentity(int[] a, int lim) + public static void FillIdentity(Span a, int lim) { - Contracts.AssertValue(a); Contracts.Assert(0 <= lim & lim <= a.Length); for (int i = 0; i < lim; ++i) @@ -857,12 +855,19 @@ public static int EnsureSize(ref T[] array, int min, bool keepOld = true) /// /// The new size, that is no less than and no more that . public static int EnsureSize(ref T[] array, int min, int max, bool keepOld = true) + => EnsureSize(ref array, min, max, keepOld, out bool _); + + public static int EnsureSize(ref T[] array, int min, int max, bool keepOld, out bool resized) { Contracts.CheckParam(min <= max, nameof(max), "min must not exceed max"); // This code adapted from the private method EnsureCapacity code of List. int size = Utils.Size(array); if (size >= min) + { + resized = false; return size; + } + int newSize = size == 0 ? 4 : size * 2; // This constant taken from the internal code of system\array.cs of mscorlib. if ((uint)newSize > max) @@ -873,6 +878,8 @@ public static int EnsureSize(ref T[] array, int min, int max, bool keepOld = Array.Resize(ref array, newSize); else array = new T[newSize]; + + resized = true; return newSize; } @@ -1098,5 +1105,30 @@ public static string GetDescription(this Enum value) } return null; } + + public static int Count(this ReadOnlySpan source, Func predicate) + { + Contracts.CheckValue(predicate, nameof(predicate)); + + int result = 0; + for (int i = 0; i < source.Length; i++) + { + if (predicate(source[i])) + result++; + } + return result; + } + + public static bool All(this ReadOnlySpan source, Func predicate) + { + Contracts.CheckValue(predicate, nameof(predicate)); + + for (int i = 0; i < source.Length; i++) + { + if (!predicate(source[i])) + return false; + } + return true; + } } } diff --git a/src/Microsoft.ML.Core/Utilities/VBufferUtils.cs b/src/Microsoft.ML.Core/Utilities/VBufferUtils.cs index bc1e7f4f7f..f730d61724 100644 --- a/src/Microsoft.ML.Core/Utilities/VBufferUtils.cs +++ b/src/Microsoft.ML.Core/Utilities/VBufferUtils.cs @@ -176,7 +176,7 @@ public static void ForEachDefined(in VBuffer a, Action visitor) /// Applies the to each corresponding pair of elements /// where the item is emplicitly defined in the vector. By explicitly defined, /// we mean that for a given index i, both vectors have an entry in - /// corresponding to that index. + /// corresponding to that index. 
/// /// The first vector /// The second vector @@ -314,9 +314,8 @@ public static void ForEachEitherDefined(in VBuffer a, in VBuffer b, Act /// public static void Clear(ref VBuffer dst) { - if (dst.Count == 0) - return; - Array.Clear(dst.Values, 0, dst.Count); + var editor = VBufferEditor.CreateFromBuffer(ref dst); + editor.Values.Clear(); } // REVIEW: Look into removing slot in this and other manipulators, so that we @@ -344,15 +343,17 @@ public static void Apply(ref VBuffer dst, SlotValueManipulator manip) { Contracts.CheckValue(manip, nameof(manip)); + var editor = VBufferEditor.CreateFromBuffer(ref dst); if (dst.IsDense) { - for (int i = 0; i < dst.Length; i++) - manip(i, ref dst.Values[i]); + for (int i = 0; i < editor.Values.Length; i++) + manip(i, ref editor.Values[i]); } else { - for (int i = 0; i < dst.Count; i++) - manip(dst.Indices[i], ref dst.Values[i]); + var dstIndices = dst.GetIndices(); + for (int i = 0; i < editor.Values.Length; i++) + manip(dstIndices[i], ref editor.Values[i]); } } @@ -376,17 +377,19 @@ public static void ApplyAt(ref VBuffer dst, int slot, SlotValueManipulator Contracts.CheckValue(manip, nameof(manip)); Contracts.CheckValueOrNull(pred); + var editor = VBufferEditor.CreateFromBuffer(ref dst); + int dstValuesCount = editor.Values.Length; if (dst.IsDense) { // The vector is dense, so we can just do a direct access. - manip(slot, ref dst.Values[slot]); + manip(slot, ref editor.Values[slot]); return; } int idx = 0; - if (dst.Count > 0 && Utils.TryFindIndexSorted(dst.Indices, 0, dst.Count, slot, out idx)) + if (dstValuesCount > 0 && Utils.TryFindIndexSorted(editor.Indices, 0, dstValuesCount, slot, out idx)) { // Vector is sparse, but the item exists so we can access it. - manip(slot, ref dst.Values[idx]); + manip(slot, ref editor.Values[idx]); return; } // The vector is sparse and there is no corresponding item, yet. @@ -397,26 +400,24 @@ public static void ApplyAt(ref VBuffer dst, int slot, SlotValueManipulator if (pred(ref value)) return; // We have to insert this value, somehow. - int[] indices = dst.Indices; - T[] values = dst.Values; + // There is a modest special case where there is exactly one free slot // we are modifying in the sparse vector, in which case the vector becomes // dense. Then there is no need to do anything with indices. - bool needIndices = dst.Count + 1 < dst.Length; - if (needIndices) - Utils.EnsureSize(ref indices, dst.Count + 1, dst.Length - 1); - Utils.EnsureSize(ref values, dst.Count + 1, dst.Length); - if (idx != dst.Count) + bool needIndices = dstValuesCount + 1 < dst.Length; + editor = VBufferEditor.Create(ref dst, dst.Length, dstValuesCount + 1); + if (idx != dstValuesCount) { // We have to do some sort of shift copy. 
+ int sliceLength = dstValuesCount - idx; if (needIndices) - Array.Copy(indices, idx, indices, idx + 1, dst.Count - idx); - Array.Copy(values, idx, values, idx + 1, dst.Count - idx); + editor.Indices.Slice(idx, sliceLength).CopyTo(editor.Indices.Slice(idx + 1)); + editor.Values.Slice(idx, sliceLength).CopyTo(editor.Values.Slice(idx + 1)); } if (needIndices) - indices[idx] = slot; - values[idx] = value; - dst = new VBuffer(dst.Length, dst.Count + 1, values, indices); + editor.Indices[idx] = slot; + editor.Values[idx] = value; + dst = editor.Commit(); } /// @@ -426,37 +427,41 @@ public static void Densify(ref VBuffer dst) { if (dst.IsDense) return; - var indices = dst.Indices; - var values = dst.Values; - if (Utils.Size(values) >= dst.Length) + + var indices = dst.GetIndices(); + var values = dst.GetValues(); + var editor = VBufferEditor.Create( + ref dst, + dst.Length); + + if (!editor.CreatedNewValues) { // Densify in place. - for (int i = dst.Count; --i >= 0; ) + for (int i = values.Length; --i >= 0; ) { Contracts.Assert(i <= indices[i]); - values[indices[i]] = values[i]; + editor.Values[indices[i]] = values[i]; } - if (dst.Count == 0) - Array.Clear(values, 0, dst.Length); + if (values.Length == 0) + editor.Values.Clear(); else { int min = 0; - for (int ii = 0; ii < dst.Count; ++ii) + for (int ii = 0; ii < values.Length; ++ii) { - Array.Clear(values, min, indices[ii] - min); + editor.Values.Slice(min, indices[ii] - min).Clear(); min = indices[ii] + 1; } - Array.Clear(values, min, dst.Length - min); + editor.Values.Slice(min, dst.Length - min).Clear(); } } else { - T[] newValues = new T[dst.Length]; - for (int i = 0; i < dst.Count; ++i) - newValues[indices[i]] = values[i]; - values = newValues; + // createdNewValues is true, keepOldOnResize is false, so Values is already cleared + for (int i = 0; i < values.Length; ++i) + editor.Values[indices[i]] = values[i]; } - dst = new VBuffer(dst.Length, values, indices); + dst = editor.Commit(); } /// @@ -466,7 +471,9 @@ public static void Densify(ref VBuffer dst) public static void DensifyFirst(ref VBuffer dst, int denseCount) { Contracts.Check(0 <= denseCount && denseCount <= dst.Length); - if (dst.IsDense || denseCount == 0 || (dst.Count >= denseCount && dst.Indices[denseCount - 1] == denseCount - 1)) + var dstValues = dst.GetValues(); + var dstIndices = dst.GetIndices(); + if (dst.IsDense || denseCount == 0 || (dstValues.Length >= denseCount && dstIndices[denseCount - 1] == denseCount - 1)) return; if (denseCount == dst.Length) { @@ -474,37 +481,36 @@ public static void DensifyFirst(ref VBuffer dst, int denseCount) return; } - // Densify the first BiasCount entries. - int[] indices = dst.Indices; - T[] values = dst.Values; - if (indices == null) + // Densify the first denseCount entries. 
+ if (dstIndices.IsEmpty) { - Contracts.Assert(dst.Count == 0); - indices = Utils.GetIdentityPermutation(denseCount); - Utils.EnsureSize(ref values, denseCount, dst.Length, keepOld: false); - Array.Clear(values, 0, denseCount); - dst = new VBuffer(dst.Length, denseCount, values, indices); + // no previous values + var newIndicesEditor = VBufferEditor.Create(ref dst, dst.Length, denseCount); + Utils.FillIdentity(newIndicesEditor.Indices, denseCount); + newIndicesEditor.Values.Clear(); + dst = newIndicesEditor.Commit(); return; } - int lim = Utils.FindIndexSorted(indices, 0, dst.Count, denseCount); + int lim = Utils.FindIndexSorted(dstIndices, 0, dstValues.Length, denseCount); Contracts.Assert(lim < denseCount); - int newLen = dst.Count + denseCount - lim; + int newLen = dstValues.Length + denseCount - lim; if (newLen == dst.Length) { Densify(ref dst); return; } - Utils.EnsureSize(ref values, newLen, dst.Length); - Utils.EnsureSize(ref indices, newLen, dst.Length); - Array.Copy(values, lim, values, denseCount, dst.Count - lim); - Array.Copy(indices, lim, indices, denseCount, dst.Count - lim); + + var editor = VBufferEditor.Create(ref dst, dst.Length, newLen, keepOldOnResize: true); + int sliceLength = dstValues.Length - lim; + editor.Values.Slice(lim, sliceLength).CopyTo(editor.Values.Slice(denseCount)); + editor.Indices.Slice(lim, sliceLength).CopyTo(editor.Indices.Slice(denseCount)); int i = lim - 1; for (int ii = denseCount; --ii >= 0; ) { - values[ii] = i >= 0 && indices[i] == ii ? values[i--] : default(T); - indices[ii] = ii; + editor.Values[ii] = i >= 0 && dstIndices[i] == ii ? dstValues[i--] : default(T); + editor.Indices[ii] = ii; } - dst = new VBuffer(dst.Length, newLen, values, indices); + dst = editor.Commit(); } /// @@ -522,9 +528,10 @@ public static void CreateMaybeSparseCopy(in VBuffer src, ref VBuffer ds int sparseCount = 0; var sparseCountThreshold = (int)(src.Length * sparsityThreshold); + var srcValues = src.GetValues(); for (int i = 0; i < src.Length; i++) { - if (!isDefaultPredicate(in src.Values[i])) + if (!isDefaultPredicate(in srcValues[i])) sparseCount++; if (sparseCount > sparseCountThreshold) @@ -534,23 +541,17 @@ public static void CreateMaybeSparseCopy(in VBuffer src, ref VBuffer ds } } - var indices = dst.Indices; - var values = dst.Values; - + var editor = VBufferEditor.Create(ref dst, src.Length, sparseCount); if (sparseCount > 0) { - if (Utils.Size(values) < sparseCount) - values = new T[sparseCount]; - if (Utils.Size(indices) < sparseCount) - indices = new int[sparseCount]; int j = 0; for (int i = 0; i < src.Length; i++) { - if (!isDefaultPredicate(in src.Values[i])) + if (!isDefaultPredicate(in srcValues[i])) { Contracts.Assert(j < sparseCount); - indices[j] = i; - values[j] = src.Values[i]; + editor.Indices[j] = i; + editor.Values[j] = srcValues[i]; j++; } } @@ -558,7 +559,7 @@ public static void CreateMaybeSparseCopy(in VBuffer src, ref VBuffer ds Contracts.Assert(j == sparseCount); } - dst = new VBuffer(src.Length, sparseCount, values, indices); + dst = editor.Commit(); } /// @@ -667,10 +668,10 @@ private static void ApplyWithCore(in VBuffer src, ref VBuffer< // of the "outer" parameter. There are nine, top level cases. Each case is // considered in this order. - // 1. src.Count == 0. + // 1. srcValues.Length == 0. // 2. src.Dense. // 3. dst.Dense. - // 4. dst.Count == 0. + // 4. dstValues.Length == 0. // Beyond this point the cases can assume both src/dst are sparse non-empty vectors. 
// We then calculate the size of the resulting output array, then use that to fall @@ -688,20 +689,24 @@ private static void ApplyWithCore(in VBuffer src, ref VBuffer< // Case 5 does not require special handling, because it falls through to other cases // that do the special handling for them. - if (src.Count == 0) + var srcValues = src.GetValues(); + var dstValues = dst.GetValues(); + var dstIndices = dst.GetIndices(); + var editor = VBufferEditor.CreateFromBuffer(ref dst); + if (srcValues.Length == 0) { - // Major case 1, with src.Count == 0. + // Major case 1, with srcValues.Length == 0. if (!outer) return; if (dst.IsDense) { for (int i = 0; i < dst.Length; i++) - manip(i, default(TSrc), ref dst.Values[i]); + manip(i, default(TSrc), ref editor.Values[i]); } else { - for (int i = 0; i < dst.Count; i++) - manip(dst.Indices[i], default(TSrc), ref dst.Values[i]); + for (int i = 0; i < dstValues.Length; i++) + manip(dstIndices[i], default(TSrc), ref editor.Values[i]); } return; } @@ -710,76 +715,81 @@ private static void ApplyWithCore(in VBuffer src, ref VBuffer< { // Major case 2, with src.Dense. if (!dst.IsDense) + { Densify(ref dst); + editor = VBufferEditor.CreateFromBuffer(ref dst); + } + // Both are now dense. Both cases of outer are covered. - for (int i = 0; i < src.Length; i++) - manip(i, src.Values[i], ref dst.Values[i]); + for (int i = 0; i < srcValues.Length; i++) + manip(i, srcValues[i], ref editor.Values[i]); return; } + var srcIndices = src.GetIndices(); if (dst.IsDense) { - // Major case 3, with dst.Dense. Note that !a.Dense. + // Major case 3, with dst.Dense. Note that !src.Dense. if (outer) { int sI = 0; - int sIndex = src.Indices[sI]; + int sIndex = srcIndices[sI]; for (int i = 0; i < dst.Length; ++i) { if (i == sIndex) { - manip(i, src.Values[sI], ref dst.Values[i]); - sIndex = ++sI == src.Count ? src.Length : src.Indices[sI]; + manip(i, srcValues[sI], ref editor.Values[i]); + sIndex = ++sI == srcValues.Length ? src.Length : srcIndices[sI]; } else - manip(i, default(TSrc), ref dst.Values[i]); + manip(i, default(TSrc), ref editor.Values[i]); } } else { - for (int i = 0; i < src.Count; i++) - manip(src.Indices[i], src.Values[i], ref dst.Values[src.Indices[i]]); + for (int i = 0; i < srcValues.Length; i++) + manip(srcIndices[i], srcValues[i], ref editor.Values[srcIndices[i]]); } return; } - if (dst.Count == 0) + if (dstValues.Length == 0) { // Major case 4, with dst empty. Note that !src.Dense. // Neither is dense, and dst is empty. Both cases of outer are covered. - var values = dst.Values; - var indices = dst.Indices; - Utils.EnsureSize(ref values, src.Count, src.Length); - Array.Clear(values, 0, src.Count); - Utils.EnsureSize(ref indices, src.Count, src.Length); - for (int i = 0; i < src.Count; i++) - manip(indices[i] = src.Indices[i], src.Values[i], ref values[i]); - dst = new VBuffer(src.Length, src.Count, values, indices); + editor = VBufferEditor.Create(ref dst, + src.Length, + srcValues.Length, + maxValuesCapacity: src.Length); + editor.Values.Clear(); + for (int i = 0; i < srcValues.Length; i++) + manip(editor.Indices[i] = srcIndices[i], srcValues[i], ref editor.Values[i]); + dst = editor.Commit(); return; } // Beyond this point, we can assume both a and b are sparse with positive count. int dI = 0; - int newCount = dst.Count; + int newCount = dstValues.Length; // Try to find each src index in dst indices, counting how many more we'll add. 
- for (int sI = 0; sI < src.Count; sI++) + for (int sI = 0; sI < srcValues.Length; sI++) { - int sIndex = src.Indices[sI]; - while (dI < dst.Count && dst.Indices[dI] < sIndex) + int sIndex = srcIndices[sI]; + while (dI < dstValues.Length && dstIndices[dI] < sIndex) dI++; - if (dI == dst.Count) + if (dI == dstValues.Length) { - newCount += src.Count - sI; + newCount += srcValues.Length - sI; break; } - if (dst.Indices[dI] == sIndex) + if (dstIndices[dI] == sIndex) dI++; else newCount++; } Contracts.Assert(newCount > 0); - Contracts.Assert(0 < src.Count && src.Count <= newCount); - Contracts.Assert(0 < dst.Count && dst.Count <= newCount); + Contracts.Assert(0 < srcValues.Length && srcValues.Length <= newCount); + Contracts.Assert(0 < dstValues.Length && dstValues.Length <= newCount); // REVIEW: Densify above a certain threshold, not just if // the output will necessarily become dense? But then we get into @@ -798,21 +808,23 @@ private static void ApplyWithCore(in VBuffer src, ref VBuffer< return; } - if (newCount != src.Count && newCount != dst.Count) + if (newCount != srcValues.Length && newCount != dstValues.Length) { // Major case 6, neither set of indices is a subset of the other. // This subcase used to fall through to another subcase, but this // proved to be inefficient so we go to the little bit of extra work // to handle it here. - var indices = dst.Indices; - var values = dst.Values; - Utils.EnsureSize(ref indices, newCount, dst.Length, keepOld: false); - Utils.EnsureSize(ref values, newCount, dst.Length, keepOld: false); - int sI = src.Count - 1; - dI = dst.Count - 1; - int sIndex = src.Indices[sI]; - int dIndex = dst.Indices[dI]; + editor = VBufferEditor.Create(ref dst, + src.Length, + newCount, + maxValuesCapacity: dst.Length); + var indices = editor.Indices; + var values = editor.Values; + int sI = srcValues.Length - 1; + dI = dstValues.Length - 1; + int sIndex = srcIndices[sI]; + int dIndex = dstIndices[dI]; // Go from the end, so that even if we're writing over dst's vectors in // place, we do not corrupt the data as we are reorganizing it. @@ -821,17 +833,17 @@ private static void ApplyWithCore(in VBuffer src, ref VBuffer< if (sIndex < dIndex) { indices[i] = dIndex; - values[i] = dst.Values[dI]; + values[i] = dstValues[dI]; if (outer) manip(dIndex, default(TSrc), ref values[i]); - dIndex = --dI >= 0 ? dst.Indices[dI] : -1; + dIndex = --dI >= 0 ? dstIndices[dI] : -1; } else if (sIndex > dIndex) { indices[i] = sIndex; values[i] = default(TDst); - manip(sIndex, src.Values[sI], ref values[i]); - sIndex = --sI >= 0 ? src.Indices[sI] : -1; + manip(sIndex, srcValues[sI], ref values[i]); + sIndex = --sI >= 0 ? srcIndices[sI] : -1; } else { @@ -839,84 +851,88 @@ private static void ApplyWithCore(in VBuffer src, ref VBuffer< Contracts.Assert(sIndex >= 0); Contracts.Assert(sIndex == dIndex); indices[i] = dIndex; - values[i] = dst.Values[dI]; - manip(sIndex, src.Values[sI], ref values[i]); - sIndex = --sI >= 0 ? src.Indices[sI] : -1; - dIndex = --dI >= 0 ? dst.Indices[dI] : -1; + values[i] = dstValues[dI]; + manip(sIndex, srcValues[sI], ref values[i]); + sIndex = --sI >= 0 ? srcIndices[sI] : -1; + dIndex = --dI >= 0 ? dstIndices[dI] : -1; } } - dst = new VBuffer(dst.Length, newCount, values, indices); + dst = editor.Commit(); return; } - if (newCount == dst.Count) + if (newCount == dstValues.Length) { - if (newCount == src.Count) + if (newCount == srcValues.Length) { // Major case 7, the set of indices is the same for src and dst. 
- Contracts.Assert(src.Count == dst.Count); - for (int i = 0; i < src.Count; i++) + Contracts.Assert(srcValues.Length == dstValues.Length); + for (int i = 0; i < srcValues.Length; i++) { - Contracts.Assert(src.Indices[i] == dst.Indices[i]); - manip(src.Indices[i], src.Values[i], ref dst.Values[i]); + Contracts.Assert(srcIndices[i] == dstIndices[i]); + manip(srcIndices[i], srcValues[i], ref editor.Values[i]); } return; } // Major case 8, the indices of src must be a subset of dst's indices. - Contracts.Assert(newCount > src.Count); + Contracts.Assert(newCount > srcValues.Length); dI = 0; if (outer) { int sI = 0; - int sIndex = src.Indices[sI]; - for (int i = 0; i < dst.Count; ++i) + int sIndex = srcIndices[sI]; + for (int i = 0; i < dstValues.Length; ++i) { - if (dst.Indices[i] == sIndex) + if (dstIndices[i] == sIndex) { - manip(sIndex, src.Values[sI], ref dst.Values[i]); - sIndex = ++sI == src.Count ? src.Length : src.Indices[sI]; + manip(sIndex, srcValues[sI], ref editor.Values[i]); + sIndex = ++sI == srcValues.Length ? src.Length : srcIndices[sI]; } else - manip(dst.Indices[i], default(TSrc), ref dst.Values[i]); + manip(dstIndices[i], default(TSrc), ref editor.Values[i]); } } else { - for (int sI = 0; sI < src.Count; sI++) + for (int sI = 0; sI < srcValues.Length; sI++) { - int sIndex = src.Indices[sI]; - while (dst.Indices[dI] < sIndex) + int sIndex = srcIndices[sI]; + while (dstIndices[dI] < sIndex) dI++; - Contracts.Assert(dst.Indices[dI] == sIndex); - manip(sIndex, src.Values[sI], ref dst.Values[dI++]); + Contracts.Assert(dstIndices[dI] == sIndex); + manip(sIndex, srcValues[sI], ref editor.Values[dI++]); } } return; } - if (newCount == src.Count) + if (newCount == srcValues.Length) { // Major case 9, the indices of dst must be a subset of src's indices. Both cases of outer are covered. // First do a "quasi" densification of dst, by making the indices // of dst correspond to those in src. 
+ editor = VBufferEditor.Create(ref dst, newCount, dstValues.Length); int sI = 0; - for (dI = 0; dI < dst.Count; ++dI) + for (dI = 0; dI < dstValues.Length; ++dI) { - int bIndex = dst.Indices[dI]; - while (src.Indices[sI] < bIndex) + int bIndex = dstIndices[dI]; + while (srcIndices[sI] < bIndex) sI++; - Contracts.Assert(src.Indices[sI] == bIndex); - dst.Indices[dI] = sI++; + Contracts.Assert(srcIndices[sI] == bIndex); + editor.Indices[dI] = sI++; } - dst = new VBuffer(newCount, dst.Count, dst.Values, dst.Indices); + dst = editor.Commit(); Densify(ref dst); - int[] indices = dst.Indices; - Utils.EnsureSize(ref indices, src.Count, src.Length, keepOld: false); - Array.Copy(src.Indices, indices, newCount); - dst = new VBuffer(src.Length, newCount, dst.Values, indices); - for (sI = 0; sI < src.Count; sI++) - manip(src.Indices[sI], src.Values[sI], ref dst.Values[sI]); + + editor = VBufferEditor.Create(ref dst, + src.Length, + newCount, + maxValuesCapacity: src.Length); + srcIndices.CopyTo(editor.Indices); + for (sI = 0; sI < srcValues.Length; sI++) + manip(srcIndices[sI], srcValues[sI], ref editor.Values[sI]); + dst = editor.Commit(); return; } @@ -933,73 +949,78 @@ private static void ApplyWithCoreCopy(in VBuffer src, ref VBuf { Contracts.Check(src.Length == dst.Length, "Vectors must have the same dimensionality."); Contracts.CheckValue(manip, nameof(manip)); - Contracts.Assert(Utils.Size(src.Values) >= src.Count); - Contracts.Assert(Utils.Size(dst.Values) >= dst.Count); + int length = src.Length; - if (dst.Count == 0) + var srcValues = src.GetValues(); + var dstValues = dst.GetValues(); + + if (dstValues.Length == 0) { - if (src.Count == 0) - res = new VBuffer(length, 0, res.Values, res.Indices); + if (srcValues.Length == 0) + { + Resize(ref res, length, 0); + } else if (src.IsDense) { - Contracts.Assert(src.Count == src.Length); - TDst[] resValues = Utils.Size(res.Values) >= length ? res.Values : new TDst[length]; + Contracts.Assert(srcValues.Length == src.Length); + var editor = VBufferEditor.Create(ref res, length); for (int i = 0; i < length; i++) - manip(i, src.Values[i], default(TDst), ref resValues[i]); - res = new VBuffer(length, resValues, res.Indices); + manip(i, srcValues[i], default(TDst), ref editor.Values[i]); + res = editor.Commit(); } else { // src is non-empty sparse. - int count = src.Count; + int count = srcValues.Length; Contracts.Assert(0 < count && count < length); - int[] resIndices = Utils.Size(res.Indices) >= count ? res.Indices : new int[count]; - TDst[] resValues = Utils.Size(res.Values) >= count ? res.Values : new TDst[count]; - Array.Copy(src.Indices, resIndices, count); + var editor = VBufferEditor.Create(ref res, length, count); + var srcIndices = src.GetIndices(); + srcIndices.CopyTo(editor.Indices); for (int ii = 0; ii < count; ii++) { - int i = src.Indices[ii]; - resIndices[ii] = i; - manip(i, src.Values[ii], default(TDst), ref resValues[ii]); + int i = srcIndices[ii]; + editor.Indices[ii] = i; + manip(i, srcValues[ii], default(TDst), ref editor.Values[ii]); } - res = new VBuffer(length, count, resValues, resIndices); + res = editor.Commit(); } } else if (dst.IsDense) { - TDst[] resValues = Utils.Size(res.Values) >= length ? res.Values : new TDst[length]; - if (src.Count == 0) + var editor = VBufferEditor.Create(ref res, length); + if (srcValues.Length == 0) { if (outer) { // Apply manip to all slots, as all slots of dst are defined. 
for (int j = 0; j < length; j++) - manip(j, default(TSrc), dst.Values[j], ref resValues[j]); + manip(j, default(TSrc), dstValues[j], ref editor.Values[j]); } else { // Copy only. No slot of src is defined. for (int j = 0; j < length; j++) - resValues[j] = dst.Values[j]; + editor.Values[j] = dstValues[j]; } - res = new VBuffer(length, resValues, res.Indices); + res = editor.Commit(); } else if (src.IsDense) { - Contracts.Assert(src.Count == src.Length); + Contracts.Assert(srcValues.Length == src.Length); for (int i = 0; i < length; i++) - manip(i, src.Values[i], dst.Values[i], ref resValues[i]); - res = new VBuffer(length, resValues, res.Indices); + manip(i, srcValues[i], dstValues[i], ref editor.Values[i]); + res = editor.Commit(); } else { // src is sparse and non-empty. - int count = src.Count; + int count = srcValues.Length; Contracts.Assert(0 < count && count < length); int ii = 0; - int i = src.Indices[ii]; + var srcIndices = src.GetIndices(); + int i = srcIndices[ii]; if (outer) { // All slots of dst are defined. Always apply manip. @@ -1007,11 +1028,11 @@ private static void ApplyWithCoreCopy(in VBuffer src, ref VBuf { if (j == i) { - manip(j, src.Values[ii], dst.Values[j], ref resValues[j]); - i = ++ii == count ? length : src.Indices[ii]; + manip(j, srcValues[ii], dstValues[j], ref editor.Values[j]); + i = ++ii == count ? length : srcIndices[ii]; } else - manip(j, default(TSrc), dst.Values[j], ref resValues[j]); + manip(j, default(TSrc), dstValues[j], ref editor.Values[j]); } } else @@ -1021,88 +1042,89 @@ private static void ApplyWithCoreCopy(in VBuffer src, ref VBuf { if (j == i) { - manip(j, src.Values[ii], dst.Values[j], ref resValues[j]); - i = ++ii == count ? length : src.Indices[ii]; + manip(j, srcValues[ii], dstValues[j], ref editor.Values[j]); + i = ++ii == count ? length : srcIndices[ii]; } else - resValues[j] = dst.Values[j]; + editor.Values[j] = dstValues[j]; } } - res = new VBuffer(length, resValues, res.Indices); + res = editor.Commit(); } } else { // dst is non-empty sparse - int dstCount = dst.Count; + int dstCount = dstValues.Length; + var dstIndices = dst.GetIndices(); Contracts.Assert(dstCount > 0); - if (src.Count == 0) + if (srcValues.Length == 0) { - int[] resIndices = Utils.Size(res.Indices) >= dstCount ? res.Indices : new int[dstCount]; - TDst[] resValues = Utils.Size(res.Values) >= dstCount ? res.Values : new TDst[dstCount]; + var editor = VBufferEditor.Create(ref res, length, dstCount); if (outer) { for (int jj = 0; jj < dstCount; jj++) { - int j = dst.Indices[jj]; - resIndices[jj] = j; - manip(j, default(TSrc), dst.Values[jj], ref resValues[jj]); + int j = dstIndices[jj]; + editor.Indices[jj] = j; + manip(j, default(TSrc), dstValues[jj], ref editor.Values[jj]); } } else { for (int jj = 0; jj < dstCount; jj++) { - resIndices[jj] = dst.Indices[jj]; - resValues[jj] = dst.Values[jj]; + editor.Indices[jj] = dstIndices[jj]; + editor.Values[jj] = dstValues[jj]; } } - res = new VBuffer(length, dstCount, resValues, resIndices); + res = editor.Commit(); } else if (src.IsDense) { // res will be dense. - TDst[] resValues = Utils.Size(res.Values) >= length ? res.Values : new TDst[length]; + var editor = VBufferEditor.Create(ref res, length); int jj = 0; - int j = dst.Indices[jj]; + int j = dstIndices[jj]; for (int i = 0; i < length; i++) { if (i == j) { - manip(i, src.Values[i], dst.Values[jj], ref resValues[i]); - j = ++jj == dstCount ? length : dst.Indices[jj]; + manip(i, srcValues[i], dstValues[jj], ref editor.Values[i]); + j = ++jj == dstCount ? 
length : dstIndices[jj]; } else - manip(i, src.Values[i], default(TDst), ref resValues[i]); + manip(i, srcValues[i], default(TDst), ref editor.Values[i]); } - res = new VBuffer(length, resValues, res.Indices); + res = editor.Commit(); } else { // Both src and dst are non-empty sparse. - Contracts.Assert(src.Count > 0); + Contracts.Assert(srcValues.Length > 0); // Find the count of result, which is the size of the union of the indices set of src and dst. int resCount = dstCount; - for (int ii = 0, jj = 0; ii < src.Count; ii++) + var srcIndices = src.GetIndices(); + for (int ii = 0, jj = 0; ii < srcValues.Length; ii++) { - int i = src.Indices[ii]; - while (jj < dst.Count && dst.Indices[jj] < i) + int i = srcIndices[ii]; + while (jj < dstValues.Length && dstIndices[jj] < i) jj++; - if (jj == dst.Count) + if (jj == dstValues.Length) { - resCount += src.Count - ii; + resCount += srcValues.Length - ii; break; } - if (dst.Indices[jj] == i) + if (dstIndices[jj] == i) jj++; else resCount++; } Contracts.Assert(0 < resCount && resCount <= length); - Contracts.Assert(resCount <= src.Count + dstCount); - Contracts.Assert(src.Count <= resCount); + Contracts.Assert(resCount <= srcValues.Length + dstCount); + Contracts.Assert(srcValues.Length <= resCount); Contracts.Assert(dstCount <= resCount); if (resCount == length) @@ -1115,13 +1137,12 @@ private static void ApplyWithCoreCopy(in VBuffer src, ref VBuf } else { - int[] resIndices = Utils.Size(res.Indices) >= resCount ? res.Indices : new int[resCount]; - TDst[] resValues = Utils.Size(res.Values) >= resCount ? res.Values : new TDst[resCount]; + var editor = VBufferEditor.Create(ref res, length, resCount); int ii = 0; - int i = src.Indices[ii]; + int i = srcIndices[ii]; int jj = 0; - int j = dst.Indices[jj]; + int j = dstIndices[jj]; for (int kk = 0; kk < resCount; kk++) { @@ -1129,35 +1150,35 @@ private static void ApplyWithCoreCopy(in VBuffer src, ref VBuf if (i == j) { // Slot (i == j) both defined in src and dst. Apply manip. - resIndices[kk] = i; - manip(i, src.Values[ii], dst.Values[jj], ref resValues[kk]); - i = ++ii == src.Count ? length : src.Indices[ii]; - j = ++jj == dstCount ? length : dst.Indices[jj]; + editor.Indices[kk] = i; + manip(i, srcValues[ii], dstValues[jj], ref editor.Values[kk]); + i = ++ii == srcValues.Length ? length : srcIndices[ii]; + j = ++jj == dstCount ? length : dstIndices[jj]; } else if (i < j) { // Slot i defined only in src, but not in dst. Apply manip. - resIndices[kk] = i; - manip(i, src.Values[ii], default(TDst), ref resValues[kk]); - i = ++ii == src.Count ? length : src.Indices[ii]; + editor.Indices[kk] = i; + manip(i, srcValues[ii], default(TDst), ref editor.Values[kk]); + i = ++ii == srcValues.Length ? length : srcIndices[ii]; } else { // Slot j defined only in dst, but not in src. Apply manip if outer. // Otherwise just copy. - resIndices[kk] = j; + editor.Indices[kk] = j; // REVIEW: Should we move checking of outer outside the loop? if (outer) - manip(j, default(TSrc), dst.Values[jj], ref resValues[kk]); + manip(j, default(TSrc), dstValues[jj], ref editor.Values[kk]); else - resValues[kk] = dst.Values[jj]; - j = ++jj == dstCount ? length : dst.Indices[jj]; + editor.Values[kk] = dstValues[jj]; + j = ++jj == dstCount ? 
length : dstIndices[jj]; } } - Contracts.Assert(ii == src.Count && jj == dstCount); + Contracts.Assert(ii == srcValues.Length && jj == dstCount); Contracts.Assert(i == length && j == length); - res = new VBuffer(length, resCount, resValues, resIndices); + res = editor.Commit(); } } } @@ -1177,29 +1198,34 @@ public static void ApplyIntoEitherDefined(in VBuffer src, ref { Contracts.CheckValue(func, nameof(func)); + var srcValues = src.GetValues(); + // REVIEW: The analogous WritableVector method insisted on // equal lengths, but I don't care here. - if (src.Count == 0) + if (srcValues.Length == 0) { - dst = new VBuffer(src.Length, src.Count, dst.Values, dst.Indices); + Resize(ref dst, src.Length, 0); return; } - int[] indices = dst.Indices; - TDst[] values = dst.Values; - Utils.EnsureSize(ref values, src.Count, src.Length, keepOld: false); + var editor = VBufferEditor.Create(ref dst, + src.Length, + srcValues.Length, + maxValuesCapacity: src.Length); + Span values = editor.Values; if (src.IsDense) { for (int i = 0; i < src.Length; ++i) - values[i] = func(i, src.Values[i]); + values[i] = func(i, srcValues[i]); } else { - Utils.EnsureSize(ref indices, src.Count, src.Length, keepOld: false); - Array.Copy(src.Indices, indices, src.Count); - for (int i = 0; i < src.Count; ++i) - values[i] = func(src.Indices[i], src.Values[i]); + Span indices = editor.Indices; + var srcIndices = src.GetIndices(); + srcIndices.CopyTo(indices); + for (int i = 0; i < srcValues.Length; ++i) + values[i] = func(srcIndices[i], srcValues[i]); } - dst = new VBuffer(src.Length, src.Count, values, indices); + dst = editor.Commit(); } /// @@ -1226,54 +1252,61 @@ public static void ApplyInto(in VBuffer a, in VBuffer // 5. b's indices are a subset of a's. // 6. Neither a nor b's indices are a subset of the other. - if (a.Count == 0 && b.Count == 0) + var aValues = a.GetValues(); + var bValues = b.GetValues(); + if (aValues.Length == 0 && bValues.Length == 0) { // Case 1. Output will be empty. - dst = new VBuffer(a.Length, 0, dst.Values, dst.Indices); + Resize(ref dst, a.Length, 0); return; } int aI = 0; int bI = 0; - TDst[] values = dst.Values; + ReadOnlySpan aIndices; + ReadOnlySpan bIndices; + VBufferEditor editor; if (a.IsDense || b.IsDense) { // Case 2. One of the two inputs is dense. The output will be dense. - Utils.EnsureSize(ref values, a.Length, a.Length, keepOld: false); - + editor = VBufferEditor.Create(ref dst, a.Length); if (!a.IsDense) { // a is sparse, b is dense + aIndices = a.GetIndices(); for (int i = 0; i < b.Length; i++) { - TSrc1 aVal = (aI < a.Count && i == a.Indices[aI]) ? a.Values[aI++] : default(TSrc1); - values[i] = func(i, aVal, b.Values[i]); + TSrc1 aVal = (aI < aIndices.Length && i == aIndices[aI]) ? aValues[aI++] : default(TSrc1); + editor.Values[i] = func(i, aVal, bValues[i]); } } else if (!b.IsDense) { // b is sparse, a is dense + bIndices = b.GetIndices(); for (int i = 0; i < a.Length; i++) { - TSrc2 bVal = (bI < b.Count && i == b.Indices[bI]) ? b.Values[bI++] : default(TSrc2); - values[i] = func(i, a.Values[i], bVal); + TSrc2 bVal = (bI < bIndices.Length && i == bIndices[bI]) ? bValues[bI++] : default(TSrc2); + editor.Values[i] = func(i, aValues[i], bVal); } } else { // both dense for (int i = 0; i < a.Length; i++) - values[i] = func(i, a.Values[i], b.Values[i]); + editor.Values[i] = func(i, aValues[i], bValues[i]); } - dst = new VBuffer(a.Length, values, dst.Indices); + dst = editor.Commit(); return; } // a, b both sparse. 
int newCount = 0; - while (aI < a.Count && bI < b.Count) + aIndices = a.GetIndices(); + bIndices = b.GetIndices(); + while (aI < aIndices.Length && bI < bIndices.Length) { - int aCompB = a.Indices[aI] - b.Indices[bI]; + int aCompB = aIndices[aI] - bIndices[bI]; if (aCompB <= 0) // a is no larger than b. aI++; if (aCompB >= 0) // b is no larger than a. @@ -1281,58 +1314,57 @@ public static void ApplyInto(in VBuffer a, in VBuffer newCount++; } - if (aI < a.Count) - newCount += a.Count - aI; - if (bI < b.Count) - newCount += b.Count - bI; + if (aI < aIndices.Length) + newCount += aIndices.Length - aI; + if (bI < bIndices.Length) + newCount += bIndices.Length - bI; // REVIEW: Worth optimizing the newCount == a.Length case? // Probably not... - int[] indices = dst.Indices; - Utils.EnsureSize(ref indices, newCount, a.Length, keepOld: false); - Utils.EnsureSize(ref values, newCount, a.Length, keepOld: false); + editor = VBufferEditor.Create(ref dst, a.Length, newCount); + Span indices = editor.Indices; - if (newCount == b.Count) + if (newCount == bValues.Length) { - if (newCount == a.Count) + if (newCount == aValues.Length) { // Case 3, a and b actually have the same indices! - Array.Copy(a.Indices, indices, a.Count); - for (aI = 0; aI < a.Count; aI++) + aIndices.CopyTo(indices); + for (aI = 0; aI < aValues.Length; aI++) { - Contracts.Assert(a.Indices[aI] == b.Indices[aI]); - values[aI] = func(a.Indices[aI], a.Values[aI], b.Values[aI]); + Contracts.Assert(aIndices[aI] == bIndices[aI]); + editor.Values[aI] = func(aIndices[aI], aValues[aI], bValues[aI]); } } else { // Case 4, a's indices are a subset of b's. - Array.Copy(b.Indices, indices, b.Count); + bIndices.CopyTo(indices); aI = 0; - for (bI = 0; aI < a.Count && bI < b.Count; bI++) + for (bI = 0; aI < aValues.Length && bI < bValues.Length; bI++) { - Contracts.Assert(a.Indices[aI] >= b.Indices[bI]); - TSrc1 aVal = a.Indices[aI] == b.Indices[bI] ? a.Values[aI++] : default(TSrc1); - values[bI] = func(b.Indices[bI], aVal, b.Values[bI]); + Contracts.Assert(aIndices[aI] >= bIndices[bI]); + TSrc1 aVal = aIndices[aI] == bIndices[bI] ? aValues[aI++] : default(TSrc1); + editor.Values[bI] = func(bIndices[bI], aVal, bValues[bI]); } - for (; bI < b.Count; bI++) - values[bI] = func(b.Indices[bI], default(TSrc1), b.Values[bI]); + for (; bI < bValues.Length; bI++) + editor.Values[bI] = func(bIndices[bI], default(TSrc1), bValues[bI]); } } - else if (newCount == a.Count) + else if (newCount == aValues.Length) { // Case 5, b's indices are a subset of a's. - Array.Copy(a.Indices, indices, a.Count); + aIndices.CopyTo(indices); bI = 0; - for (aI = 0; bI < b.Count && aI < a.Count; aI++) + for (aI = 0; bI < bValues.Length && aI < aValues.Length; aI++) { - Contracts.Assert(b.Indices[bI] >= a.Indices[aI]); - TSrc2 bVal = a.Indices[aI] == b.Indices[bI] ? b.Values[bI++] : default(TSrc2); - values[aI] = func(a.Indices[aI], a.Values[aI], bVal); + Contracts.Assert(bIndices[bI] >= aIndices[aI]); + TSrc2 bVal = aIndices[aI] == bIndices[bI] ? 
bValues[bI++] : default(TSrc2); + editor.Values[aI] = func(aIndices[aI], aValues[aI], bVal); } - for (; aI < a.Count; aI++) - values[aI] = func(a.Indices[aI], a.Values[aI], default(TSrc2)); + for (; aI < aValues.Length; aI++) + editor.Values[aI] = func(aIndices[aI], aValues[aI], default(TSrc2)); } else { @@ -1340,49 +1372,49 @@ public static void ApplyInto(in VBuffer a, in VBuffer int newI = aI = bI = 0; TSrc1 aVal = default(TSrc1); TSrc2 bVal = default(TSrc2); - while (aI < a.Count && bI < b.Count) + while (aI < aIndices.Length && bI < bIndices.Length) { - int aCompB = a.Indices[aI] - b.Indices[bI]; + int aCompB = aIndices[aI] - bIndices[bI]; int index = 0; if (aCompB < 0) { - index = a.Indices[aI]; - aVal = a.Values[aI++]; + index = aIndices[aI]; + aVal = aValues[aI++]; bVal = default(TSrc2); } else if (aCompB > 0) { - index = b.Indices[bI]; + index = bIndices[bI]; aVal = default(TSrc1); - bVal = b.Values[bI++]; + bVal = bValues[bI++]; } else { - index = a.Indices[aI]; - Contracts.Assert(index == b.Indices[bI]); - aVal = a.Values[aI++]; - bVal = b.Values[bI++]; + index = aIndices[aI]; + Contracts.Assert(index == bIndices[bI]); + aVal = aValues[aI++]; + bVal = bValues[bI++]; } - values[newI] = func(index, aVal, bVal); + editor.Values[newI] = func(index, aVal, bVal); indices[newI++] = index; } - for (; aI < a.Count; aI++) + for (; aI < aIndices.Length; aI++) { - int index = a.Indices[aI]; - values[newI] = func(index, a.Values[aI], default(TSrc2)); + int index = aIndices[aI]; + editor.Values[newI] = func(index, aValues[aI], default(TSrc2)); indices[newI++] = index; } - for (; bI < b.Count; bI++) + for (; bI < bIndices.Length; bI++) { - int index = b.Indices[bI]; - values[newI] = func(index, default(TSrc1), b.Values[bI]); + int index = bIndices[bI]; + editor.Values[newI] = func(index, default(TSrc1), bValues[bI]); indices[newI++] = index; } } - dst = new VBuffer(a.Length, newCount, values, indices); + dst = editor.Commit(); } /// @@ -1391,14 +1423,26 @@ public static void ApplyInto(in VBuffer a, in VBuffer public static void Copy(List src, ref VBuffer dst, int length) { Contracts.CheckParam(0 <= length && length <= Utils.Size(src), nameof(length)); - var values = dst.Values; + var editor = VBufferEditor.Create(ref dst, length); if (length > 0) { - if (Utils.Size(values) < length) - values = new T[length]; - src.CopyTo(values); + // List.CopyTo should have an overload for Span - https://github.com/dotnet/corefx/issues/33006 + for (int i = 0; i < length; i++) + { + editor.Values[i] = src[i]; + } } - dst = new VBuffer(length, values, dst.Indices); + dst = editor.Commit(); + } + + /// + /// Updates the logical length and number of physical values to be represented in + /// , while preserving the underlying buffers. + /// + public static void Resize(ref VBuffer dst, int newLogicalLength, int? valuesCount = null) + { + dst = VBufferEditor.Create(ref dst, newLogicalLength, valuesCount) + .Commit(); } } } diff --git a/src/Microsoft.ML.CpuMath/AlignedArray.cs b/src/Microsoft.ML.CpuMath/AlignedArray.cs index 87583a8ef6..9902edc4df 100644 --- a/src/Microsoft.ML.CpuMath/AlignedArray.cs +++ b/src/Microsoft.ML.CpuMath/AlignedArray.cs @@ -146,7 +146,7 @@ public void CopyFrom(int start, Float[] src, int index, int count) // valuesSrc contains only the non-zero entries. Those are copied into their logical positions in the dense array. // rgposSrc contains the logical positions + offset of the non-zero entries in the dense array. // rgposSrc runs parallel to the valuesSrc array. 
- public void CopyFrom(int[] rgposSrc, Float[] valuesSrc, int posMin, int iposMin, int iposLim, bool zeroItems) + public void CopyFrom(ReadOnlySpan rgposSrc, ReadOnlySpan valuesSrc, int posMin, int iposMin, int iposLim, bool zeroItems) { Contracts.Assert(rgposSrc != null); Contracts.Assert(valuesSrc != null); diff --git a/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs b/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs index e9d95ccc1d..973b2278a3 100644 --- a/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs +++ b/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs @@ -88,7 +88,7 @@ public static void MatrixTimesSource(bool transpose, AlignedArray matrix, Aligne } } - public static void MatrixTimesSource(AlignedArray matrix, int[] rgposSrc, AlignedArray sourceValues, + public static void MatrixTimesSource(AlignedArray matrix, ReadOnlySpan rgposSrc, AlignedArray sourceValues, int posMin, int iposMin, int iposLimit, AlignedArray destination, int stride) { Contracts.AssertValue(rgposSrc); diff --git a/src/Microsoft.ML.CpuMath/CpuMathUtils.netstandard.cs b/src/Microsoft.ML.CpuMath/CpuMathUtils.netstandard.cs index bc9569d390..5d54ee6fe0 100644 --- a/src/Microsoft.ML.CpuMath/CpuMathUtils.netstandard.cs +++ b/src/Microsoft.ML.CpuMath/CpuMathUtils.netstandard.cs @@ -18,7 +18,7 @@ public static int GetVectorAlignment() public static void MatrixTimesSource(bool transpose, AlignedArray matrix, AlignedArray source, AlignedArray destination, int stride) => SseUtils.MatTimesSrc(transpose, matrix, source, destination, stride); - public static void MatrixTimesSource(AlignedArray matrix, int[] rgposSrc, AlignedArray sourceValues, + public static void MatrixTimesSource(AlignedArray matrix, ReadOnlySpan rgposSrc, AlignedArray sourceValues, int posMin, int iposMin, int iposLimit, AlignedArray destination, int stride) => SseUtils.MatTimesSrc(matrix, rgposSrc, sourceValues, posMin, iposMin, iposLimit, destination, stride); public static void Add(float value, Span destination) => SseUtils.Add(value, destination); diff --git a/src/Microsoft.ML.CpuMath/Sse.cs b/src/Microsoft.ML.CpuMath/Sse.cs index 3ff59f2840..5c4ace31cd 100644 --- a/src/Microsoft.ML.CpuMath/Sse.cs +++ b/src/Microsoft.ML.CpuMath/Sse.cs @@ -57,13 +57,12 @@ public static void MatTimesSrc(bool tran, AlignedArray mat, AlignedArray src, Al } } - public static void MatTimesSrc(AlignedArray mat, int[] rgposSrc, AlignedArray srcValues, + public static void MatTimesSrc(AlignedArray mat, ReadOnlySpan rgposSrc, AlignedArray srcValues, int posMin, int iposMin, int iposLim, AlignedArray dst, int crun) { Contracts.Assert(Compat(mat)); Contracts.Assert(Compat(srcValues)); Contracts.Assert(Compat(dst)); - Contracts.AssertValue(rgposSrc); Contracts.Assert(0 <= iposMin && iposMin <= iposLim && iposLim <= rgposSrc.Length); Contracts.Assert(mat.Size == dst.Size * srcValues.Size); diff --git a/src/Microsoft.ML.CpuMath/SseIntrinsics.cs b/src/Microsoft.ML.CpuMath/SseIntrinsics.cs index cf85a98132..8e1755e797 100644 --- a/src/Microsoft.ML.CpuMath/SseIntrinsics.cs +++ b/src/Microsoft.ML.CpuMath/SseIntrinsics.cs @@ -276,7 +276,7 @@ public static unsafe void MatMul(ReadOnlySpan mat, ReadOnlySpan sr } // Partial sparse source vector. 
- public static unsafe void MatMulP(AlignedArray mat, int[] rgposSrc, AlignedArray src, + public static unsafe void MatMulP(AlignedArray mat, ReadOnlySpan rgposSrc, AlignedArray src, int posMin, int iposMin, int iposEnd, AlignedArray dst, int crow, int ccol) { MatMulP(mat.Items, rgposSrc, src.Items, posMin, iposMin, iposEnd, dst.Items, crow, ccol); diff --git a/src/Microsoft.ML.Data/Commands/ShowSchemaCommand.cs b/src/Microsoft.ML.Data/Commands/ShowSchemaCommand.cs index 4fb0738e58..5fbab5176a 100644 --- a/src/Microsoft.ML.Data/Commands/ShowSchemaCommand.cs +++ b/src/Microsoft.ML.Data/Commands/ShowSchemaCommand.cs @@ -279,7 +279,7 @@ private static void ShowMetadataValueVec(IndentingTextWriter itw, ISchema sch var value = default(VBuffer); schema.GetMetadata(kind, col, ref value); - itw.Write(": Length={0}, Count={0}", value.Length, value.Count); + itw.Write(": Length={0}, Count={0}", value.Length, value.GetValues().Length); using (itw.Nest()) { diff --git a/src/Microsoft.ML.Data/Data/BufferBuilder.cs b/src/Microsoft.ML.Data/Data/BufferBuilder.cs index 5020ae0418..2f37f4ea81 100644 --- a/src/Microsoft.ML.Data/Data/BufferBuilder.cs +++ b/src/Microsoft.ML.Data/Data/BufferBuilder.cs @@ -382,45 +382,6 @@ public bool TryGetFeature(int index, out T v) return false; } - private void GetResult(ref T[] values, ref int[] indices, out int count, out int length) - { - if (_count == 0) - { - count = 0; - length = _length; - return; - } - - if (!_dense) - { - if (!_sorted) - SortAndSumDups(); - if (!_dense && _count >= _length / 2) - MakeDense(); - } - - if (_dense) - { - if (Utils.Size(values) < _length) - values = new T[_length]; - Array.Copy(_values, values, _length); - count = _length; - length = _length; - } - else - { - Contracts.Assert(_count < _length); - if (Utils.Size(values) < _count) - values = new T[_count]; - if (Utils.Size(indices) < _count) - indices = new int[_count]; - Array.Copy(_values, values, _count); - Array.Copy(_indices, indices, _count); - count = _count; - length = _length; - } - } - public void Reset(int length, bool dense) { ResetImpl(length, dense); @@ -431,11 +392,11 @@ public void AddFeatures(int index, in VBuffer buffer) { Contracts.Check(0 <= index && index <= _length - buffer.Length); - int count = buffer.Count; + var values = buffer.GetValues(); + int count = values.Length; if (count == 0) return; - var values = buffer.Values; if (buffer.IsDense) { Contracts.Assert(count == buffer.Length); @@ -454,7 +415,7 @@ public void AddFeatures(int index, in VBuffer buffer) else { // REVIEW: Validate indices! 
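A minimal sketch (not part of the diff) of the read-side pattern the AddFeatures change above adopts: callers now go through GetValues()/GetIndices() instead of Count/Values/Indices. The ForEachDefined helper below is a made-up name for illustration; only VBuffer's accessors (GetValues, GetIndices, IsDense, Length) come from this commit, and the usings/namespace are assumed.

internal static class VBufferReadSketch
{
    // Hypothetical helper: visit every explicitly-stored slot of a VBuffer<T>.
    // Assumes `using System;` plus the VBuffer namespace as of this commit.
    public static void ForEachDefined<T>(in VBuffer<T> buffer, Action<int, T> visit)
    {
        var values = buffer.GetValues();            // only the explicit entries
        if (buffer.IsDense)
        {
            // Dense: values.Length == buffer.Length and slot i holds values[i].
            for (int i = 0; i < values.Length; i++)
                visit(i, values[i]);
        }
        else
        {
            // Sparse: indices run parallel to values and name the logical slots.
            var indices = buffer.GetIndices();
            for (int i = 0; i < values.Length; i++)
                visit(indices[i], values[i]);
        }
    }
}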
- var indices = buffer.Indices; + var indices = buffer.GetIndices(); if (_dense) { for (int i = 0; i < count; i++) @@ -471,24 +432,34 @@ public void AddFeatures(int index, in VBuffer buffer) public void GetResult(ref VBuffer buffer) { - var values = buffer.Values; - var indices = buffer.Indices; - if (IsEmpty) { - buffer = new VBuffer(_length, 0, values, indices); + VBufferUtils.Resize(ref buffer, _length, 0); return; } - int count; - int length; - GetResult(ref values, ref indices, out count, out length); - Contracts.Assert(0 <= count && count <= length); + if (!_dense) + { + if (!_sorted) + SortAndSumDups(); + if (!_dense && _count >= _length / 2) + MakeDense(); + } - if (count == length) - buffer = new VBuffer(length, values, indices); + if (_dense) + { + var editor = VBufferEditor.Create(ref buffer, _length); + _values.AsSpan(0, _length).CopyTo(editor.Values); + buffer = editor.Commit(); + } else - buffer = new VBuffer(length, count, values, indices); + { + Contracts.Assert(_count < _length); + var editor = VBufferEditor.Create(ref buffer, _length, _count); + _values.AsSpan(0, _count).CopyTo(editor.Values); + _indices.AsSpan(0, _count).CopyTo(editor.Indices); + buffer = editor.Commit(); + } } } } diff --git a/src/Microsoft.ML.Data/DataLoadSave/Binary/Codecs.cs b/src/Microsoft.ML.Data/DataLoadSave/Binary/Codecs.cs index 792ec4f9f3..29fa46c1a0 100644 --- a/src/Microsoft.ML.Data/DataLoadSave/Binary/Codecs.cs +++ b/src/Microsoft.ML.Data/DataLoadSave/Binary/Codecs.cs @@ -1109,29 +1109,29 @@ public override void Get(ref VBuffer value) int length = FixedLength ? _size : _lengths[_vectorIndex]; int count = _counts[_vectorIndex]; - int[] indices = value.Indices; - T[] values = value.Values; if (count < 0) { // dense + var editor = VBufferEditor.Create(ref value, length); if (length > 0) { - Utils.EnsureSize(ref values, length); - Array.Copy(_values, _valuesOffset, values, 0, length); + _values.AsSpan(_valuesOffset, length) + .CopyTo(editor.Values); } - value = new VBuffer(length, values, indices); + value = editor.Commit(); } else { // sparse + var editor = VBufferEditor.Create(ref value, length, count); if (count > 0) { - Utils.EnsureSize(ref values, count); - Utils.EnsureSize(ref indices, count); - Array.Copy(_values, _valuesOffset, values, 0, count); - Array.Copy(_indices, _indicesOffset, indices, 0, count); + _values.AsSpan(_valuesOffset, count) + .CopyTo(editor.Values); + _indices.AsSpan(_indicesOffset, count) + .CopyTo(editor.Indices); } - value = new VBuffer(length, count, values, indices); + value = editor.Commit(); } } } diff --git a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderParser.cs b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderParser.cs index 130f158d4e..cdcb507f51 100644 --- a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderParser.cs +++ b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderParser.cs @@ -401,28 +401,22 @@ public void Get(ref VBuffer dst) { AssertValid(); - var values = dst.Values; - var indices = dst.Indices; - if (_count == 0) { - dst = new VBuffer(_size, 0, values, indices); + VBufferUtils.Resize(ref dst, _size, 0); return; } - if (Utils.Size(values) < _count) - values = new TItem[_count]; - Array.Copy(_values, values, _count); + var editor = VBufferEditor.Create(ref dst, _size, _count); + _values.AsSpan(0, _count).CopyTo(editor.Values); if (_count == _size) { - dst = new VBuffer(_size, values, indices); + dst = editor.Commit(); return; } - if (Utils.Size(indices) < _count) - indices = new int[_count]; - Array.Copy(_indices, indices, _count); - dst 
= new VBuffer(_size, _count, values, indices); + _indices.AsSpan(0, _count).CopyTo(editor.Indices); + dst = editor.Commit(); } } diff --git a/src/Microsoft.ML.Data/DataLoadSave/Text/TextSaver.cs b/src/Microsoft.ML.Data/DataLoadSave/Text/TextSaver.cs index 8161c8e653..9474605079 100644 --- a/src/Microsoft.ML.Data/DataLoadSave/Text/TextSaver.cs +++ b/src/Microsoft.ML.Data/DataLoadSave/Text/TextSaver.cs @@ -166,20 +166,22 @@ public VecValueWriter(IRowCursor cursor, VectorType type, int source, char sep) public override void WriteData(Action appendItem, out int length) { _getSrc(ref _src); + var srcValues = _src.GetValues(); if (_src.IsDense) { - for (int i = 0; i < _src.Length; i++) + for (int i = 0; i < srcValues.Length; i++) { - Conv(in _src.Values[i], ref Sb); + Conv(in srcValues[i], ref Sb); appendItem(Sb, i); } } else { - for (int i = 0; i < _src.Count; i++) + var srcIndices = _src.GetIndices(); + for (int i = 0; i < srcValues.Length; i++) { - Conv(in _src.Values[i], ref Sb); - appendItem(Sb, _src.Indices[i]); + Conv(in srcValues[i], ref Sb); + appendItem(Sb, srcIndices[i]); } } length = _src.Length; @@ -188,15 +190,18 @@ public override void WriteData(Action appendItem, out int le public override void WriteHeader(Action appendItem, out int length) { length = _slotCount; - if (_slotNames.Count == 0) + var slotNamesValues = _slotNames.GetValues(); + if (slotNamesValues.Length == 0) return; - for (int i = 0; i < _slotNames.Count; i++) + + var slotNamesIndices = _slotNames.GetIndices(); + for (int i = 0; i < slotNamesValues.Length; i++) { - var name = _slotNames.Values[i]; + var name = slotNamesValues[i]; if (name.IsEmpty) continue; MapText(in name, ref Sb); - int index = _slotNames.IsDense ? i : _slotNames.Indices[i]; + int index = _slotNames.IsDense ? i : slotNamesIndices[i]; appendItem(Sb, index); } } diff --git a/src/Microsoft.ML.Data/DataView/CompositeSchema.cs b/src/Microsoft.ML.Data/DataView/CompositeSchema.cs index 2a526f152a..d61289b55c 100644 --- a/src/Microsoft.ML.Data/DataView/CompositeSchema.cs +++ b/src/Microsoft.ML.Data/DataView/CompositeSchema.cs @@ -67,7 +67,7 @@ public void CheckColumnInRange(int col) public void GetColumnSource(int col, out int srcIndex, out int srcCol) { CheckColumnInRange(col); - if (!_cumulativeColCounts.TryFindIndexSorted(0, _cumulativeColCounts.Length, col, out srcIndex)) + if (!Utils.TryFindIndexSorted(_cumulativeColCounts, 0, _cumulativeColCounts.Length, col, out srcIndex)) srcIndex--; Contracts.Assert(0 <= srcIndex && srcIndex < _cumulativeColCounts.Length); srcCol = col - _cumulativeColCounts[srcIndex]; diff --git a/src/Microsoft.ML.Data/DataView/Transposer.cs b/src/Microsoft.ML.Data/DataView/Transposer.cs index ae38fea8b0..df0d772a07 100644 --- a/src/Microsoft.ML.Data/DataView/Transposer.cs +++ b/src/Microsoft.ML.Data/DataView/Transposer.cs @@ -504,8 +504,25 @@ private sealed class SlotCursorVec : SlotCursor private T[][] _values; // Working intermediate value buffers. private int[] _counts; // Working intermediate count buffers. - // The transposed contents of _colStored. - private VBuffer[] _cbuff; // Working intermediate column-wise buffer. + private struct ColumnBufferStorage + { + // The transposed contents of _colStored. + public VBuffer Buffer; + + // These two arrays are the "cached" arrays inside of the Buffer + // to be swapped between the _cbuff and _values/_indices. 
+ public readonly T[] Values; + public readonly int[] Indices; + + public ColumnBufferStorage(VBuffer buffer, T[] values, int[] indices) + { + Buffer = buffer; + Values = values; + Indices = indices; + } + } + + private ColumnBufferStorage[] _cbuff; // Working intermediate column-wise buffer. // Variables to track current cursor position. private int _colStored; // The current column of the source data view actually stored in the intermediate buffers. @@ -704,20 +721,24 @@ private void EnsureValid() Utils.EnsureSize(ref _cbuff, vecLen); for (int s = 0; s < vecLen; ++s) { - var temp = new VBuffer(_len, _counts[s], _values[s], _indices[s]); - if (temp.Count < _len / 2) + int count = _counts[s]; + T[] values = _values[s]; + int[] indices = _indices[s]; + var temp = new VBuffer(_len, count, values, indices); + if (count < _len / 2) { // Already sparse enough, I guess. Swap out the arrays. - Utils.Swap(ref temp, ref _cbuff[s]); - _indices[s] = temp.Indices ?? new int[_len]; - _values[s] = temp.Values ?? new T[_len]; + ColumnBufferStorage existingBuffer = _cbuff[s]; + _cbuff[s] = new ColumnBufferStorage(temp, values, indices); + _indices[s] = existingBuffer.Indices ?? new int[_len]; + _values[s] = existingBuffer.Values ?? new T[_len]; Ch.Assert(_indices[s].Length == _len); Ch.Assert(_values[s].Length == _len); } else { // Not dense enough. Densify temp into _cbuff[s]. Don't swap the arrays. - temp.CopyToDense(ref _cbuff[s]); + temp.CopyToDense(ref _cbuff[s].Buffer); } } _colStored = _colCurr; @@ -740,8 +761,8 @@ private void Getter(ref VBuffer dst) { Ch.Check(IsGood, "Cannot get values in the cursor's current state"); EnsureValid(); - Ch.Assert(0 <= _slotCurr && _slotCurr < Utils.Size(_cbuff) && _cbuff[_slotCurr].Length == _len); - _cbuff[_slotCurr].CopyTo(ref dst); + Ch.Assert(0 <= _slotCurr && _slotCurr < Utils.Size(_cbuff) && _cbuff[_slotCurr].Buffer.Length == _len); + _cbuff[_slotCurr].Buffer.CopyTo(ref dst); } protected override ValueGetter> GetGetterCore() @@ -1273,12 +1294,12 @@ private ValueGetter> CreateGetter(int col) (ref VBuffer value) => { EnsureValid(); - var values = value.Values; + VBufferEditor editor; if (_inputValue.IsDense) { - Utils.EnsureSize(ref values, len); - Array.Copy(_inputValue.Values, min, values, 0, len); - value = new VBuffer(len, values, value.Indices); + editor = VBufferEditor.Create(ref value, len); + _inputValue.GetValues().Slice(min, len).CopyTo(editor.Values); + value = editor.Commit(); return; } // In the sparse case we have ranges on Indices/Values to consider. 
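The getter above copies a dense window of the input into the output buffer through the new editor. A minimal sketch of that pattern, as illustration only: CopyDenseSlice is a hypothetical name, while VBufferEditor.Create, editor.Values and Commit are the APIs this patch introduces.

internal static class VBufferSliceSketch
{
    // Copy src[min, min + len) into dst as a dense vector of logical length len.
    // Assumes src.IsDense and that the requested range is valid.
    public static void CopyDenseSlice<T>(in VBuffer<T> src, int min, int len, ref VBuffer<T> dst)
    {
        var editor = VBufferEditor.Create(ref dst, len);        // reuses dst's arrays when they are large enough
        src.GetValues().Slice(min, len).CopyTo(editor.Values);  // copy the requested window of explicit values
        dst = editor.Commit();                                   // publish a dense VBuffer of length len
    }
}

As in the hunks above, routing the copy through the editor lets repeated getter calls reuse the destination's existing buffers rather than allocating new arrays each time.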
@@ -1287,20 +1308,24 @@ private ValueGetter> CreateGetter(int col) int scount = slim - smin; if (scount == 0) { - value = new VBuffer(len, 0, value.Values, value.Indices); + VBufferUtils.Resize(ref value, len, 0); return; } - var indices = value.Indices; - Utils.EnsureSize(ref indices, scount); - Utils.EnsureSize(ref values, scount); - Array.Copy(_inputValue.Indices, smin, indices, 0, scount); - if (min != 0) + + editor = VBufferEditor.Create(ref value, len, scount); + bool isDense = len == scount; + if (!isDense) { - for (int i = 0; i < scount; ++i) - indices[i] -= min; + _inputValue.GetIndices().Slice(smin, scount).CopyTo(editor.Indices); + + if (min != 0) + { + for (int i = 0; i < scount; ++i) + editor.Indices[i] -= min; + } } - Array.Copy(_inputValue.Values, smin, values, 0, scount); - value = new VBuffer(len, scount, values, indices); + _inputValue.GetValues().Slice(smin, scount).CopyTo(editor.Values); + value = editor.Commit(); }; } @@ -1314,15 +1339,14 @@ private void EnsureValid() // and end of each slice. if (_inputValue.IsDense) return; - if (_inputValue.Count == 0) + var indices = _inputValue.GetIndices(); + if (indices.Length == 0) { // Handle this separately, since _inputValue.Indices might be null // in this case, and then we may as well short circuit it anyway. Array.Clear(_srcIndicesLims, 0, _srcIndicesLims.Length); return; } - var indices = _inputValue.Indices; - Contracts.AssertValue(indices); int ii = 0; for (int i = 0; i < Lims.Length; ++i) @@ -1331,7 +1355,7 @@ private void EnsureValid() // REVIEW: Would some form of bisection search be better // than this scan? Possibly if the search were to happen across // all lims at the same time, somehow. - while (ii < _inputValue.Count && indices[ii] < lim) + while (ii < indices.Length && indices[ii] < lim) ii++; _srcIndicesLims[i] = ii; } diff --git a/src/Microsoft.ML.Data/Depricated/Vector/VBufferMathUtils.cs b/src/Microsoft.ML.Data/Depricated/Vector/VBufferMathUtils.cs index 1438cce601..8aae1f7c66 100644 --- a/src/Microsoft.ML.Data/Depricated/Vector/VBufferMathUtils.cs +++ b/src/Microsoft.ML.Data/Depricated/Vector/VBufferMathUtils.cs @@ -20,9 +20,10 @@ public static partial class VectorUtils /// public static Float NormSquared(in VBuffer a) { - if (a.Count == 0) + var aValues = a.GetValues(); + if (aValues.Length == 0) return 0; - return CpuMathUtils.SumSq(a.Values.AsSpan(0, a.Count)); + return CpuMathUtils.SumSq(aValues); } /// @@ -48,9 +49,10 @@ public static Float Norm(in VBuffer a) /// L1 norm of the vector public static Float L1Norm(in VBuffer a) { - if (a.Count == 0) + var aValues = a.GetValues(); + if (aValues.Length == 0) return 0; - return CpuMathUtils.SumAbs(a.Values.AsSpan(0, a.Count)); + return CpuMathUtils.SumAbs(aValues); } /// @@ -59,9 +61,10 @@ public static Float L1Norm(in VBuffer a) /// L-infinity norm of the vector public static Float MaxNorm(in VBuffer a) { - if (a.Count == 0) + var aValues = a.GetValues(); + if (aValues.Length == 0) return 0; - return CpuMathUtils.MaxAbs(a.Values.AsSpan(0, a.Count)); + return CpuMathUtils.MaxAbs(aValues); } /// @@ -69,9 +72,10 @@ public static Float MaxNorm(in VBuffer a) /// public static Float Sum(in VBuffer a) { - if (a.Count == 0) + var aValues = a.GetValues(); + if (aValues.Length == 0) return 0; - return CpuMathUtils.Sum(a.Values.AsSpan(0, a.Count)); + return CpuMathUtils.Sum(aValues); } /// @@ -81,12 +85,13 @@ public static Float Sum(in VBuffer a) /// Value to multiply vector with public static void ScaleBy(ref VBuffer dst, Float c) { - if (c == 1 || dst.Count == 0) 
+ if (c == 1 || dst.GetValues().Length == 0) return; + var editor = VBufferEditor.CreateFromBuffer(ref dst); if (c != 0) - CpuMathUtils.Scale(c, dst.Values.AsSpan(0, dst.Count)); + CpuMathUtils.Scale(c, editor.Values); else // Maintain density of dst. - Array.Clear(dst.Values, 0, dst.Count); + editor.Values.Clear(); // REVIEW: Any benefit in sparsifying? } @@ -97,35 +102,36 @@ public static void ScaleBy(ref VBuffer dst, Float c) public static void ScaleBy(in VBuffer src, ref VBuffer dst, Float c) { int length = src.Length; - int count = src.Count; + var srcValues = src.GetValues(); + int count = srcValues.Length; if (count == 0) { // dst is a zero vector. - dst = new VBuffer(length, 0, dst.Values, dst.Indices); + VBufferUtils.Resize(ref dst, length, 0); return; } - var dstValues = Utils.Size(dst.Values) >= count ? dst.Values : new Float[count]; if (src.IsDense) { // Maintain the density of src to dst in order to avoid slow down of L-BFGS. + var editor = VBufferEditor.Create(ref dst, length); Contracts.Assert(length == count); if (c == 0) - Array.Clear(dstValues, 0, length); + editor.Values.Clear(); else - CpuMathUtils.Scale(c, src.Values, dstValues, length); - dst = new VBuffer(length, dstValues, dst.Indices); + CpuMathUtils.Scale(c, srcValues, editor.Values, length); + dst = editor.Commit(); } else { - var dstIndices = Utils.Size(dst.Indices) >= count ? dst.Indices : new int[count]; - Array.Copy(src.Indices, dstIndices, count); + var editor = VBufferEditor.Create(ref dst, length, count); + src.GetIndices().CopyTo(editor.Indices); if (c == 0) - Array.Clear(dstValues, 0, count); + editor.Values.Clear(); else - CpuMathUtils.Scale(c, src.Values, dstValues, count); - dst = new VBuffer(length, count, dstValues, dstIndices); + CpuMathUtils.Scale(c, srcValues, editor.Values, count); + dst = editor.Commit(); } } @@ -136,15 +142,17 @@ public static void Add(in VBuffer src, ref VBuffer dst) { Contracts.Check(src.Length == dst.Length, "Vectors must have the same dimensionality."); - if (src.Count == 0) + var srcValues = src.GetValues(); + if (srcValues.Length == 0) return; if (dst.IsDense) { + var editor = VBufferEditor.Create(ref dst, dst.Length); if (src.IsDense) - CpuMathUtils.Add(src.Values, dst.Values, src.Length); + CpuMathUtils.Add(srcValues, editor.Values, src.Length); else - CpuMathUtils.Add(src.Values, src.Indices, dst.Values, src.Count); + CpuMathUtils.Add(srcValues, src.GetIndices(), editor.Values, srcValues.Length); return; } // REVIEW: Should we use SSE for any of these possibilities? @@ -162,15 +170,17 @@ public static void AddMult(in VBuffer src, Float c, ref VBuffer ds { Contracts.Check(src.Length == dst.Length, "Vectors must have the same dimensionality."); - if (src.Count == 0 || c == 0) + var srcValues = src.GetValues(); + if (srcValues.Length == 0 || c == 0) return; if (dst.IsDense) { + var editor = VBufferEditor.Create(ref dst, dst.Length); if (src.IsDense) - CpuMathUtils.AddScale(c, src.Values, dst.Values, src.Length); + CpuMathUtils.AddScale(c, srcValues, editor.Values, src.Length); else - CpuMathUtils.AddScale(c, src.Values, src.Indices, dst.Values, src.Count); + CpuMathUtils.AddScale(c, srcValues, src.GetIndices(), editor.Values, srcValues.Length); return; } // REVIEW: Should we use SSE for any of these possibilities? 
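The ScaleBy overloads above illustrate the general write-side recipe: Resize for the all-implicit-zero case, otherwise create an editor sized to the source's explicit count, fill Values (and Indices when sparse), and Commit. A sketch of the same recipe, with a plain loop standing in for the CpuMathUtils calls; ScaleCopy is a hypothetical name and the usings/namespace are assumed.

internal static class VBufferScaleSketch
{
    public static void ScaleCopy(in VBuffer<float> src, ref VBuffer<float> dst, float c)
    {
        var srcValues = src.GetValues();
        if (srcValues.Length == 0)
        {
            // No explicit entries: dst becomes an all-implicit-zero vector of the same
            // logical length, keeping its underlying arrays for later reuse.
            VBufferUtils.Resize(ref dst, src.Length, 0);
            return;
        }
        // Ask for exactly srcValues.Length explicit slots in dst.
        var editor = VBufferEditor.Create(ref dst, src.Length, srcValues.Length);
        if (!src.IsDense)
            src.GetIndices().CopyTo(editor.Indices);   // sparse: carry the slot indices across
        for (int i = 0; i < srcValues.Length; i++)
            editor.Values[i] = c * srcValues[i];       // the hunks above do this via CpuMathUtils.Scale
        dst = editor.Commit();
    }
}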
@@ -186,7 +196,8 @@ public static void AddMult(in VBuffer src, Float c, ref VBuffer ds Contracts.Check(src.Length == dst.Length, "Vectors must have the same dimensionality."); int length = src.Length; - if (src.Count == 0 || c == 0) + var srcValues = src.GetValues(); + if (srcValues.Length == 0 || c == 0) { // src is zero vector, res = dst dst.CopyTo(ref res); @@ -196,9 +207,9 @@ public static void AddMult(in VBuffer src, Float c, ref VBuffer ds Contracts.Assert(length > 0); if (dst.IsDense && src.IsDense) { - Float[] resValues = Utils.Size(res.Values) >= length ? res.Values : new Float[length]; - CpuMathUtils.AddScaleCopy(c, src.Values, dst.Values, resValues, length); - res = new VBuffer(length, resValues, res.Indices); + var editor = VBufferEditor.Create(ref res, length); + CpuMathUtils.AddScaleCopy(c, srcValues, dst.GetValues(), editor.Values, length); + res = editor.Commit(); return; } @@ -214,9 +225,9 @@ public static void AddMultInto(in VBuffer a, Float c, in VBuffer b { Contracts.Check(a.Length == b.Length, "Vectors must have the same dimensionality."); - if (c == 0 || b.Count == 0) + if (c == 0 || b.GetValues().Length == 0) a.CopyTo(ref dst); - else if (a.Count == 0) + else if (a.GetValues().Length == 0) ScaleInto(in b, c, ref dst); else VBufferUtils.ApplyInto(in a, in b, ref dst, (ind, v1, v2) => v1 + c * v2); @@ -233,15 +244,20 @@ public static void AddMultWithOffset(in VBuffer src, Float c, ref VBuffer Contracts.CheckParam(0 <= offset && offset <= dst.Length, nameof(offset)); Contracts.CheckParam(src.Length <= dst.Length - offset, nameof(offset)); - if (src.Count == 0 || c == 0) + var srcValues = src.GetValues(); + if (srcValues.Length == 0 || c == 0) return; + VBufferEditor editor; + Span values; if (dst.IsDense) { // This is by far the most common case. + editor = VBufferEditor.Create(ref dst, dst.Length); + values = editor.Values.Slice(offset); if (src.IsDense) - CpuMathUtils.AddScale(c, src.Values, dst.Values.AsSpan(offset), src.Count); + CpuMathUtils.AddScale(c, srcValues, values, srcValues.Length); else - CpuMathUtils.AddScale(c, src.Values, src.Indices, dst.Values.AsSpan(offset), src.Count); + CpuMathUtils.AddScale(c, srcValues, src.GetIndices(), values, srcValues.Length); return; } // REVIEW: Perhaps implementing an ApplyInto with an offset would be more @@ -250,8 +266,9 @@ public static void AddMultWithOffset(in VBuffer src, Float c, ref VBuffer // dst is sparse. I expect this will see limited practical use, since accumulants // are often better off going into a dense vector in all applications of interest to us. // Correspondingly, this implementation will be functional, but not optimized. - int dMin = dst.Count == 0 ? 0 : Utils.FindIndexSorted(dst.Indices, 0, dst.Count, offset); - int dLim = dst.Count == 0 ? 0 : Utils.FindIndexSorted(dst.Indices, dMin, dst.Count, offset + src.Length); + var dstIndices = dst.GetIndices(); + int dMin = dstIndices.Length == 0 ? 0 : dstIndices.FindIndexSorted(0, dstIndices.Length, offset); + int dLim = dstIndices.Length == 0 ? 0 : dstIndices.FindIndexSorted(dMin, dstIndices.Length, offset + src.Length); Contracts.Assert(dMin - dLim <= src.Length); // First get the number of extra values that we will need to accomodate. 
int gapCount; @@ -259,10 +276,11 @@ public static void AddMultWithOffset(in VBuffer src, Float c, ref VBuffer gapCount = src.Length - (dLim - dMin); else { - gapCount = src.Count; - for (int iS = 0, iD = dMin; iS < src.Count && iD < dLim; ) + gapCount = srcValues.Length; + var srcIndices = src.GetIndices(); + for (int iS = 0, iD = dMin; iS < srcIndices.Length && iD < dLim; ) { - var comp = src.Indices[iS] - dst.Indices[iD] + offset; + var comp = srcIndices[iS] - dstIndices[iD] + offset; if (comp < 0) // dst index is larger. iS++; else if (comp > 0) // src index is larger. @@ -276,18 +294,23 @@ public static void AddMultWithOffset(in VBuffer src, Float c, ref VBuffer } } // Extend dst so that it has room for this additional stuff. Shift things over as well. - var indices = dst.Indices; - var values = dst.Values; + var dstValues = dst.GetValues(); + editor = VBufferEditor.Create(ref dst, + dst.Length, + dstValues.Length + gapCount, + keepOldOnResize: true); + var indices = editor.Indices; + values = editor.Values; if (gapCount > 0) { - Utils.EnsureSize(ref indices, dst.Count + gapCount, dst.Length); - Utils.EnsureSize(ref values, dst.Count + gapCount, dst.Length); // Shift things over, unless there's nothing to shift over, or no new elements are being introduced anyway. - if (dst.Count != dLim) + if (dstValues.Length != dLim) { - Contracts.Assert(dLim < dst.Count); - Array.Copy(indices, dLim, indices, dLim + gapCount, dst.Count - dLim); - Array.Copy(values, dLim, values, dLim + gapCount, dst.Count - dLim); + Contracts.Assert(dLim < dstValues.Length); + indices.Slice(dLim, dstValues.Length - dLim) + .CopyTo(indices.Slice(dLim + gapCount)); + values.Slice(dLim, dstValues.Length - dLim) + .CopyTo(values.Slice(dLim + gapCount)); } } // Now, fill in the stuff in this "gap." Both of these implementations work @@ -303,10 +326,10 @@ public static void AddMultWithOffset(in VBuffer src, Float c, ref VBuffer Contracts.Assert(iDD == iS + dMin); // iDD and iD are the points in where we are writing and reading from. Contracts.Assert(iDD >= iD); - if (iD >= 0 && offset + iS == dst.Indices[iD]) // Collision. - values[iDD] = dst.Values[iD--] + c * src.Values[iS]; + if (iD >= 0 && offset + iS == dstIndices[iD]) // Collision. + values[iDD] = dstValues[iD--] + c * srcValues[iS]; else // Miss. - values[iDD] = c * src.Values[iS]; + values[iDD] = c * srcValues[iS]; indices[iDD] = offset + iS; } } @@ -314,9 +337,10 @@ public static void AddMultWithOffset(in VBuffer src, Float c, ref VBuffer { // Both dst and src are sparse. int iD = dLim - 1; - int iS = src.Count - 1; - int sIndex = iS < 0 ? -1 : src.Indices[iS]; - int dIndex = iD < 0 ? -1 : dst.Indices[iD] - offset; + var srcIndices = src.GetIndices(); + int iS = srcIndices.Length - 1; + int sIndex = iS < 0 ? -1 : srcIndices[iS]; + int dIndex = iD < 0 ? -1 : dstIndices[iD] - offset; for (int iDD = dLim + gapCount; --iDD >= dMin; ) { @@ -324,26 +348,26 @@ public static void AddMultWithOffset(in VBuffer src, Float c, ref VBuffer int comp = sIndex - dIndex; if (comp == 0) // Collision on both. { - indices[iDD] = dst.Indices[iD]; - values[iDD] = dst.Values[iD--] + c * src.Values[iS--]; - sIndex = iS < 0 ? -1 : src.Indices[iS]; - dIndex = iD < 0 ? -1 : dst.Indices[iD] - offset; + indices[iDD] = dstIndices[iD]; + values[iDD] = dstValues[iD--] + c * srcValues[iS--]; + sIndex = iS < 0 ? -1 : srcIndices[iS]; + dIndex = iD < 0 ? -1 : dstIndices[iD] - offset; } else if (comp < 0) // Collision on dst. 
{ - indices[iDD] = dst.Indices[iD]; - values[iDD] = dst.Values[iD--]; - dIndex = iD < 0 ? -1 : dst.Indices[iD] - offset; + indices[iDD] = dstIndices[iD]; + values[iDD] = dstValues[iD--]; + dIndex = iD < 0 ? -1 : dstIndices[iD] - offset; } else // Collision on src. { indices[iDD] = sIndex + offset; - values[iDD] = c * src.Values[iS--]; - sIndex = iS < 0 ? -1 : src.Indices[iS]; + values[iDD] = c * srcValues[iS--]; + sIndex = iS < 0 ? -1 : srcIndices[iS]; } } } - dst = new VBuffer(dst.Length, dst.Count + gapCount, values, indices); + dst = editor.Commit(); } /// @@ -361,19 +385,20 @@ public static void ScaleInto(in VBuffer src, Float c, ref VBuffer // equal lengths, but I assume I don't care here. if (c == 1) src.CopyTo(ref dst); - else if (src.Count == 0 || c == 0) + else if (src.GetValues().Length == 0 || c == 0) { if (src.Length > 0 && src.IsDense) { - var values = dst.Values; // Due to sparsity preservation from src, dst must be dense, in the same way. - Utils.EnsureSize(ref values, src.Length, src.Length, keepOld: false); - if (values == dst.Values) // We need to clear it. - Array.Clear(values, 0, src.Length); - dst = new VBuffer(src.Length, values, dst.Indices); + var editor = VBufferEditor.Create(ref dst, src.Length); + if (!editor.CreatedNewValues) // We need to clear it. + editor.Values.Clear(); + dst = editor.Commit(); } else - dst = new VBuffer(src.Length, 0, dst.Values, dst.Indices); + { + VBufferUtils.Resize(ref dst, src.Length, 0); + } } else if (c == -1) VBufferUtils.ApplyIntoEitherDefined(in src, ref dst, (i, v) => -v); @@ -385,33 +410,35 @@ public static int ArgMax(in VBuffer src) { if (src.Length == 0) return -1; - if (src.Count == 0) + var srcValues = src.GetValues(); + if (srcValues.Length == 0) return 0; - int ind = MathUtils.ArgMax(src.Values, src.Count); + int ind = MathUtils.ArgMax(srcValues); // ind < 0 iff all explicit values are NaN. - Contracts.Assert(-1 <= ind && ind < src.Count); + Contracts.Assert(-1 <= ind && ind < srcValues.Length); if (src.IsDense) return ind; + var srcIndices = src.GetIndices(); if (ind >= 0) { - Contracts.Assert(src.Indices[ind] >= ind); - if (src.Values[ind] > 0) - return src.Indices[ind]; + Contracts.Assert(srcIndices[ind] >= ind); + if (srcValues[ind] > 0) + return srcIndices[ind]; // This covers the case where there is an explicit zero, and zero is the max, // and the first explicit zero is before any implicit entries. - if (src.Values[ind] == 0 && src.Indices[ind] == ind) + if (srcValues[ind] == 0 && srcIndices[ind] == ind) return ind; } // All explicit values are non-positive or NaN, so return the first index not in src.Indices. ind = 0; - while (ind < src.Count && src.Indices[ind] == ind) + while (ind < srcIndices.Length && srcIndices[ind] == ind) ind++; - Contracts.Assert(ind <= src.Count); - Contracts.Assert(ind == src.Count || ind < src.Indices[ind]); + Contracts.Assert(ind <= srcIndices.Length); + Contracts.Assert(ind == srcIndices.Length || ind < srcIndices[ind]); return ind; } @@ -419,33 +446,35 @@ public static int ArgMin(in VBuffer src) { if (src.Length == 0) return -1; - if (src.Count == 0) + var srcValues = src.GetValues(); + if (srcValues.Length == 0) return 0; - int ind = MathUtils.ArgMin(src.Values, src.Count); + int ind = MathUtils.ArgMin(srcValues); // ind < 0 iff all explicit values are NaN. 
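For the sparse branches of ArgMax/ArgMin above, the fallback is worth a concrete example: when every explicit value loses to the implicit zeros, the answer is the first logical slot that does not appear in the indices. A small usage sketch (the values are made up; the usings/namespace for VBuffer and VectorUtils are assumed from this commit):

// A length-6 sparse vector with explicit entries only at slots 0, 1 and 4.
var v = new VBuffer<float>(6, 3, new[] { -3f, -1f, -2f }, new[] { 0, 1, 4 });
// All explicit values are negative, so the maximum is an implicit 0; the smallest
// zero slot is 2, the first position i where the stored indices skip ahead of i.
int argMax = VectorUtils.ArgMax(in v);   // expected: 2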
- Contracts.Assert(-1 <= ind && ind < src.Count); + Contracts.Assert(-1 <= ind && ind < srcValues.Length); if (src.IsDense) return ind; + var srcIndices = src.GetIndices(); if (ind >= 0) { - Contracts.Assert(src.Indices[ind] >= ind); - if (src.Values[ind] < 0) - return src.Indices[ind]; + Contracts.Assert(srcIndices[ind] >= ind); + if (srcValues[ind] < 0) + return srcIndices[ind]; // This covers the case where there is an explicit zero, and zero is the min, // and the first explicit zero is before any implicit entries. - if (src.Values[ind] == 0 && src.Indices[ind] == ind) + if (srcValues[ind] == 0 && srcIndices[ind] == ind) return ind; } - // All explicit values are non-negative or NaN, so return the first index not in src.Indices. + // All explicit values are non-negative or NaN, so return the first index not in srcIndices. ind = 0; - while (ind < src.Count && src.Indices[ind] == ind) + while (ind < srcIndices.Length && srcIndices[ind] == ind) ind++; - Contracts.Assert(ind <= src.Count); - Contracts.Assert(ind == src.Count || ind < src.Indices[ind]); + Contracts.Assert(ind <= srcIndices.Length); + Contracts.Assert(ind == srcIndices.Length || ind < srcIndices[ind]); return ind; } } diff --git a/src/Microsoft.ML.Data/Depricated/Vector/VectorUtils.cs b/src/Microsoft.ML.Data/Depricated/Vector/VectorUtils.cs index 79af700bcc..2926b47f83 100644 --- a/src/Microsoft.ML.Data/Depricated/Vector/VectorUtils.cs +++ b/src/Microsoft.ML.Data/Depricated/Vector/VectorUtils.cs @@ -30,30 +30,33 @@ public static Float DotProduct(Float[] a, Float[] b) public static Float DotProduct(Float[] a, in VBuffer b) { Contracts.Check(Utils.Size(a) == b.Length, "Vectors must have the same dimensionality."); - if (b.Count == 0) + var bValues = b.GetValues(); + if (bValues.Length == 0) return 0; if (b.IsDense) - return CpuMathUtils.DotProductDense(a, b.Values, b.Length); - return CpuMathUtils.DotProductSparse(a, b.Values, b.Indices, b.Count); + return CpuMathUtils.DotProductDense(a, bValues, b.Length); + return CpuMathUtils.DotProductSparse(a, bValues, b.GetIndices(), bValues.Length); } public static Float DotProduct(in VBuffer a, in VBuffer b) { Contracts.Check(a.Length == b.Length, "Vectors must have the same dimensionality."); - if (a.Count == 0 || b.Count == 0) + var aValues = a.GetValues(); + var bValues = b.GetValues(); + if (aValues.Length == 0 || bValues.Length == 0) return 0; if (a.IsDense) { if (b.IsDense) - return CpuMathUtils.DotProductDense(a.Values, b.Values, a.Length); - return CpuMathUtils.DotProductSparse(a.Values, b.Values, b.Indices, b.Count); + return CpuMathUtils.DotProductDense(aValues, bValues, a.Length); + return CpuMathUtils.DotProductSparse(aValues, bValues, b.GetIndices(), bValues.Length); } if (b.IsDense) - return CpuMathUtils.DotProductSparse(b.Values, a.Values, a.Indices, a.Count); - return DotProductSparse(a.Values, a.Indices, 0, a.Count, b.Values, b.Indices, 0, b.Count, 0); + return CpuMathUtils.DotProductSparse(bValues, aValues, a.GetIndices(), aValues.Length); + return DotProductSparse(aValues, a.GetIndices(), 0, aValues.Length, bValues, b.GetIndices(), 0, bValues.Length); } /// @@ -75,10 +78,12 @@ public static void SparsifyNormalize(ref VBuffer a, int top, int bottom, var bottomHeap = new Heap>((left, right) => right.Value > left.Value, bottom + 1); bool isDense = a.IsDense; - for (int i = 0; i < a.Count; i++) + var aValues = a.GetValues(); + var aIndices = a.GetIndices(); + for (int i = 0; i < aValues.Length; i++) { - int idx = isDense ? 
i : a.Indices[i]; - var value = a.Values[i]; + int idx = isDense ? i : aIndices[i]; + var value = aValues[i]; if (value < 0 && bottom > 0) { @@ -159,27 +164,24 @@ public static void MulElementWise(in VBuffer a, ref VBuffer dst) Contracts.Check(a.Length == dst.Length, "Vectors must have the same dimensionality."); if (a.IsDense && dst.IsDense) - CpuMathUtils.MulElementWise(a.Values, dst.Values, dst.Values, a.Length); + { + var editor = VBufferEditor.CreateFromBuffer(ref dst); + CpuMathUtils.MulElementWise(a.GetValues(), dst.GetValues(), editor.Values, a.Length); + } else VBufferUtils.ApplyWithEitherDefined(in a, ref dst, (int ind, Float v1, ref Float v2) => { v2 *= v1; }); } - private static Float L2DistSquaredSparse(Float[] valuesA, int[] indicesA, int countA, Float[] valuesB, int[] indicesB, int countB, int length) + private static Float L2DistSquaredSparse(ReadOnlySpan valuesA, ReadOnlySpan indicesA, ReadOnlySpan valuesB, ReadOnlySpan indicesB) { - Contracts.AssertValueOrNull(valuesA); - Contracts.AssertValueOrNull(indicesA); - Contracts.AssertValueOrNull(valuesB); - Contracts.AssertValueOrNull(indicesB); - Contracts.Assert(0 <= countA && countA <= Utils.Size(indicesA)); - Contracts.Assert(0 <= countB && countB <= Utils.Size(indicesB)); - Contracts.Assert(countA <= Utils.Size(valuesA)); - Contracts.Assert(countB <= Utils.Size(valuesB)); + Contracts.Assert(valuesA.Length == indicesA.Length); + Contracts.Assert(valuesB.Length == indicesB.Length); Float res = 0; int ia = 0; int ib = 0; - while (ia < countA && ib < countB) + while (ia < indicesA.Length && ib < indicesB.Length) { int diff = indicesA[ia] - indicesB[ib]; Float d; @@ -202,14 +204,14 @@ private static Float L2DistSquaredSparse(Float[] valuesA, int[] indicesA, int co res += d * d; } - while (ia < countA) + while (ia < indicesA.Length) { var d = valuesA[ia]; res += d * d; ia++; } - while (ib < countB) + while (ib < indicesB.Length) { var d = valuesB[ib]; res += d * d; @@ -219,30 +221,21 @@ private static Float L2DistSquaredSparse(Float[] valuesA, int[] indicesA, int co return res; } - private static Float L2DistSquaredHalfSparse(Float[] valuesA, int lengthA, Float[] valuesB, int[] indicesB, int countB) + private static Float L2DistSquaredHalfSparse(ReadOnlySpan valuesA, ReadOnlySpan valuesB, ReadOnlySpan indicesB) { - Contracts.AssertValueOrNull(valuesA); - Contracts.AssertValueOrNull(valuesB); - Contracts.AssertValueOrNull(indicesB); - Contracts.Assert(0 <= lengthA && lengthA <= Utils.Size(valuesA)); - Contracts.Assert(0 <= countB && countB <= Utils.Size(indicesB)); - Contracts.Assert(countB <= Utils.Size(valuesB)); - - var normA = CpuMathUtils.SumSq(valuesA.AsSpan(0, lengthA)); - if (countB == 0) + var normA = CpuMathUtils.SumSq(valuesA); + if (valuesB.Length == 0) return normA; - var normB = CpuMathUtils.SumSq(valuesB.AsSpan(0, countB)); - var dotP = CpuMathUtils.DotProductSparse(valuesA, valuesB, indicesB, countB); + var normB = CpuMathUtils.SumSq(valuesB); + var dotP = CpuMathUtils.DotProductSparse(valuesA, valuesB, indicesB, valuesB.Length); var res = normA + normB - 2 * dotP; return res < 0 ? 
0 : res; } - private static Float L2DiffSquaredDense(Float[] valuesA, Float[] valuesB, int length) + private static Float L2DiffSquaredDense(ReadOnlySpan valuesA, ReadOnlySpan valuesB, int length) { - Contracts.AssertValueOrNull(valuesA); - Contracts.AssertValueOrNull(valuesB); - Contracts.Assert(0 <= length && length <= Utils.Size(valuesA)); - Contracts.Assert(0 <= length && length <= Utils.Size(valuesB)); + Contracts.Assert(0 <= length && length <= valuesA.Length); + Contracts.Assert(0 <= length && length <= valuesB.Length); if (length == 0) return 0; @@ -262,32 +255,36 @@ public static Float DotProductWithOffset(in VBuffer a, int offset, in VBu Contracts.Check(0 <= offset && offset <= a.Length); Contracts.Check(b.Length <= a.Length - offset, "VBuffer b must be no longer than a.Length - offset."); - if (a.Count == 0 || b.Count == 0) + var aValues = a.GetValues(); + var bValues = b.GetValues(); + if (aValues.Length == 0 || bValues.Length == 0) return 0; if (a.IsDense) { if (b.IsDense) - return CpuMathUtils.DotProductDense(a.Values.AsSpan(offset), b.Values, b.Length); - return CpuMathUtils.DotProductSparse(a.Values.AsSpan(offset), b.Values, b.Indices, b.Count); + return CpuMathUtils.DotProductDense(aValues.Slice(offset), bValues, b.Length); + return CpuMathUtils.DotProductSparse(aValues.Slice(offset), bValues, b.GetIndices(), bValues.Length); } else { Float result = 0; - int aMin = Utils.FindIndexSorted(a.Indices, 0, a.Count, offset); - int aLim = Utils.FindIndexSorted(a.Indices, 0, a.Count, offset + b.Length); + var aIndices = a.GetIndices(); + int aMin = Utils.FindIndexSorted(aIndices, 0, aIndices.Length, offset); + int aLim = Utils.FindIndexSorted(aIndices, 0, aIndices.Length, offset + b.Length); if (b.IsDense) { for (int iA = aMin; iA < aLim; ++iA) - result += a.Values[iA] * b.Values[a.Indices[iA] - offset]; + result += aValues[iA] * bValues[aIndices[iA] - offset]; return result; } - for (int iA = aMin, iB = 0; iA < aLim && iB < b.Count; ) + var bIndices = b.GetIndices(); + for (int iA = aMin, iB = 0; iA < aLim && iB < bIndices.Length; ) { - int aIndex = a.Indices[iA]; - int bIndex = b.Indices[iB]; + int aIndex = aIndices[iA]; + int bIndex = bIndices[iB]; int comp = (aIndex - offset) - bIndex; if (comp == 0) - result += a.Values[iA++] * b.Values[iB++]; + result += aValues[iA++] * bValues[iB++]; else if (comp < 0) iA++; else @@ -310,20 +307,21 @@ public static Float DotProductWithOffset(Float[] a, int offset, in VBuffer aValues, ReadOnlySpan aIndices, int ia, int iaLim, ReadOnlySpan bValues, ReadOnlySpan bIndices, int ib, int ibLim) { - Contracts.AssertValue(aValues); - Contracts.AssertValue(aIndices); - Contracts.AssertValue(bValues); - Contracts.AssertValue(bIndices); + Contracts.AssertNonEmpty(aValues); + Contracts.AssertNonEmpty(aIndices); + Contracts.AssertNonEmpty(bValues); + Contracts.AssertNonEmpty(bIndices); Contracts.Assert(0 <= ia && ia < iaLim && iaLim <= aIndices.Length); Contracts.Assert(0 <= ib && ib < ibLim && ibLim <= bIndices.Length); @@ -334,7 +332,7 @@ private static Float DotProductSparse(Float[] aValues, int[] aIndices, int ia, i for (; ; ) { - int d = aIndices[ia] - offset - bIndices[ib]; + int d = aIndices[ia] - bIndices[ib]; if (d == 0) { res += aValues[ia] * bValues[ib]; @@ -347,7 +345,7 @@ private static Float DotProductSparse(Float[] aValues, int[] aIndices, int ia, i { ia++; if (d < -thresh) - ia = Utils.FindIndexSorted(aIndices, ia, iaLim, bIndices[ib] + offset); + ia = Utils.FindIndexSorted(aIndices, ia, iaLim, bIndices[ib]); if (ia >= iaLim) break; } @@ 
-355,7 +353,7 @@ private static Float DotProductSparse(Float[] aValues, int[] aIndices, int ia, i { ib++; if (d > thresh) - ib = Utils.FindIndexSorted(bIndices, ib, ibLim, aIndices[ia] - offset); + ib = Utils.FindIndexSorted(bIndices, ib, ibLim, aIndices[ia]); if (ib >= ibLim) break; } @@ -401,12 +399,12 @@ public static Float L2DistSquared(in VBuffer a, in VBuffer b) if (a.IsDense) { if (b.IsDense) - return L2DiffSquaredDense(a.Values, b.Values, b.Length); - return L2DistSquaredHalfSparse(a.Values, a.Length, b.Values, b.Indices, b.Count); + return L2DiffSquaredDense(a.GetValues(), b.GetValues(), b.Length); + return L2DistSquaredHalfSparse(a.GetValues(), b.GetValues(), b.GetIndices()); } if (b.IsDense) - return L2DistSquaredHalfSparse(b.Values, b.Length, a.Values, a.Indices, a.Count); - return L2DistSquaredSparse(a.Values, a.Indices, a.Count, b.Values, b.Indices, b.Count, a.Length); + return L2DistSquaredHalfSparse(b.GetValues(), a.GetValues(), a.GetIndices()); + return L2DistSquaredSparse(a.GetValues(), a.GetIndices(), b.GetValues(), b.GetIndices()); } /// @@ -420,8 +418,8 @@ public static Float L2DistSquared(Float[] a, in VBuffer b) Contracts.CheckValue(a, nameof(a)); Contracts.Check(Utils.Size(a) == b.Length, "Vectors must have the same dimensionality."); if (b.IsDense) - return L2DiffSquaredDense(a, b.Values, b.Length); - return L2DistSquaredHalfSparse(a, a.Length, b.Values, b.Indices, b.Count); + return L2DiffSquaredDense(a, b.GetValues(), b.Length); + return L2DistSquaredHalfSparse(a.AsSpan(0, a.Length), b.GetValues(), b.GetIndices()); } /// @@ -448,15 +446,17 @@ public static void AddMult(in VBuffer src, Float[] dst, Float c) Contracts.CheckValue(dst, nameof(dst)); Contracts.CheckParam(src.Length == dst.Length, nameof(dst), "Arrays must have the same dimensionality."); - if (src.Count == 0 || c == 0) + var srcValues = src.GetValues(); + if (srcValues.Length == 0 || c == 0) return; if (src.IsDense) - CpuMathUtils.AddScale(c, src.Values, dst, src.Count); + CpuMathUtils.AddScale(c, srcValues, dst, srcValues.Length); else { - for (int i = 0; i < src.Count; i++) - dst[src.Indices[i]] += c * src.Values[i]; + var srcIndices = src.GetIndices(); + for (int i = 0; i < srcValues.Length; i++) + dst[srcIndices[i]] += c * srcValues[i]; } } @@ -474,18 +474,20 @@ public static void AddMultWithOffset(in VBuffer src, Float[] dst, int off Contracts.Check(0 <= offset && offset <= dst.Length); Contracts.Check(src.Length <= dst.Length - offset, "Vector src must be no longer than dst.Length - offset."); - if (src.Count == 0 || c == 0) + var srcValues = src.GetValues(); + if (srcValues.Length == 0 || c == 0) return; if (src.IsDense) { for (int i = 0; i < src.Length; i++) - dst[i + offset] += c * src.Values[i]; + dst[i + offset] += c * srcValues[i]; } else { - for (int i = 0; i < src.Count; i++) - dst[src.Indices[i] + offset] += c * src.Values[i]; + var srcIndices = src.GetIndices(); + for (int i = 0; i < srcValues.Length; i++) + dst[srcIndices[i] + offset] += c * srcValues[i]; } } diff --git a/src/Microsoft.ML.Data/Evaluators/EvaluatorUtils.cs b/src/Microsoft.ML.Data/Evaluators/EvaluatorUtils.cs index 646b7b3547..6c68b3fa20 100644 --- a/src/Microsoft.ML.Data/Evaluators/EvaluatorUtils.cs +++ b/src/Microsoft.ML.Data/Evaluators/EvaluatorUtils.cs @@ -615,7 +615,7 @@ public static void ReconcileKeyValues(IHostEnvironment env, IDataView[] views, s var keyNamesVBuffer = new VBuffer>(keyNames.Count, keyNames.Keys.ToArray()); ValueGetter>> keyValueGetter = (ref VBuffer> dst) => - dst = new 
VBuffer>(keyNamesVBuffer.Length, keyNamesVBuffer.Count, keyNamesVBuffer.Values, keyNamesVBuffer.Indices); + keyNamesVBuffer.CopyTo(ref dst); // For each input data view, create the reconciled key column by wrapping it in a LambdaColumnMapper. for (int i = 0; i < dvCount; i++) @@ -683,7 +683,7 @@ public static void ReconcileVectorKeyValues(IHostEnvironment env, IDataView[] vi var keyNamesVBuffer = new VBuffer>(keyNames.Count, keyNames.Keys.ToArray()); ValueGetter>> keyValueGetter = (ref VBuffer> dst) => - dst = new VBuffer>(keyNamesVBuffer.Length, keyNamesVBuffer.Count, keyNamesVBuffer.Values, keyNamesVBuffer.Indices); + keyNamesVBuffer.CopyTo(ref dst); for (int i = 0; i < dvCount; i++) { @@ -691,35 +691,34 @@ public static void ReconcileVectorKeyValues(IHostEnvironment env, IDataView[] vi ValueMapper, VBuffer> mapper = (in VBuffer src, ref VBuffer dst) => { - var values = dst.Values; - if (Utils.Size(values) < src.Count) - values = new uint[src.Count]; + var srcValues = src.GetValues(); + var editor = VBufferEditor.Create( + ref dst, + src.Length, + srcValues.Length); if (src.IsDense) { for (int j = 0; j < src.Length; j++) { - if (src.Values[j] == 0 || src.Values[j] > keyMapperCur.Length) - values[j] = 0; + if (srcValues[j] == 0 || srcValues[j] > keyMapperCur.Length) + editor.Values[j] = 0; else - values[j] = (uint)keyMapperCur[src.Values[j] - 1] + 1; + editor.Values[j] = (uint)keyMapperCur[srcValues[j] - 1] + 1; } - dst = new VBuffer(src.Length, values, dst.Indices); } else { - var indices = dst.Indices; - if (Utils.Size(indices) < src.Count) - indices = new int[src.Count]; - for (int j = 0; j < src.Count; j++) + var srcIndices = src.GetIndices(); + for (int j = 0; j < srcValues.Length; j++) { - if (src.Values[j] == 0 || src.Values[j] > keyMapperCur.Length) - values[j] = 0; + if (srcValues[j] == 0 || srcValues[j] > keyMapperCur.Length) + editor.Values[j] = 0; else - values[j] = (uint)keyMapperCur[src.Values[j] - 1] + 1; - indices[j] = src.Indices[j]; + editor.Values[j] = (uint)keyMapperCur[srcValues[j] - 1] + 1; + editor.Indices[j] = srcIndices[j]; } - dst = new VBuffer(src.Length, src.Count, values, indices); } + dst = editor.Commit(); }; ValueGetter>> slotNamesGetter = null; @@ -1388,9 +1387,10 @@ public static string GetConfusionTable(IHost host, IDataView confusionDataView, var confusionTable = GetConfusionTableAsArray(confusionDataView, countCol, labelNames.Length, labelIndexToConfIndexMap, numConfusionTableLabels, out precisionSums, out recallSums); + var predictedLabelNames = GetPredictedLabelNames(in labelNames, labelIndexToConfIndexMap); var confusionTableString = GetConfusionTableAsString(confusionTable, recallSums, precisionSums, - labelNames.Values.Where((t, i) => labelIndexToConfIndexMap[i] >= 0).ToArray(), - sampled: numConfusionTableLabels < labelNames.Count, binary: binary); + predictedLabelNames, + sampled: numConfusionTableLabels < labelNames.Length, binary: binary); int weightIndex; if (confusionDataView.Schema.TryGetColumnIndex(MetricKinds.ColumnNames.Weight, out weightIndex)) @@ -1398,8 +1398,8 @@ public static string GetConfusionTable(IHost host, IDataView confusionDataView, confusionTable = GetConfusionTableAsArray(confusionDataView, weightIndex, labelNames.Length, labelIndexToConfIndexMap, numConfusionTableLabels, out precisionSums, out recallSums); weightedConfusionTable = GetConfusionTableAsString(confusionTable, recallSums, precisionSums, - labelNames.Values.Where((t, i) => labelIndexToConfIndexMap[i] >= 0).ToArray(), - sampled: numConfusionTableLabels < 
labelNames.Count, prefix: "Weighted ", binary: binary); + predictedLabelNames, + sampled: numConfusionTableLabels < labelNames.Length, prefix: "Weighted ", binary: binary); } else weightedConfusionTable = null; @@ -1407,6 +1407,20 @@ public static string GetConfusionTable(IHost host, IDataView confusionDataView, return confusionTableString; } + private static List> GetPredictedLabelNames(in VBuffer> labelNames, int[] labelIndexToConfIndexMap) + { + List> result = new List>(); + var values = labelNames.GetValues(); + for (int i = 0; i < values.Length; i++) + { + if (labelIndexToConfIndexMap[i] >= 0) + { + result.Add(values[i]); + } + } + return result; + } + // This methods is given a data view and a column index of the counts, and computes three arrays: the confusion table, // the per class recall and the per class precision. private static double[][] GetConfusionTableAsArray(IDataView confusionDataView, int countIndex, int numClasses, @@ -1537,7 +1551,7 @@ private static string GetFoldMetricsAsString(IHostEnvironment env, IDataView dat // Get a string representation of a confusion table. private static string GetConfusionTableAsString(double[][] confusionTable, double[] rowSums, double[] columnSums, - ReadOnlyMemory[] predictedLabelNames, string prefix = "", bool sampled = false, bool binary = true) + List> predictedLabelNames, string prefix = "", bool sampled = false, bool binary = true) { int numLabels = Utils.Size(confusionTable); @@ -1555,7 +1569,7 @@ private static string GetConfusionTableAsString(double[][] confusionTable, doubl { // The row label will also include the index, so a user can easily match against the header. // In such a case, a label like "Foo" would be presented as something like "5. Foo". - rowDigitLen = Math.Max(predictedLabelNames.Length - 1, 0).ToString().Length; + rowDigitLen = Math.Max(predictedLabelNames.Count - 1, 0).ToString().Length; Contracts.Assert(rowDigitLen >= 1); rowLabelLen += rowDigitLen + 2; } diff --git a/src/Microsoft.ML.Data/Evaluators/RankerEvaluator.cs b/src/Microsoft.ML.Data/Evaluators/RankerEvaluator.cs index 3cc3ddb4a6..5755b285be 100644 --- a/src/Microsoft.ML.Data/Evaluators/RankerEvaluator.cs +++ b/src/Microsoft.ML.Data/Evaluators/RankerEvaluator.cs @@ -573,8 +573,8 @@ internal Result(IExceptionContext ectx, IRow overallResult) { VBuffer Fetch(string name) => Fetch>(ectx, overallResult, name); - Dcg = Fetch(RankerEvaluator.Dcg).Values; - Ndcg = Fetch(RankerEvaluator.Ndcg).Values; + Dcg = Fetch(RankerEvaluator.Dcg).GetValues().ToArray(); + Ndcg = Fetch(RankerEvaluator.Ndcg).GetValues().ToArray(); } } } diff --git a/src/Microsoft.ML.Data/Transforms/ConcatTransform.cs b/src/Microsoft.ML.Data/Transforms/ConcatTransform.cs index 0348fbe40f..f19234c693 100644 --- a/src/Microsoft.ML.Data/Transforms/ConcatTransform.cs +++ b/src/Microsoft.ML.Data/Transforms/ConcatTransform.cs @@ -722,7 +722,7 @@ private Delegate MakeGetter(IRow input) .MarkSensitive(MessageSensitivity.Schema); } dstLength = checked(dstLength + tmpBufs[i].Length); - dstCount = checked(dstCount + tmpBufs[i].Count); + dstCount = checked(dstCount + tmpBufs[i].GetValues().Length); } else { @@ -749,22 +749,24 @@ private Delegate MakeGetter(IRow input) if (_srcTypes[j].IsVector) { var buffer = tmpBufs[j]; - Contracts.Assert(buffer.Count <= dstCount - count); + var bufferValues = buffer.GetValues(); + Contracts.Assert(bufferValues.Length <= dstCount - count); Contracts.Assert(buffer.Length <= dstLength - offset); if (buffer.IsDense) { - for (int i = 0; i < buffer.Length; i++) + for (int i 
= 0; i < bufferValues.Length; i++) { - values[count] = buffer.Values[i]; + values[count] = bufferValues[i]; indices[count++] = offset + i; } } else { - for (int i = 0; i < buffer.Count; i++) + var bufferIndices = buffer.GetIndices(); + for (int i = 0; i < bufferValues.Length; i++) { - values[count] = buffer.Values[i]; - indices[count++] = offset + buffer.Indices[i]; + values[count] = bufferValues[i]; + indices[count++] = offset + bufferIndices[i]; } } offset += buffer.Length; diff --git a/src/Microsoft.ML.Data/Transforms/DropSlotsTransform.cs b/src/Microsoft.ML.Data/Transforms/DropSlotsTransform.cs index d644725618..27a73824b2 100644 --- a/src/Microsoft.ML.Data/Transforms/DropSlotsTransform.cs +++ b/src/Microsoft.ML.Data/Transforms/DropSlotsTransform.cs @@ -701,7 +701,7 @@ private ValueGetter> MakeVecTrivialGetter() // Delegates onto instance methods are more efficient than delegates onto static methods. private void VecTrivialGetter(ref VBuffer value) { - value = new VBuffer(1, 0, value.Values, value.Indices); + VBufferUtils.Resize(ref value, 1, 0); } private Delegate MakeVecGetter(IRow input, int iinfo) diff --git a/src/Microsoft.ML.Data/Transforms/HashTransform.cs b/src/Microsoft.ML.Data/Transforms/HashTransform.cs index 869684d5af..0481d5e5e7 100644 --- a/src/Microsoft.ML.Data/Transforms/HashTransform.cs +++ b/src/Microsoft.ML.Data/Transforms/HashTransform.cs @@ -743,36 +743,33 @@ private static ValueGetter> MakeVectorHashGetter(uint se return (ref VBuffer dst) => { srcGetter(ref src); - int[] indices = dst.Indices; - if (src.Count == 0) + var srcValues = src.GetValues(); + if (srcValues.Length == 0) { - dst = new VBuffer(src.Length, 0, dst.Values, dst.Indices); + VBufferUtils.Resize(ref dst, src.Length, 0); return; } + var editor = VBufferEditor.Create(ref dst, src.Length, srcValues.Length); + + for (int i = 0; i < srcValues.Length; ++i) + editor.Values[i] = hasher.HashCore(seed, mask, srcValues[i]); if (!src.IsDense) - { - Utils.EnsureSize(ref indices, src.Count, keepOld: false); - Array.Copy(src.Indices, 0, indices, 0, src.Count); - } - var values = dst.Values; - Utils.EnsureSize(ref values, src.Count, keepOld: false); - var srcValuesSpan = src.Values.AsSpan(0, src.Count); - for (int i = 0; i < srcValuesSpan.Length; ++i) - values[i] = hasher.HashCore(seed, mask, srcValuesSpan[i]); - dst = new VBuffer(src.Length, src.Count, values, indices); + src.GetIndices().CopyTo(editor.Indices); + + dst = editor.Commit(); }; } // It is not sparsity preserving. return (ref VBuffer dst) => { srcGetter(ref src); - uint[] values = dst.Values; - Utils.EnsureSize(ref values, src.Length, keepOld: false); - var srcValuesSpan = src.Values.AsSpan(0, src.Count); + var editor = VBufferEditor.Create(ref dst, src.Length); + + var srcValues = src.GetValues(); if (src.IsDense) { - for (int i = 0; i < srcValuesSpan.Length; ++i) - values[i] = hasher.HashCore(seed, mask, srcValuesSpan[i]); + for (int i = 0; i < srcValues.Length; ++i) + editor.Values[i] = hasher.HashCore(seed, mask, srcValues[i]); } else { @@ -781,12 +778,13 @@ private static ValueGetter> MakeVectorHashGetter(uint se // values, rather than having complicated logic to do a simultaneous traversal of the // sparse vs. dense array. for (int i = 0; i < src.Length; ++i) - values[i] = defaultHash; + editor.Values[i] = defaultHash; // Next overwrite the values in the explicit entries. 
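            // Illustrative sketch, not part of this patch: the sparsity-preserving idiom used in the
            // MakeVectorHashGetter branch above. The editor is created with both the logical length and
            // the number of explicit entries, values are written through editor.Values, source indices
            // are copied only in the sparse case, and Commit() hands the reused buffers back to dst.
            // "Square" is a hypothetical element transform standing in for hasher.HashCore.
            private static void Square(in VBuffer<float> src, ref VBuffer<float> dst)
            {
                var srcValues = src.GetValues();
                if (srcValues.Length == 0)
                {
                    VBufferUtils.Resize(ref dst, src.Length, 0);
                    return;
                }
                var editor = VBufferEditor.Create(ref dst, src.Length, srcValues.Length);
                for (int i = 0; i < srcValues.Length; ++i)
                    editor.Values[i] = srcValues[i] * srcValues[i];
                if (!src.IsDense)
                    src.GetIndices().CopyTo(editor.Indices);
                dst = editor.Commit();
            }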
- for (int i = 0; i < srcValuesSpan.Length; ++i) - values[src.Indices[i]] = hasher.HashCore(seed, mask, srcValuesSpan[i]); + var srcIndices = src.GetIndices(); + for (int i = 0; i < srcValues.Length; ++i) + editor.Values[srcIndices[i]] = hasher.HashCore(seed, mask, srcValues[i]); } - dst = new VBuffer(src.Length, values, dst.Indices); + dst = editor.Commit(); }; } @@ -807,60 +805,58 @@ private static ValueGetter> MakeVectorOrderedHashGetter( return (ref VBuffer dst) => { srcGetter(ref src); - int[] indices = dst.Indices; - if (src.Count == 0) + var srcValues = src.GetValues(); + if (srcValues.Length == 0) { - dst = new VBuffer(src.Length, 0, dst.Values, dst.Indices); + VBufferUtils.Resize(ref dst, src.Length, 0); return; } - if (!src.IsDense) - { - Utils.EnsureSize(ref indices, src.Count, keepOld: false); - Array.Copy(src.Indices, 0, indices, 0, src.Count); - } - var values = dst.Values; - Utils.EnsureSize(ref values, src.Count, keepOld: false); - var srcValuesSpan = src.Values.AsSpan(0, src.Count); + var editor = VBufferEditor.Create(ref dst, src.Length, srcValues.Length); + if (src.IsDense) { - for (int i = 0; i < srcValuesSpan.Length; ++i) - values[i] = hasher.HashCore(Hashing.MurmurRound(seed, (uint)i), mask, srcValuesSpan[i]); + for (int i = 0; i < srcValues.Length; ++i) + editor.Values[i] = hasher.HashCore(Hashing.MurmurRound(seed, (uint)i), mask, srcValues[i]); } else { - for (int i = 0; i < srcValuesSpan.Length; ++i) - values[i] = hasher.HashCore(Hashing.MurmurRound(seed, (uint)src.Indices[i]), mask, srcValuesSpan[i]); + var srcIndices = src.GetIndices(); + for (int i = 0; i < srcValues.Length; ++i) + editor.Values[i] = hasher.HashCore(Hashing.MurmurRound(seed, (uint)srcIndices[i]), mask, srcValues[i]); + srcIndices.CopyTo(editor.Indices); + } - dst = new VBuffer(src.Length, src.Count, values, indices); + dst = editor.Commit(); }; } // It is not sparsity preserving. return (ref VBuffer dst) => { srcGetter(ref src); - uint[] values = dst.Values; - Utils.EnsureSize(ref values, src.Length, keepOld: false); - var srcValuesSpan = src.Values.AsSpan(0, src.Count); + var editor = VBufferEditor.Create(ref dst, src.Length); + + var srcValues = src.GetValues(); if (src.IsDense) { - for (int i = 0; i < srcValuesSpan.Length; ++i) - values[i] = hasher.HashCore(Hashing.MurmurRound(seed, (uint)i), mask, srcValuesSpan[i]); + for (int i = 0; i < srcValues.Length; ++i) + editor.Values[i] = hasher.HashCore(Hashing.MurmurRound(seed, (uint)i), mask, srcValues[i]); } else { + var srcIndices = src.GetIndices(); int j = 0; for (int i = 0; i < src.Length; i++) { uint indexSeed = Hashing.MurmurRound(seed, (uint)i); - if (src.Count <= j || src.Indices[j] > i) - values[i] = hasher.HashCore(indexSeed, mask, default); - else if (src.Indices[j] == i) - values[i] = hasher.HashCore(indexSeed, mask, srcValuesSpan[j++]); + if (srcIndices.Length <= j || srcIndices[j] > i) + editor.Values[i] = hasher.HashCore(indexSeed, mask, default); + else if (srcIndices[j] == i) + editor.Values[i] = hasher.HashCore(indexSeed, mask, srcValues[j++]); else Contracts.Assert(false, "this should have never happened."); } } - dst = new VBuffer(src.Length, values, dst.Indices); + dst = editor.Commit(); }; } @@ -1111,12 +1107,17 @@ public override void Process() { _srcGetter(ref _value); _dstGetter(ref _hash); + + var valueValues = _value.GetValues(); + var hashValues = _hash.GetValues(); + // The two arrays should be consistent in their density, length, count, etc. 
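            // Illustrative sketch, not part of this patch: with Count hidden, parallel iteration over two
            // buffers that share a layout goes through their GetValues()/GetIndices() spans, and the old
            // Count equality check becomes a check on span lengths, as in the asserts that follow.
            // "ForEachPair" and its callback are hypothetical.
            private static void ForEachPair(in VBuffer<float> value, in VBuffer<uint> hash, Action<int, uint, float> visit)
            {
                Contracts.Assert(value.IsDense == hash.IsDense);
                Contracts.Assert(value.Length == hash.Length);
                var valueValues = value.GetValues();
                var hashValues = hash.GetValues();
                Contracts.Assert(valueValues.Length == hashValues.Length);
                if (value.IsDense)
                {
                    for (int i = 0; i < valueValues.Length; ++i)
                        visit(i, hashValues[i], valueValues[i]);
                }
                else
                {
                    var indices = value.GetIndices();
                    for (int i = 0; i < valueValues.Length; ++i)
                        visit(indices[i], hashValues[i], valueValues[i]);
                }
            }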
Contracts.Assert(_value.IsDense == _hash.IsDense); Contracts.Assert(_value.Length == _hash.Length); - Contracts.Assert(_value.Count == _hash.Count); - for (int i = 0; i < _value.Count; ++i) - Collector.Add(_hash.Values[i], _value.Values[i]); + Contracts.Assert(valueValues.Length == hashValues.Length); + + for (int i = 0; i < valueValues.Length; ++i) + Collector.Add(hashValues[i], valueValues[i]); } } @@ -1151,19 +1152,24 @@ public override void Process() { _srcGetter(ref _value); _dstGetter(ref _hash); + + var valueValues = _value.GetValues(); + var hashValues = _hash.GetValues(); + // The two arrays should be consistent in their density, length, count, etc. Contracts.Assert(_value.IsDense == _hash.IsDense); Contracts.Assert(_value.Length == _hash.Length); - Contracts.Assert(_value.Count == _hash.Count); + Contracts.Assert(valueValues.Length == hashValues.Length); if (_hash.IsDense) { - for (int i = 0; i < _value.Count; ++i) - Collector.Add(_hash.Values[i], new KeyValuePair(i, _value.Values[i])); + for (int i = 0; i < valueValues.Length; ++i) + Collector.Add(hashValues[i], new KeyValuePair(i, valueValues[i])); } else { - for (int i = 0; i < _value.Count; ++i) - Collector.Add(_hash.Values[i], new KeyValuePair(_hash.Indices[i], _value.Values[i])); + var hashIndices = _hash.GetIndices(); + for (int i = 0; i < valueValues.Length; ++i) + Collector.Add(hashValues[i], new KeyValuePair(hashIndices[i], valueValues[i])); } } } diff --git a/src/Microsoft.ML.Data/Transforms/InvertHashUtils.cs b/src/Microsoft.ML.Data/Transforms/InvertHashUtils.cs index 2a5f0a3fc1..88f68ab778 100644 --- a/src/Microsoft.ML.Data/Transforms/InvertHashUtils.cs +++ b/src/Microsoft.ML.Data/Transforms/InvertHashUtils.cs @@ -412,7 +412,7 @@ private static void Save(IChannel ch, ModelSaveContext ctx, CodecFactory factory ctx.SaveTextStream("Terms.txt", writer => { - writer.WriteLine("# Number of terms = {0} of length {1}", v.Count, v.Length); + writer.WriteLine("# Number of terms = {0} of length {1}", v.GetValues().Length, v.Length); foreach (var pair in v.Items()) { var text = pair.Value; diff --git a/src/Microsoft.ML.Data/Transforms/KeyToValueTransform.cs b/src/Microsoft.ML.Data/Transforms/KeyToValueTransform.cs index c437c17942..759e3bffde 100644 --- a/src/Microsoft.ML.Data/Transforms/KeyToValueTransform.cs +++ b/src/Microsoft.ML.Data/Transforms/KeyToValueTransform.cs @@ -329,7 +329,7 @@ public KeyToValueMap(Mapper parent, KeyType typeKey, PrimitiveType typeVal, TVal _convertToUInt = Runtime.Data.Conversion.Conversions.Instance.GetStandardConversion(typeKey, NumberType.U4, out identity); } - private void MapKey(ref TKey src, ref TValue dst) + private void MapKey(in TKey src, ref TValue dst) { uint uintSrc = 0; _convertToUInt(in src, ref uintSrc); @@ -361,7 +361,7 @@ public override Delegate GetMappingGetter(IRow input) (ref TValue dst) => { getSrc(ref src); - MapKey(ref src, ref dst); + MapKey(in src, ref dst); }; return retVal; } @@ -376,8 +376,8 @@ public override Delegate GetMappingGetter(IRow input) { getSrc(ref src); int srcSize = src.Length; - int srcCount = src.Count; - var srcValues = src.Values; + var srcValues = src.GetValues(); + int srcCount = srcValues.Length; var dstValues = dst.Values; var dstIndices = dst.Indices; @@ -389,7 +389,7 @@ public override Delegate GetMappingGetter(IRow input) for (int slot = 0; slot < srcSize; ++slot) { - MapKey(ref srcValues[slot], ref dstValues[slot]); + MapKey(in srcValues[slot], ref dstValues[slot]); // REVIEW: // The current implementation always maps dense to dense, even if 
the resulting columns could benefit from @@ -408,17 +408,17 @@ public override Delegate GetMappingGetter(IRow input) // Currently this always maps sparse to dense, as long as the output type's NA does not equal its default value. Utils.EnsureSize(ref dstValues, srcSize, maxSize, keepOld: false); - var srcIndices = src.Indices; - int nextExplicitSlot = src.Count == 0 ? srcSize : srcIndices[0]; + var srcIndices = src.GetIndices(); + int nextExplicitSlot = srcCount == 0 ? srcSize : srcIndices[0]; int islot = 0; for (int slot = 0; slot < srcSize; ++slot) { if (nextExplicitSlot == slot) { // Current slot has an explicitly defined value. - Parent.Host.Assert(islot < src.Count); - MapKey(ref srcValues[islot], ref dstValues[slot]); - nextExplicitSlot = ++islot == src.Count ? srcSize : srcIndices[islot]; + Parent.Host.Assert(islot < srcCount); + MapKey(in srcValues[islot], ref dstValues[slot]); + nextExplicitSlot = ++islot == srcCount ? srcSize : srcIndices[islot]; Parent.Host.Assert(slot < nextExplicitSlot); } else @@ -434,12 +434,12 @@ public override Delegate GetMappingGetter(IRow input) // As the default value equals the NA value for the output type, we produce sparse output. Utils.EnsureSize(ref dstValues, srcCount, maxSize, keepOld: false); Utils.EnsureSize(ref dstIndices, srcCount, maxSize, keepOld: false); - var srcIndices = src.Indices; + var srcIndices = src.GetIndices(); for (int islotSrc = 0; islotSrc < srcCount; ++islotSrc) { // Current slot has an explicitly defined value. Parent.Host.Assert(islotSrc < srcCount); - MapKey(ref srcValues[islotSrc], ref dstItem); + MapKey(in srcValues[islotSrc], ref dstItem); if (!_isDefault(in dstItem)) { dstValues[islotDst] = dstItem; diff --git a/src/Microsoft.ML.Data/Transforms/KeyToVectorTransform.cs b/src/Microsoft.ML.Data/Transforms/KeyToVectorTransform.cs index a076dc452b..79500df97e 100644 --- a/src/Microsoft.ML.Data/Transforms/KeyToVectorTransform.cs +++ b/src/Microsoft.ML.Data/Transforms/KeyToVectorTransform.cs @@ -523,8 +523,8 @@ private ValueGetter> MakeGetterBag(IRow input, int iinfo) Host.Check(cv == 0 || src.Length == cv); // The indices are irrelevant in the bagging case. - var values = src.Values; - int count = src.Count; + var values = src.GetValues(); + int count = values.Length; for (int slot = 0; slot < count; slot++) { uint key = values[slot] - 1; @@ -564,17 +564,11 @@ private ValueGetter> MakeGetterInd(IRow input, int iinfo) Host.Check(lenSrc == cv || cv == 0); // Since we generate values in order, no need for a builder. 
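            // Illustrative sketch, not part of this patch: the "next explicit slot" walk used in the sparse
            // branch of the KeyToValueTransform hunk above, restated on its own. It visits every logical
            // slot, reading explicit entries from GetValues()/GetIndices() and treating the rest as
            // default. "VisitAll" is a hypothetical helper.
            private static void VisitAll(in VBuffer<uint> src, Action<int, uint> visit)
            {
                var srcValues = src.GetValues();
                if (src.IsDense)
                {
                    for (int slot = 0; slot < srcValues.Length; ++slot)
                        visit(slot, srcValues[slot]);
                    return;
                }
                var srcIndices = src.GetIndices();
                int nextExplicitSlot = srcValues.Length == 0 ? src.Length : srcIndices[0];
                int islot = 0;
                for (int slot = 0; slot < src.Length; ++slot)
                {
                    if (slot == nextExplicitSlot)
                    {
                        // Current slot has an explicitly stored value.
                        visit(slot, srcValues[islot]);
                        nextExplicitSlot = ++islot == srcValues.Length ? src.Length : srcIndices[islot];
                    }
                    else
                        visit(slot, 0); // Implicit entry: default(uint).
                }
            }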
- var valuesDst = dst.Values; - var indicesDst = dst.Indices; - int lenDst = checked(size * lenSrc); - int cntSrc = src.Count; - if (Utils.Size(valuesDst) < cntSrc) - valuesDst = new float[cntSrc]; - if (Utils.Size(indicesDst) < cntSrc) - indicesDst = new int[cntSrc]; + var values = src.GetValues(); + int cntSrc = values.Length; + var editor = VBufferEditor.Create(ref dst, lenDst, cntSrc); - var values = src.Values; int count = 0; if (src.IsDense) { @@ -585,24 +579,24 @@ private ValueGetter> MakeGetterInd(IRow input, int iinfo) uint key = values[slot] - 1; if (key >= (uint)size) continue; - valuesDst[count] = 1; - indicesDst[count++] = slot * size + (int)key; + editor.Values[count] = 1; + editor.Indices[count++] = slot * size + (int)key; } } else { - var indices = src.Indices; + var indices = src.GetIndices(); for (int islot = 0; islot < cntSrc; islot++) { Host.Assert(count < cntSrc); uint key = values[islot] - 1; if (key >= (uint)size) continue; - valuesDst[count] = 1; - indicesDst[count++] = indices[islot] * size + (int)key; + editor.Values[count] = 1; + editor.Indices[count++] = indices[islot] * size + (int)key; } } - dst = new VBuffer(lenDst, count, valuesDst, indicesDst); + dst = editor.CommitTruncated(count); }; } diff --git a/src/Microsoft.ML.Data/Transforms/NormalizeColumnSng.cs b/src/Microsoft.ML.Data/Transforms/NormalizeColumnSng.cs index 7015814e70..cfe7465480 100644 --- a/src/Microsoft.ML.Data/Transforms/NormalizeColumnSng.cs +++ b/src/Microsoft.ML.Data/Transforms/NormalizeColumnSng.cs @@ -356,14 +356,14 @@ public void ProcessValue(in VBuffer value) var size = _min.Length; Contracts.Check(value.Length == size); _trainCount++; - var count = value.Count; + var values = value.GetValues(); + var count = values.Length; Contracts.Assert(0 <= count & count <= size); if (count == 0) return; if (count == size) { - var values = value.Values; for (int j = 0; j < count; j++) { var val = values[j]; @@ -373,8 +373,7 @@ public void ProcessValue(in VBuffer value) } else { - var indices = value.Indices; - var values = value.Values; + var indices = value.GetIndices(); for (int k = 0; k < count; k++) { var val = values[k]; @@ -459,14 +458,14 @@ public void ProcessValue(in VBuffer value) { _trainCount++; var size = _mean.Length; - var count = value.Count; + var values = value.GetValues(); + var count = values.Length; Contracts.Assert(0 <= count & count <= size); if (count == 0) return; if (count == size) { - var values = value.Values; for (int j = 0; j < count; j++) { var origVal = values[j]; @@ -475,8 +474,7 @@ public void ProcessValue(in VBuffer value) } else { - var indices = value.Indices; - var values = value.Values; + var indices = value.GetIndices(); for (int k = 0; k < count; k++) { var origVal = values[k]; @@ -706,7 +704,8 @@ private static void FillValues(in VBuffer input, BufferBuilder b { Contracts.Assert(input.Length == scale.Length); int size = scale.Length; - int count = input.Count; + var values = input.GetValues(); + int count = values.Length; Contracts.Assert(0 <= count & count <= size); // We always start with sparse, since we may make things sparser than the source. @@ -714,7 +713,6 @@ private static void FillValues(in VBuffer input, BufferBuilder b if (count == 0) return; - var values = input.Values; if (count >= size) { for (int i = 0; i < size; i++) @@ -723,7 +721,7 @@ private static void FillValues(in VBuffer input, BufferBuilder b } // The input is sparse. 
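            // Illustrative sketch, not part of this patch: CommitTruncated replaces the old
            // "new VBuffer(length, count, values, indices)" construction when the surviving entry count
            // is only known after the fill loop, as in MakeGetterInd above. The editor is sized for the
            // worst case and then truncated; requireIndicesOnDense keeps Indices available even when the
            // requested count equals the length (same reasoning as the SlotDropper change later in this
            // patch). "FilterPositive" is a hypothetical example.
            private static void FilterPositive(in VBuffer<float> src, ref VBuffer<float> dst)
            {
                var srcValues = src.GetValues();
                var editor = VBufferEditor.Create(ref dst, src.Length, srcValues.Length, requireIndicesOnDense: true);
                int count = 0;
                if (src.IsDense)
                {
                    for (int i = 0; i < srcValues.Length; i++)
                    {
                        if (srcValues[i] > 0)
                        {
                            editor.Values[count] = srcValues[i];
                            editor.Indices[count++] = i;
                        }
                    }
                }
                else
                {
                    var srcIndices = src.GetIndices();
                    for (int i = 0; i < srcValues.Length; i++)
                    {
                        if (srcValues[i] > 0)
                        {
                            editor.Values[count] = srcValues[i];
                            editor.Indices[count++] = srcIndices[i];
                        }
                    }
                }
                dst = editor.CommitTruncated(count);
            }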
- var indices = input.Indices; + var indices = input.GetIndices(); for (int ii = 0; ii < count; ii++) { int i = indices[ii]; @@ -737,7 +735,8 @@ private static void FillValues(in VBuffer input, BufferBuilder b { Contracts.Assert(input.Length == scale.Length); int size = scale.Length; - int count = input.Count; + var values = input.GetValues(); + int count = values.Length; Contracts.Assert(0 <= count & count <= size); // We always start with sparse, since we may make things sparser than the source. @@ -750,7 +749,6 @@ private static void FillValues(in VBuffer input, BufferBuilder b return; } - var values = input.Values; if (count >= size) { for (int i = 0; i < size; i++) @@ -759,7 +757,7 @@ private static void FillValues(in VBuffer input, BufferBuilder b } // The input is sparse. - var indices = input.Indices; + var indices = input.GetIndices(); int ii = 0; int ivSrc = indices[ii]; Contracts.Assert(ivSrc < size); @@ -783,7 +781,8 @@ private static void FillValues(in VBuffer input, BufferBuilder b Contracts.Assert(input.Length == scale.Length); int size = scale.Length; - int count = input.Count; + var values = input.GetValues(); + int count = values.Length; Contracts.Assert(0 <= count & count <= size); // We always start with sparse, since we may make things sparser than the source. @@ -796,7 +795,6 @@ private static void FillValues(in VBuffer input, BufferBuilder b return; } - var values = input.Values; if (count >= size) { for (int i = 0; i < size; i++) @@ -805,7 +803,7 @@ private static void FillValues(in VBuffer input, BufferBuilder b } // The input is sparse. - var indices = input.Indices; + var indices = input.GetIndices(); int ii = 0; int ivSrc = indices[ii]; int inz = 0; @@ -983,7 +981,8 @@ private static void FillValues(in VBuffer input, BufferBuilder b { Contracts.Assert(input.Length == mean.Length); int size = mean.Length; - int count = input.Count; + var values = input.GetValues(); + int count = values.Length; Contracts.Assert(0 <= count & count <= size); // We always start with sparse, since we may make things sparser than the source. @@ -992,7 +991,6 @@ private static void FillValues(in VBuffer input, BufferBuilder b if (count == 0) return; - var values = input.Values; if (count >= size) { for (int i = 0; i < size; i++) @@ -1009,7 +1007,7 @@ private static void FillValues(in VBuffer input, BufferBuilder b } // The input is sparse. - var indices = input.Indices; + var indices = input.GetIndices(); for (int ii = 0; ii < indices.Length; ii++) { var ivDst = indices[ii]; @@ -1101,14 +1099,14 @@ public override Delegate GetGetter(IRow input, int icol) (ref TFloat dst) => { getSrc(ref dst); - GetResult(ref dst, ref dst); + GetResult(dst, ref dst); }; return del; } - private void GetResult(ref TFloat input, ref TFloat value) + private void GetResult(TFloat input, ref TFloat value) { - value = BinUtils.GetValue(ref input, _binUpperBounds, _den, _offset); + value = BinUtils.GetValue(input, _binUpperBounds, _den, _offset); } } @@ -1197,7 +1195,8 @@ private void GetResult(in VBuffer input, ref VBuffer value, Buff { Contracts.Assert(input.Length == _binUpperBounds.Length); int size = _binUpperBounds.Length; - int count = input.Count; + var values = input.GetValues(); + int count = values.Length; Contracts.Assert(0 <= count & count <= size); // We always start with sparse, since we may make things sparser than the source. 
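            // Illustrative sketch, not part of this patch: the normalizer code above funnels its output
            // through a BufferBuilder rather than an editor, the other idiom this change leaves in place:
            // reset sparse, add only the features that survive, and let the builder pick the final
            // density. "ScaleInto" is a hypothetical wrapper over the Reset/AddFeature/GetResult calls
            // used above; the exact scaling is only an example.
            private static void ScaleInto(in VBuffer<float> input, float[] scale, BufferBuilder<float> bldr, ref VBuffer<float> value)
            {
                bldr.Reset(input.Length, dense: false);
                var values = input.GetValues();
                if (input.IsDense)
                {
                    for (int i = 0; i < values.Length; i++)
                        bldr.AddFeature(i, values[i] * scale[i]);
                }
                else
                {
                    var indices = input.GetIndices();
                    for (int ii = 0; ii < values.Length; ii++)
                        bldr.AddFeature(indices[ii], values[ii] * scale[indices[ii]]);
                }
                bldr.GetResult(ref value);
            }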
@@ -1208,18 +1207,17 @@ private void GetResult(in VBuffer input, ref VBuffer value, Buff return; } - var values = input.Values; if (count >= size) { if (_offset != null) { for (int i = 0; i < size; i++) - bldr.AddFeature(i, BinUtils.GetValue(ref values[i], _binUpperBounds[i], _den[i], _offset[i])); + bldr.AddFeature(i, BinUtils.GetValue(values[i], _binUpperBounds[i], _den[i], _offset[i])); } else { for (int i = 0; i < size; i++) - bldr.AddFeature(i, BinUtils.GetValue(ref values[i], _binUpperBounds[i], _den[i])); + bldr.AddFeature(i, BinUtils.GetValue(values[i], _binUpperBounds[i], _den[i])); } bldr.GetResult(ref value); return; @@ -1228,7 +1226,7 @@ private void GetResult(in VBuffer input, ref VBuffer value, Buff // The input is sparse. if (_offset != null) { - var indices = input.Indices; + var indices = input.GetIndices(); int ii = 0; int ivSrc = indices[ii]; Contracts.Assert(ivSrc < size); @@ -1239,13 +1237,13 @@ private void GetResult(in VBuffer input, ref VBuffer value, Buff if (ivDst == ivSrc) { bldr.AddFeature(ivDst, - BinUtils.GetValue(ref values[ii], _binUpperBounds[ivDst], _den[ivDst], _offset[ivDst])); + BinUtils.GetValue(values[ii], _binUpperBounds[ivDst], _den[ivDst], _offset[ivDst])); ivSrc = ++ii < count ? indices[ii] : size; Contracts.Assert(ii == count || ivSrc < size); } else bldr.AddFeature(ivDst, - BinUtils.GetValue(ref zero, _binUpperBounds[ivDst], _den[ivDst], _offset[ivDst])); + BinUtils.GetValue(zero, _binUpperBounds[ivDst], _den[ivDst], _offset[ivDst])); } } else @@ -1255,7 +1253,7 @@ private void GetResult(in VBuffer input, ref VBuffer value, Buff { int i = indices[ii]; Contracts.Assert(0 <= i & i < size); - bldr.AddFeature(i, BinUtils.GetValue(ref values[ii], _binUpperBounds[i], _den[i])); + bldr.AddFeature(i, BinUtils.GetValue(values[ii], _binUpperBounds[i], _den[i])); } } @@ -1376,7 +1374,7 @@ public static TFloat Cdf(TFloat input, TFloat mean, TFloat stddev) internal static partial class BinUtils { - public static TFloat GetValue(ref TFloat input, TFloat[] binUpperBounds, TFloat den, TFloat offset) + public static TFloat GetValue(TFloat input, TFloat[] binUpperBounds, TFloat den, TFloat offset) { if (TFloat.IsNaN(input)) return input; @@ -1387,7 +1385,7 @@ public static TFloat GetValue(ref TFloat input, TFloat[] binUpperBounds, TFloat return value; } - public static TFloat GetValue(ref TFloat input, TFloat[] binUpperBounds, TFloat den) + public static TFloat GetValue(TFloat input, TFloat[] binUpperBounds, TFloat den) { if (TFloat.IsNaN(input)) return input; @@ -1803,21 +1801,20 @@ protected override bool ProcessValue(in VBuffer buffer) int size = _values.Length; Host.Check(buffer.Length == size); - int count = buffer.Count; + var values = buffer.GetValues(); + int count = values.Length; Host.Assert(0 <= count & count <= size); if (count == 0) return true; if (count == size) { - var values = buffer.Values; for (int j = 0; j < count; j++) _values[j].Add(values[j]); } else { - var indices = buffer.Indices; - var values = buffer.Values; + var indices = buffer.GetIndices(); for (int k = 0; k < count; k++) { var val = values[k]; diff --git a/src/Microsoft.ML.Data/Transforms/TermTransformImpl.cs b/src/Microsoft.ML.Data/Transforms/TermTransformImpl.cs index 1b70ff0f2d..49372056e7 100644 --- a/src/Microsoft.ML.Data/Transforms/TermTransformImpl.cs +++ b/src/Microsoft.ML.Data/Transforms/TermTransformImpl.cs @@ -106,7 +106,7 @@ public TextImpl(bool sorted) _sorted = sorted; } - public override bool TryAdd(ref ReadOnlyMemory val) + public override bool TryAdd(in 
ReadOnlyMemory val) { if (val.IsEmpty) return false; @@ -170,7 +170,7 @@ public Impl(PrimitiveType type, InPredicate mapsToMissing, bool sort) _sort = sort; } - public override bool TryAdd(ref T val) + public override bool TryAdd(in T val) { return !_mapsToMissing(in val) && _values.TryAdd(val); } @@ -195,7 +195,7 @@ protected Builder(PrimitiveType type) /// Ensures that the item is in the set. Returns true iff it added the item. /// /// The value to consider - public abstract bool TryAdd(ref T val); + public abstract bool TryAdd(in T val); /// /// Handling for the "terms" arg. @@ -215,7 +215,7 @@ public override void ParseAddTermArg(ref ReadOnlyMemory terms, IChannel ch ch.Warning("Empty strings ignored in 'terms' specification"); else if (!tryParse(in term, out val)) throw ch.Except($"Item '{term}' in 'terms' specification could not be parsed as '{ItemType}'"); - else if (!TryAdd(ref val)) + else if (!TryAdd(in val)) ch.Warning($"Duplicate item '{term}' ignored in 'terms' specification", term); } @@ -240,7 +240,7 @@ public override void ParseAddTermArg(string[] terms, IChannel ch) ch.Warning("Empty strings ignored in 'term' specification"); else if (!tryParse(in term, out val)) ch.Warning("Item '{0}' ignored in 'term' specification since it could not be parsed as '{1}'", term, ItemType); - else if (!TryAdd(ref val)) + else if (!TryAdd(in val)) ch.Warning("Duplicate item '{0}' ignored in 'term' specification", term); } @@ -361,7 +361,7 @@ public sealed override bool ProcessRow() if (_remaining <= 0) return false; _getter(ref _val); - return !_bldr.TryAdd(ref _val) || --_remaining > 0; + return !_bldr.TryAdd(in _val) || --_remaining > 0; } } @@ -381,10 +381,10 @@ public ImplVec(ValueGetter> getter, int max, Builder bldr) _bldr = bldr; } - private bool AccumAndDecrement(ref T val) + private bool AccumAndDecrement(in T val) { Contracts.Assert(_remaining > 0); - return !_bldr.TryAdd(ref val) || --_remaining > 0; + return !_bldr.TryAdd(in val) || --_remaining > 0; } public sealed override bool ProcessRow() @@ -393,11 +393,12 @@ public sealed override bool ProcessRow() if (_remaining <= 0) return false; _getter(ref _val); + var values = _val.GetValues(); if (_val.IsDense || _addedDefaultFromSparse) { - for (int i = 0; i < _val.Count; ++i) + for (int i = 0; i < values.Length; ++i) { - if (!AccumAndDecrement(ref _val.Values[i])) + if (!AccumAndDecrement(in values[i])) return false; } return true; @@ -412,21 +413,22 @@ public sealed override bool ProcessRow() // excited about the slight inefficiency of that first if check. Contracts.Assert(!_val.IsDense && !_addedDefaultFromSparse); T def = default(T); - for (int i = 0; i < _val.Count; ++i) + var valIndices = _val.GetIndices(); + for (int i = 0; i < values.Length; ++i) { - if (!_addedDefaultFromSparse && _val.Indices[i] != i) + if (!_addedDefaultFromSparse && valIndices[i] != i) { _addedDefaultFromSparse = true; - if (!AccumAndDecrement(ref def)) + if (!AccumAndDecrement(in def)) return false; } - if (!AccumAndDecrement(ref _val.Values[i])) + if (!AccumAndDecrement(in values[i])) return false; } if (!_addedDefaultFromSparse) { _addedDefaultFromSparse = true; - if (!AccumAndDecrement(ref def)) + if (!AccumAndDecrement(in def)) return false; } return true; @@ -960,15 +962,15 @@ public override Delegate GetMappingGetter(IRow input) bldr.Reset(cval, dense: false); - var values = src.Values; - var indices = !src.IsDense ? 
src.Indices : null; - int count = src.Count; + var values = src.GetValues(); + var indices = src.GetIndices(); + int count = values.Length; for (int islot = 0; islot < count; islot++) { map(in values[islot], ref dstItem); if (dstItem != 0) { - int slot = indices != null ? indices[islot] : islot; + int slot = !src.IsDense ? indices[islot] : islot; bldr.AddFeature(slot, dstItem); } } @@ -998,7 +1000,7 @@ public override Delegate GetMappingGetter(IRow input) // unrecognized items. bldr.Reset(cval, dense: false); - var values = src.Values; + var values = src.GetValues(); if (src.IsDense) { for (int slot = 0; slot < src.Length; ++slot) @@ -1010,19 +1012,19 @@ public override Delegate GetMappingGetter(IRow input) } else { - var indices = src.Indices; - int nextExplicitSlot = src.Count == 0 ? src.Length : indices[0]; + var indices = src.GetIndices(); + int nextExplicitSlot = indices.Length == 0 ? src.Length : indices[0]; int islot = 0; for (int slot = 0; slot < src.Length; ++slot) { if (nextExplicitSlot == slot) { // This was an explicitly defined value. - _host.Assert(islot < src.Count); + _host.Assert(islot < values.Length); map(in values[islot], ref dstItem); if (dstItem != 0) bldr.AddFeature(slot, dstItem); - nextExplicitSlot = ++islot == src.Count ? src.Length : indices[islot]; + nextExplicitSlot = ++islot == indices.Length ? src.Length : indices[islot]; } else { diff --git a/src/Microsoft.ML.Data/Utilities/SlotDropper.cs b/src/Microsoft.ML.Data/Utilities/SlotDropper.cs index 64b510a655..188f8e72b5 100644 --- a/src/Microsoft.ML.Data/Utilities/SlotDropper.cs +++ b/src/Microsoft.ML.Data/Utilities/SlotDropper.cs @@ -104,11 +104,10 @@ public void DropSlots(ref VBuffer src, ref VBuffer dst) } int newLength = DstLength == 0 ? ComputeLength(src.Length) : DstLength; - var values = dst.Values; if (newLength == 0) { // All slots dropped. - dst = new VBuffer(1, 0, dst.Values, dst.Indices); + VBufferUtils.Resize(ref dst, 1, 0); return; } @@ -116,12 +115,11 @@ public void DropSlots(ref VBuffer src, ref VBuffer dst) // End of the trivial cases // At this point, we need to drop some slots and keep some slots. + VBufferEditor editor; + var srcValues = src.GetValues(); if (src.IsDense) { - Contracts.Assert(Utils.Size(values) == Utils.Size(src.Values) || src.Values != dst.Values); - - if (Utils.Size(values) < newLength) - values = new TDst[newLength]; + editor = VBufferEditor.Create(ref dst, newLength); int iDst = 0; int iSrc = 0; @@ -131,33 +129,33 @@ public void DropSlots(ref VBuffer src, ref VBuffer dst) while (iSrc < lim) { Contracts.Assert(iDst <= iSrc); - values[iDst++] = src.Values[iSrc++]; + editor.Values[iDst++] = srcValues[iSrc++]; } iSrc = SlotsMax[i] + 1; } while (iSrc < src.Length) { Contracts.Assert(iDst <= iSrc); - values[iDst++] = src.Values[iSrc++]; + editor.Values[iDst++] = srcValues[iSrc++]; } Contracts.Assert(iDst == newLength); - dst = new VBuffer(newLength, values, dst.Indices); + dst = editor.Commit(); return; } // Sparse case. // Approximate new count is min(#indices, newLength). 
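            // Illustrative note, not part of this patch: alongside hiding Count, the TermTransform
            // builder changes above move read-only inputs from "ref" to "in" (TryAdd(in T val),
            // AccumAndDecrement(in T val)). "in" passes large structs such as VBuffer by reference
            // without letting the callee reassign the caller's variable, so call sites read as
            // TryAdd(in val) rather than TryAdd(ref val). A minimal sketch of the convention:
            private static bool IsEmpty(in VBuffer<float> src)
            {
                // src cannot be reassigned here; reads stay cheap because no copy of the struct is made.
                return src.GetValues().Length == 0;
            }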
- var newCount = Math.Min(src.Count, newLength); - var indices = dst.Indices; + var newCount = Math.Min(srcValues.Length, newLength); + var indices = dst.GetIndices(); + var srcIndices = src.GetIndices(); Contracts.Assert(newCount <= src.Length); - Contracts.Assert(Utils.Size(values) == Utils.Size(src.Values) || src.Values != dst.Values); - Contracts.Assert(Utils.Size(indices) == Utils.Size(src.Indices) || src.Indices != dst.Indices); - if (Utils.Size(indices) < newCount) - indices = new int[newCount]; - if (Utils.Size(values) < newCount) - values = new TDst[newCount]; + editor = VBufferEditor.Create( + ref dst, + newLength, + newCount, + requireIndicesOnDense: true); int iiDst = 0; int iiSrc = 0; @@ -167,15 +165,15 @@ public void DropSlots(ref VBuffer src, ref VBuffer dst) // REVIEW: Consider using a BitArray with the slots to keep instead of SlotsMax. It would // only make sense when the number of ranges is greater than the number of slots divided by 32. int max = SlotsMax[iRange]; - while (iiSrc < src.Count) + while (iiSrc < srcValues.Length) { // Copy (with offset) the elements before the current range. - var index = src.Indices[iiSrc]; + var index = srcIndices[iiSrc]; if (index < min) { Contracts.Assert(iiDst <= iiSrc); - indices[iiDst] = index - iOffset; - values[iiDst++] = src.Values[iiSrc++]; + editor.Indices[iiDst] = index - iOffset; + editor.Values[iiDst++] = srcValues[iiSrc++]; continue; } if (index <= max) @@ -211,7 +209,7 @@ public void DropSlots(ref VBuffer src, ref VBuffer dst) Contracts.Assert(index <= max); } - dst = new VBuffer(newLength, iiDst, values, indices); + dst = editor.CommitTruncated(iiDst); } } } diff --git a/src/Microsoft.ML.Ensemble/EnsembleUtils.cs b/src/Microsoft.ML.Ensemble/EnsembleUtils.cs index 275594e17b..66a6ff165e 100644 --- a/src/Microsoft.ML.Ensemble/EnsembleUtils.cs +++ b/src/Microsoft.ML.Ensemble/EnsembleUtils.cs @@ -47,27 +47,20 @@ public static void SelectFeatures(in VBuffer src, BitArray includedIndices Contracts.Assert(cardinality == Utils.GetCardinality(includedIndices)); Contracts.Assert(cardinality < src.Length); - var values = dst.Values; - var indices = dst.Indices; - var srcValues = src.GetValues(); if (src.IsDense) { if (cardinality >= src.Length / 2) { T defaultValue = default; - if (Utils.Size(values) < src.Length) - values = new T[src.Length]; + var editor = VBufferEditor.Create(ref dst, src.Length); for (int i = 0; i < srcValues.Length; i++) - values[i] = !includedIndices[i] ? defaultValue : srcValues[i]; - dst = new VBuffer(src.Length, values, indices); + editor.Values[i] = !includedIndices[i] ? 
defaultValue : srcValues[i]; + dst = editor.Commit(); } else { - if (Utils.Size(values) < cardinality) - values = new T[cardinality]; - if (Utils.Size(indices) < cardinality) - indices = new int[cardinality]; + var editor = VBufferEditor.Create(ref dst, src.Length, cardinality); int count = 0; for (int i = 0; i < srcValues.Length; i++) @@ -75,28 +68,19 @@ public static void SelectFeatures(in VBuffer src, BitArray includedIndices if (includedIndices[i]) { Contracts.Assert(count < cardinality); - values[count] = srcValues[i]; - indices[count] = i; + editor.Values[count] = srcValues[i]; + editor.Indices[count] = i; count++; } } Contracts.Assert(count == cardinality); - dst = new VBuffer(src.Length, count, values, indices); + dst = editor.Commit(); } } else { - int valuesSize = Utils.Size(values); - int indicesSize = Utils.Size(indices); - - if (valuesSize < srcValues.Length || indicesSize < srcValues.Length) - { - if (valuesSize < cardinality) - values = new T[cardinality]; - if (indicesSize < cardinality) - indices = new int[cardinality]; - } + var editor = VBufferEditor.Create(ref dst, src.Length, cardinality); int count = 0; var srcIndices = src.GetIndices(); @@ -104,13 +88,13 @@ public static void SelectFeatures(in VBuffer src, BitArray includedIndices { if (includedIndices[srcIndices[i]]) { - values[count] = srcValues[i]; - indices[count] = srcIndices[i]; + editor.Values[count] = srcValues[i]; + editor.Indices[count] = srcIndices[i]; count++; } } - dst = new VBuffer(src.Length, count, values, indices); + dst = editor.CommitTruncated(count); } } } diff --git a/src/Microsoft.ML.Ensemble/OutputCombiners/BaseMultiAverager.cs b/src/Microsoft.ML.Ensemble/OutputCombiners/BaseMultiAverager.cs index 15985316fc..e7a50c11c3 100644 --- a/src/Microsoft.ML.Ensemble/OutputCombiners/BaseMultiAverager.cs +++ b/src/Microsoft.ML.Ensemble/OutputCombiners/BaseMultiAverager.cs @@ -35,14 +35,11 @@ protected void CombineCore(ref VBuffer dst, VBuffer[] src, Singl return; } - var values = dst.Values; - if (Utils.Size(values) < len) - values = new Single[len]; - else - Array.Clear(values, 0, len); - + var editor = VBufferEditor.Create(ref dst, len); + if (!editor.CreatedNewValues) + editor.Values.Clear(); // Set the output to values. 
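            // Illustrative sketch, not part of this patch: when an editor is used as an accumulator, the
            // old "Array.Clear unless freshly allocated" logic becomes a check on editor.CreatedNewValues,
            // as in the CombineCore change above, since newly allocated backing memory is already zeroed.
            // "AccumulateInto" is hypothetical and assumes every source buffer has Length == len.
            private static void AccumulateInto(ref VBuffer<float> dst, VBuffer<float>[] src, int len)
            {
                var editor = VBufferEditor.Create(ref dst, len);
                if (!editor.CreatedNewValues)
                    editor.Values.Clear();
                foreach (var buffer in src)
                {
                    var values = buffer.GetValues();
                    if (buffer.IsDense)
                    {
                        for (int i = 0; i < values.Length; i++)
                            editor.Values[i] += values[i];
                    }
                    else
                    {
                        var indices = buffer.GetIndices();
                        for (int i = 0; i < values.Length; i++)
                            editor.Values[indices[i]] += values[i];
                    }
                }
                dst = editor.Commit();
            }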
- dst = new VBuffer(len, values, dst.Indices); + dst = editor.Commit(); Single weightTotal; if (weights == null) diff --git a/src/Microsoft.ML.Ensemble/OutputCombiners/BaseMultiCombiner.cs b/src/Microsoft.ML.Ensemble/OutputCombiners/BaseMultiCombiner.cs index cea9c698ae..350833aebb 100644 --- a/src/Microsoft.ML.Ensemble/OutputCombiners/BaseMultiCombiner.cs +++ b/src/Microsoft.ML.Ensemble/OutputCombiners/BaseMultiCombiner.cs @@ -98,12 +98,10 @@ protected bool TryNormalize(VBuffer[] values) protected void GetNaNOutput(ref VBuffer dst, int len) { Contracts.Assert(len >= 0); - var values = dst.Values; - if (Utils.Size(values) < len) - values = new Single[len]; + var editor = VBufferEditor.Create(ref dst, len); for (int i = 0; i < len; i++) - values[i] = Single.NaN; - dst = new VBuffer(len, values, dst.Indices); + editor.Values[i] = Single.NaN; + dst = editor.Commit(); } } } diff --git a/src/Microsoft.ML.Ensemble/OutputCombiners/BaseScalarStacking.cs b/src/Microsoft.ML.Ensemble/OutputCombiners/BaseScalarStacking.cs index 9b9900a65f..dbe1517f22 100644 --- a/src/Microsoft.ML.Ensemble/OutputCombiners/BaseScalarStacking.cs +++ b/src/Microsoft.ML.Ensemble/OutputCombiners/BaseScalarStacking.cs @@ -25,11 +25,9 @@ protected override void FillFeatureBuffer(Single[] src, ref VBuffer dst) { Contracts.AssertNonEmpty(src); int len = src.Length; - var values = dst.Values; - if (Utils.Size(values) < len) - values = new Single[len]; - Array.Copy(src, values, len); - dst = new VBuffer(len, values, dst.Indices); + var editor = VBufferEditor.Create(ref dst, len); + src.CopyTo(editor.Values); + dst = editor.Commit(); } } } diff --git a/src/Microsoft.ML.Ensemble/OutputCombiners/MultiMedian.cs b/src/Microsoft.ML.Ensemble/OutputCombiners/MultiMedian.cs index 86312393de..3b11146203 100644 --- a/src/Microsoft.ML.Ensemble/OutputCombiners/MultiMedian.cs +++ b/src/Microsoft.ML.Ensemble/OutputCombiners/MultiMedian.cs @@ -81,9 +81,7 @@ public override Combiner> GetCombiner() return; } - var values = dst.Values; - if (Utils.Size(values) < len) - values = new Single[len]; + var editor = VBufferEditor.Create(ref dst, len); int count = src.Length; if (Utils.Size(raw) < count) @@ -92,11 +90,11 @@ public override Combiner> GetCombiner() { for (int j = 0; j < count; j++) raw[j] = i < src[j].Length ? src[j].GetItemOrDefault(i) : 0; - values[i] = MathUtils.GetMedianInPlace(raw, count); + editor.Values[i] = MathUtils.GetMedianInPlace(raw, count); } // Set the output to values. 
- dst = new VBuffer(len, values, dst.Indices); + dst = editor.Commit(); }; } } diff --git a/src/Microsoft.ML.Ensemble/OutputCombiners/MultiStacking.cs b/src/Microsoft.ML.Ensemble/OutputCombiners/MultiStacking.cs index f02d692f54..4e352e8265 100644 --- a/src/Microsoft.ML.Ensemble/OutputCombiners/MultiStacking.cs +++ b/src/Microsoft.ML.Ensemble/OutputCombiners/MultiStacking.cs @@ -83,19 +83,17 @@ protected override void FillFeatureBuffer(VBuffer[] src, ref VBuffer(len, values, dst.Indices); + var editor = VBufferEditor.Create(ref dst, len); int iv = 0; for (int i = 0; i < src.Length; i++) { - src[i].CopyTo(values, iv); + src[i].CopyTo(editor.Values, iv); iv += src[i].Length; Contracts.Assert(iv <= len); } Contracts.Assert(iv == len); + dst = editor.Commit(); } } } diff --git a/src/Microsoft.ML.Ensemble/OutputCombiners/MultiVoting.cs b/src/Microsoft.ML.Ensemble/OutputCombiners/MultiVoting.cs index 75fd4f5222..ffa1b9c647 100644 --- a/src/Microsoft.ML.Ensemble/OutputCombiners/MultiVoting.cs +++ b/src/Microsoft.ML.Ensemble/OutputCombiners/MultiVoting.cs @@ -77,16 +77,14 @@ private void CombineCore(ref VBuffer dst, VBuffer[] src, Single[ int count = Utils.Size(src); if (count == 0) { - dst = new VBuffer(0, dst.Values, dst.Indices); + VBufferUtils.Resize(ref dst, 0); return; } int len = GetClassCount(src); - var values = dst.Values; - if (Utils.Size(values) < len) - values = new Single[len]; - else - Array.Clear(values, 0, len); + var editor = VBufferEditor.Create(ref dst, len); + if (!editor.CreatedNewValues) + editor.Values.Clear(); int voteCount = 0; for (int i = 0; i < count; i++) @@ -94,17 +92,17 @@ private void CombineCore(ref VBuffer dst, VBuffer[] src, Single[ int index = VectorUtils.ArgMax(in src[i]); if (index >= 0) { - values[index]++; + editor.Values[index]++; voteCount++; } } // Normalize by dividing by the number of votes. for (int i = 0; i < len; i++) - values[i] /= voteCount; + editor.Values[i] /= voteCount; // Set the output to values. - dst = new VBuffer(len, values, dst.Indices); + dst = editor.Commit(); } } } diff --git a/src/Microsoft.ML.FastTree/BinFile/BinFinder.cs b/src/Microsoft.ML.FastTree/BinFile/BinFinder.cs index 782d85f24a..eaae7d5448 100644 --- a/src/Microsoft.ML.FastTree/BinFile/BinFinder.cs +++ b/src/Microsoft.ML.FastTree/BinFile/BinFinder.cs @@ -17,6 +17,7 @@ internal sealed class BinFinder { private readonly GreedyBinFinder _finder; private double[] _distinctValues; + private double[] _distinctCountsBuffer; private int[] _counts; private static double[] _trivialBinUpperBounds; // Will be initialized to a single element positive infinity array. @@ -43,15 +44,19 @@ public BinFinder() /// The scheme is destructive, because it modifies the arrays within . /// /// The values we are binning + /// A buffer space to work over the values, so the original + /// values aren't modified. 
/// This working array will be filled with a sorted list of the /// distinct values detected within /// This working array will be filled with a sorted list of the distinct /// values detected within /// The logical length of both and /// - private int FindDistinctCounts(in VBuffer values, double[] distinctValues, int[] counts) + private int FindDistinctCounts(in VBuffer values, double[] valueBuffer, double[] distinctValues, int[] counts) { - if (values.Count == 0) + var explicitValues = values.GetValues(); + var explicitValuesCount = explicitValues.Length; + if (explicitValuesCount == 0) { if (values.Length == 0) return 0; @@ -59,30 +64,31 @@ private int FindDistinctCounts(in VBuffer values, double[] distinctValue counts[0] = values.Length; return 1; } - var valArray = values.Values; // Get histogram of values - Array.Sort(valArray, 0, values.Count); + Contracts.Assert(valueBuffer.Length >= explicitValuesCount); + explicitValues.CopyTo(valueBuffer); + Array.Sort(valueBuffer, 0, explicitValuesCount); // Note that Array.Sort will, by MSDN documentation, make NaN be the first item of a sorted // list (that is, NaN is considered to be ordered "below" any other value for the purpose of // a sort, including negative infinity). So when checking if values contains no NaN values, it // suffices to check only the first item. - if (double.IsNaN(valArray[0])) + if (double.IsNaN(valueBuffer[0])) return -1; int idist = 0; // Index into the "distinct" arrays. - if (!values.IsDense && valArray[0] > 0) + if (!values.IsDense && valueBuffer[0] > 0) { // Implicit zeros at the head. distinctValues[0] = 0; - counts[0] = values.Length - values.Count; + counts[0] = values.Length - explicitValuesCount; idist = 1; } - double last = distinctValues[idist] = valArray[0]; + double last = distinctValues[idist] = valueBuffer[0]; counts[idist] = 1; - for (int i = 1; i < values.Count; ++i) + for (int i = 1; i < explicitValuesCount; ++i) { - double curr = valArray[i]; + double curr = valueBuffer[i]; if (curr != last) { Contracts.Assert(curr > last); @@ -92,7 +98,7 @@ private int FindDistinctCounts(in VBuffer values, double[] distinctValue { // This boundary is going from negative, to non-negative, and there are "implicit" zeros. distinctValues[idist] = 0; - counts[idist] = values.Length - values.Count; + counts[idist] = values.Length - explicitValuesCount; if (curr == 0) { // No need to do any more work. @@ -117,7 +123,7 @@ private int FindDistinctCounts(in VBuffer values, double[] distinctValue { // Implicit zeros at the tail. distinctValues[++idist] = 0; - counts[idist] = values.Length - values.Count; + counts[idist] = values.Length - explicitValuesCount; } return idist + 1; @@ -224,17 +230,19 @@ public bool FindBins(in VBuffer values, int maxBins, int minPerLeaf, out Contracts.Assert(maxBins > 0); Contracts.Assert(minPerLeaf >= 0); - if (values.Count == 0) + var valuesCount = values.GetValues().Length; + if (valuesCount == 0) { binUpperBounds = TrivialBinUpperBounds; return true; } - int arraySize = values.IsDense ? values.Count : values.Count + 1; + int arraySize = values.IsDense ? 
valuesCount : valuesCount + 1; + Utils.EnsureSize(ref _distinctCountsBuffer, arraySize, arraySize, keepOld: false); Utils.EnsureSize(ref _distinctValues, arraySize, arraySize, keepOld: false); Utils.EnsureSize(ref _counts, arraySize, arraySize, keepOld: false); - int numValues = FindDistinctCounts(in values, _distinctValues, _counts); + int numValues = FindDistinctCounts(in values, _distinctCountsBuffer, _distinctValues, _counts); if (numValues < 0) { binUpperBounds = null; diff --git a/src/Microsoft.ML.FastTree/FastTree.cs b/src/Microsoft.ML.FastTree/FastTree.cs index 9728598d3b..06faf2f292 100644 --- a/src/Microsoft.ML.FastTree/FastTree.cs +++ b/src/Microsoft.ML.FastTree/FastTree.cs @@ -1357,20 +1357,18 @@ private ValueMapper, VBuffer> GetCopier(ColumnType itemT return (in VBuffer src, ref VBuffer dst) => { - var indices = dst.Indices; - var values = dst.Values; - if (src.Count > 0) + var srcValues = src.GetValues(); + var editor = VBufferEditor.Create(ref dst, src.Length, srcValues.Length); + if (srcValues.Length > 0) { if (!src.IsDense) { - Utils.EnsureSize(ref indices, src.Count); - Array.Copy(src.Indices, indices, src.Count); + src.GetIndices().CopyTo(editor.Indices); } - Utils.EnsureSize(ref values, src.Count); - for (int i = 0; i < src.Count; ++i) - conv(in src.Values[i], ref values[i]); + for (int i = 0; i < srcValues.Length; ++i) + conv(in srcValues[i], ref editor.Values[i]); } - dst = new VBuffer(src.Length, src.Count, values, indices); + dst = editor.Commit(); }; } diff --git a/src/Microsoft.ML.FastTree/TreeEnsemble/RegressionTree.cs b/src/Microsoft.ML.FastTree/TreeEnsemble/RegressionTree.cs index 05d809f8b4..834a4188f1 100644 --- a/src/Microsoft.ML.FastTree/TreeEnsemble/RegressionTree.cs +++ b/src/Microsoft.ML.FastTree/TreeEnsemble/RegressionTree.cs @@ -762,7 +762,7 @@ public int GetLeaf(in VBuffer feat) { // REVIEW: This really should validate feat.Length! 
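        // Illustrative sketch, not part of this patch: GetValues() exposes a read-only view, so code
        // that used to sort or otherwise mutate buffer.Values in place (like FindDistinctCounts in the
        // BinFinder change above) now copies the explicit values into a reusable scratch array first.
        // "_scratch" and "SortedCopy" are hypothetical.
        private double[] _scratch; // reused across calls, grown on demand

        private ReadOnlySpan<double> SortedCopy(in VBuffer<double> values)
        {
            var explicitValues = values.GetValues();
            Utils.EnsureSize(ref _scratch, explicitValues.Length, keepOld: false);
            explicitValues.CopyTo(_scratch);
            Array.Sort(_scratch, 0, explicitValues.Length);
            return new ReadOnlySpan<double>(_scratch, 0, explicitValues.Length);
        }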
if (feat.IsDense) - return GetLeafCore(feat.Values); + return GetLeafCore(feat.GetValues()); return GetLeafCore(feat.GetIndices(), feat.GetValues()); } @@ -778,7 +778,7 @@ private int GetLeafFrom(in VBuffer feat, int root) } if (feat.IsDense) - return GetLeafCore(feat.Values, root: root); + return GetLeafCore(feat.GetValues(), root: root); return GetLeafCore(feat.GetIndices(), feat.GetValues(), root: root); } @@ -796,7 +796,7 @@ public int GetLeaf(in VBuffer feat, ref List path) path.Clear(); if (feat.IsDense) - return GetLeafCore(feat.Values, path); + return GetLeafCore(feat.GetValues(), path); return GetLeafCore(feat.GetIndices(), feat.GetValues(), path); } @@ -816,9 +816,8 @@ private Float GetFeatureValue(Float x, int node) } } - private int GetLeafCore(Float[] nonBinnedInstance, List path = null, int root = 0) + private int GetLeafCore(ReadOnlySpan nonBinnedInstance, List path = null, int root = 0) { - Contracts.AssertValue(nonBinnedInstance); Contracts.Assert(path == null || path.Count == 0); Contracts.Assert(root >= 0); diff --git a/src/Microsoft.ML.HalLearners/OlsLinearRegression.cs b/src/Microsoft.ML.HalLearners/OlsLinearRegression.cs index 405310ec50..48ff08b67b 100644 --- a/src/Microsoft.ML.HalLearners/OlsLinearRegression.cs +++ b/src/Microsoft.ML.HalLearners/OlsLinearRegression.cs @@ -278,9 +278,11 @@ private OlsLinearRegressionPredictor TrainCore(IChannel ch, FloatLabelCursor.Fac for (int i = 0; i < beta.Length; ++i) ch.Check(FloatUtils.IsFinite(beta[i]), "Non-finite values detected in OLS solution"); - var weights = VBufferUtils.CreateDense(beta.Length - 1); + var weightsValues = new float[beta.Length - 1]; for (int i = 1; i < beta.Length; ++i) - weights.Values[i - 1] = (float)beta[i]; + weightsValues[i - 1] = (float)beta[i]; + var weights = new VBuffer(weightsValues.Length, weightsValues); + var bias = (float)beta[0]; if (!(_l2Weight > 0) && m == n) { @@ -670,8 +672,9 @@ private OlsLinearRegressionPredictor(IHostEnvironment env, ModelLoadContext ctx) _tValues = ctx.Reader.ReadDoubleArray(m); TValueCheckDecode(Bias, _tValues[0]); + var weightValues = Weight.GetValues(); for (int i = 1; i < m; ++i) - TValueCheckDecode(Weight.Values[i - 1], _tValues[i]); + TValueCheckDecode(weightValues[i - 1], _tValues[i]); _pValues = ctx.Reader.ReadDoubleArray(m); for (int i = 0; i < m; ++i) @@ -747,7 +750,7 @@ public override void SaveSummary(TextWriter writer, RoleMappedSchema schema) const string format = "{0}\t{1}\t{2}\t{3:g4}\t{4:g4}\t{5:e4}"; writer.WriteLine(format, "", "Bias", Bias, _standardErrors[0], _tValues[0], _pValues[0]); Contracts.Assert(Weight.IsDense); - var coeffs = Weight.Values; + var coeffs = Weight.GetValues(); for (int i = 0; i < coeffs.Length; i++) { var name = names.GetItemOrDefault(i); @@ -762,7 +765,7 @@ public override void SaveSummary(TextWriter writer, RoleMappedSchema schema) const string format = "{0}\t{1}\t{2}"; writer.WriteLine(format, "", "Bias", Bias); Contracts.Assert(Weight.IsDense); - var coeffs = Weight.Values; + var coeffs = Weight.GetValues(); for (int i = 0; i < coeffs.Length; i++) { var name = names.GetItemOrDefault(i); @@ -779,18 +782,16 @@ public override void GetFeatureWeights(ref VBuffer weights) return; } - var values = weights.Values; var size = _pValues.Length - 1; - if (Utils.Size(values) < size) - values = new float[size]; + var editor = VBufferEditor.Create(ref weights, size); for (int i = 0; i < size; i++) { var score = -(float)Math.Log(_pValues[i + 1]); if (score > float.MaxValue) score = float.MaxValue; - values[i] = score; + 
editor.Values[i] = score; } - weights = new VBuffer(size, values, weights.Indices); + weights = editor.Commit(); } } } diff --git a/src/Microsoft.ML.HalLearners/SymSgdClassificationTrainer.cs b/src/Microsoft.ML.HalLearners/SymSgdClassificationTrainer.cs index 0e2d6720d8..23280c6176 100644 --- a/src/Microsoft.ML.HalLearners/SymSgdClassificationTrainer.cs +++ b/src/Microsoft.ML.HalLearners/SymSgdClassificationTrainer.cs @@ -654,6 +654,8 @@ private TPredictor TrainCore(IChannel ch, RoleMappedData data, LinearPredictor p else weights = VBufferUtils.CreateDense(numFeatures); + var weightsEditor = VBufferEditor.CreateFromBuffer(ref weights); + // Reference: Parasail. SymSGD. bool tuneLR = _args.LearningRate == null; var lr = _args.LearningRate ?? 1.0f; @@ -688,7 +690,7 @@ private TPredictor TrainCore(IChannel ch, RoleMappedData data, LinearPredictor p pch.SetHeader(new ProgressHeader(new[] { "iterations" }), entry => entry.SetProgress(0, state.PassIteration, _args.NumberOfIterations)); // If fully loaded, call the SymSGDNative and do not come back until learned for all iterations. - Native.LearnAll(inputDataManager, tuneLR, ref lr, l2Const, piw, weights.Values, ref bias, numFeatures, + Native.LearnAll(inputDataManager, tuneLR, ref lr, l2Const, piw, weightsEditor.Values, ref bias, numFeatures, _args.NumberOfIterations, numThreads, tuneNumLocIter, ref numLocIter, _args.Tolerance, _args.Shuffle, shouldInitialize, stateGCHandle); shouldInitialize = false; } @@ -709,7 +711,7 @@ private TPredictor TrainCore(IChannel ch, RoleMappedData data, LinearPredictor p // If all of this leaves us with 0 passes, then set numPassesForThisBatch to 1 numPassesForThisBatch = Math.Max(1, numPassesForThisBatch); state.PassIteration = iter; - Native.LearnAll(inputDataManager, tuneLR, ref lr, l2Const, piw, weights.Values, ref bias, numFeatures, + Native.LearnAll(inputDataManager, tuneLR, ref lr, l2Const, piw, weightsEditor.Values, ref bias, numFeatures, numPassesForThisBatch, numThreads, tuneNumLocIter, ref numLocIter, _args.Tolerance, _args.Shuffle, shouldInitialize, stateGCHandle); shouldInitialize = false; @@ -730,7 +732,7 @@ private TPredictor TrainCore(IChannel ch, RoleMappedData data, LinearPredictor p // Maps back the dense features that are mislocated if (numThreads > 1) - Native.MapBackWeightVector(weights.Values, stateGCHandle); + Native.MapBackWeightVector(weightsEditor.Values, stateGCHandle); Native.DeallocateSequentially(stateGCHandle); } } @@ -784,7 +786,7 @@ private static extern void LearnAll(int totalNumInstances, int* instSizes, int** /// Specifies if this is the first time to run SymSGD /// public static void LearnAll(InputDataManager inputDataManager, bool tuneLR, - ref float lr, float l2Const, float piw, float[] weightVector, ref float bias, int numFeatres, int numPasses, + ref float lr, float l2Const, float piw, Span weightVector, ref float bias, int numFeatres, int numPasses, int numThreads, bool tuneNumLocIter, ref int numLocIter, float tolerance, bool needShuffle, bool shouldInitialize, GCHandle stateGCHandle) { inputDataManager.PrepareCursoring(); @@ -838,7 +840,7 @@ public static void LearnAll(InputDataManager inputDataManager, bool tuneLR, /// /// The weight vector /// - public static void MapBackWeightVector(float[] weightVector, GCHandle stateGCHandle) + public static void MapBackWeightVector(Span weightVector, GCHandle stateGCHandle) { fixed (float* pweightVector = &weightVector[0]) MapBackWeightVector(pweightVector, (State*)stateGCHandle.AddrOfPinnedObject()); diff --git 
a/src/Microsoft.ML.ImageAnalytics/ImagePixelExtractorTransform.cs b/src/Microsoft.ML.ImageAnalytics/ImagePixelExtractorTransform.cs index 2bb9b912ae..b5419fa981 100644 --- a/src/Microsoft.ML.ImageAnalytics/ImagePixelExtractorTransform.cs +++ b/src/Microsoft.ML.ImageAnalytics/ImagePixelExtractorTransform.cs @@ -6,6 +6,7 @@ using System.Collections.Generic; using System.Drawing; using System.Linq; +using System.Runtime.InteropServices; using System.Text; using Microsoft.ML.Core.Data; using Microsoft.ML.Runtime; @@ -439,6 +440,7 @@ protected override Delegate MakeGetter(IRow input, int iinfo, out Action dispose //REVIEW Rewrite it to where TValue : IConvertible private ValueGetter> GetGetterCore(IRow input, int iinfo, out Action disposer) + where TValue : struct { var type = _types[iinfo]; var dims = type.Dimensions; @@ -476,26 +478,26 @@ private ValueGetter> GetGetterCore(IRow input, int iinfo if (src == null) { - dst = new VBuffer(size, 0, dst.Values, dst.Indices); + VBufferUtils.Resize(ref dst, size, 0); return; } Host.Check(src.PixelFormat == System.Drawing.Imaging.PixelFormat.Format32bppArgb); Host.Check(src.Height == height && src.Width == width); - var values = dst.Values; - if (Utils.Size(values) < size) - values = new TValue[size]; + var editor = VBufferEditor.Create(ref dst, size); + var values = editor.Values; float offset = ex.Offset; float scale = ex.Scale; Contracts.Assert(scale != 0); - var vf = values as float[]; - var vb = values as byte[]; - Contracts.Assert(vf != null || vb != null); + // REVIEW: split the getter into 2 specialized getters, one for float case and one for byte case. + Span vf = typeof(TValue) == typeof(float) ? MemoryMarshal.Cast(editor.Values) : default; + Span vb = typeof(TValue) == typeof(byte) ? MemoryMarshal.Cast(editor.Values) : default; + Contracts.Assert(!vf.IsEmpty || !vb.IsEmpty); bool needScale = offset != 0 || scale != 1; - Contracts.Assert(!needScale || vf != null); + Contracts.Assert(!needScale || !vf.IsEmpty); bool a = ex.Alpha; bool r = ex.Red; @@ -512,7 +514,7 @@ private ValueGetter> GetGetterCore(IRow input, int iinfo for (int y = 0; y < h; ++y) { var pb = src.GetPixel(x, y); - if (vb != null) + if (!vb.IsEmpty) { if (a) { vb[idst++] = pb.A; } if (r) { vb[idst++] = pb.R; } @@ -543,7 +545,7 @@ private ValueGetter> GetGetterCore(IRow input, int iinfo { // The image only has rgb but we need to supply alpha as well, so fake it up, // assuming that it is 0xFF. - if (vf != null) + if (!vf.IsEmpty) { Single v = (0xFF - offset) * scale; for (int i = 0; i < cpix; i++) @@ -566,7 +568,7 @@ private ValueGetter> GetGetterCore(IRow input, int iinfo int idstBase = idstMin + y * w; // Note that the bytes are in order BGR[A]. We arrange the layers in order ARGB. 
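                    // Illustrative sketch, not part of this patch: with the untyped Values array gone, the
                    // generic getter above reinterprets the editor's Span<TValue> as the concrete element
                    // type via MemoryMarshal.Cast (System.Runtime.InteropServices) instead of the old
                    // "values as float[]/byte[]" casts. A standalone sketch of the same trick, assuming
                    // TValue is constrained to struct as in the getter above; "FillBytes" is hypothetical.
                    private static void FillBytes<TValue>(Span<TValue> values, byte b)
                        where TValue : struct
                    {
                        Span<byte> asBytes = typeof(TValue) == typeof(byte)
                            ? MemoryMarshal.Cast<TValue, byte>(values)
                            : default;
                        if (!asBytes.IsEmpty)
                        {
                            for (int i = 0; i < asBytes.Length; i++)
                                asBytes[i] = b;
                        }
                    }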
- if (vb != null) + if (!vb.IsEmpty) { for (int x = 0; x < w; x++, idstBase++) { @@ -605,7 +607,7 @@ private ValueGetter> GetGetterCore(IRow input, int iinfo } } - dst = new VBuffer(size, values, dst.Indices); + dst = editor.Commit(); }; } diff --git a/src/Microsoft.ML.KMeansClustering/KMeansPlusPlusTrainer.cs b/src/Microsoft.ML.KMeansClustering/KMeansPlusPlusTrainer.cs index 6562a1b6b6..2682e50301 100644 --- a/src/Microsoft.ML.KMeansClustering/KMeansPlusPlusTrainer.cs +++ b/src/Microsoft.ML.KMeansClustering/KMeansPlusPlusTrainer.cs @@ -396,7 +396,7 @@ public static void Initialize( "Not enough distinct instances to populate {0} clusters (only found {1} distinct instances)", k, i); } - candidate.CopyTo(centroids[i].Values); + candidate.CopyToDense(ref centroids[i]); centroidL2s[i] = cachedCandidateL2 ?? VectorUtils.NormSquared(candidate); } } @@ -655,7 +655,7 @@ private static void FindBestCluster(in VBuffer point, int pointRowIndex, if (pointRowIndex != -1) // if the space was available for cur in initializationState. { // pointNorm is necessary for using triangle inequality. - float pointNorm = VectorUtils.NormSquared(point); + float pointNorm = VectorUtils.NormSquared(in point); // We have cached distance information for this point. bestCluster = initializationState.GetBestCluster(pointRowIndex); float bestWeight = initializationState.GetBestWeight(pointRowIndex); @@ -788,6 +788,7 @@ public static void Initialize(IHost host, int numThreads, IChannel ch, FeatureFl // The final chosen points, to be approximately clustered to determine starting // centroids. VBuffer[] clusters = new VBuffer[totalSamples]; + // L2s, kept for distance trick. float[] clustersL2s = new float[totalSamples]; @@ -1318,7 +1319,7 @@ public static void Train(IHost host, int numThreads, IChannel ch, FeatureFloatVe float[] centroidL2s = new float[k]; for (int i = 0; i < k; i++) - centroidL2s[i] = VectorUtils.NormSquared(centroids[i]); + centroidL2s[i] = VectorUtils.NormSquared(in centroids[i]); using (var pch = host.StartProgressChannel("KMeansTrain")) { @@ -1388,8 +1389,10 @@ public static void Train(IHost host, int numThreads, IChannel ch, FeatureFloatVe for (int i = 0; i < k; i++) { + var reducedStateCacheValues = reducedState.CachedSumDebug[i].GetValues(); + var cachedSumCopyValues = cachedSumCopy[i].GetValues(); for (int j = 0; j < dimensionality; j++) - Contracts.Assert(AlmostEq(reducedState.CachedSumDebug[i].Values[j], cachedSumCopy[i].Values[j])); + Contracts.Assert(AlmostEq(reducedStateCacheValues[j], cachedSumCopyValues[j])); } } #endif diff --git a/src/Microsoft.ML.KMeansClustering/KMeansPredictor.cs b/src/Microsoft.ML.KMeansClustering/KMeansPredictor.cs index 2fb4d29b59..38b5116da4 100644 --- a/src/Microsoft.ML.KMeansClustering/KMeansPredictor.cs +++ b/src/Microsoft.ML.KMeansClustering/KMeansPredictor.cs @@ -148,21 +148,19 @@ public ValueMapper GetMapper() { if (src.Length != _dimensionality) throw Host.Except($"Incorrect number of features: expected {_dimensionality}, got {src.Length}"); - var values = dst.Values; - if (Utils.Size(values) < _k) - values = new Float[_k]; - Map(in src, values); - dst = new VBuffer(_k, values, dst.Indices); + var editor = VBufferEditor.Create(ref dst, _k); + Map(in src, editor.Values); + dst = editor.Commit(); }; return (ValueMapper)(Delegate)del; } - private void Map(in VBuffer src, Float[] distances) + private void Map(in VBuffer src, Span distances) { - Host.Assert(Utils.Size(distances) >= _k); + Host.Assert(distances.Length >= _k); - Float instanceL2 = 
VectorUtils.NormSquared(src); + Float instanceL2 = VectorUtils.NormSquared(in src); for (int i = 0; i < _k; i++) { Float distance = Math.Max(0, diff --git a/src/Microsoft.ML.Legacy/Models/ConfusionMatrix.cs b/src/Microsoft.ML.Legacy/Models/ConfusionMatrix.cs index d8bb404d49..04c978ea44 100644 --- a/src/Microsoft.ML.Legacy/Models/ConfusionMatrix.cs +++ b/src/Microsoft.ML.Legacy/Models/ConfusionMatrix.cs @@ -75,9 +75,10 @@ internal static List Create(IHostEnvironment env, IDataView con elements = new double[type.VectorSize, type.VectorSize]; countGetter(ref countValues); - for (int i = 0; i < countValues.Length; i++) + ReadOnlySpan values = countValues.GetValues(); + for (int i = 0; i < values.Length; i++) { - elements[valuesRowIndex, i] = countValues.Values[i]; + elements[valuesRowIndex, i] = values[i]; } valuesRowIndex++; diff --git a/src/Microsoft.ML.Legacy/Runtime/EntryPoints/FeatureCombiner.cs b/src/Microsoft.ML.Legacy/Runtime/EntryPoints/FeatureCombiner.cs index f560671fae..50ad0da77a 100644 --- a/src/Microsoft.ML.Legacy/Runtime/EntryPoints/FeatureCombiner.cs +++ b/src/Microsoft.ML.Legacy/Runtime/EntryPoints/FeatureCombiner.cs @@ -129,10 +129,11 @@ private static string GetTerms(IDataView data, string colName) return null; var sb = new StringBuilder(); var pre = ""; - for (int i = 0; i < metadata.Length; i++) + var metadataValues = metadata.GetValues(); + for (int i = 0; i < metadataValues.Length; i++) { sb.Append(pre); - sb.AppendMemory(metadata.Values[i]); + sb.AppendMemory(metadataValues[i]); pre = ","; } return sb.ToString(); diff --git a/src/Microsoft.ML.OnnxTransform/OnnxTransform.cs b/src/Microsoft.ML.OnnxTransform/OnnxTransform.cs index 834889b946..5496e67994 100644 --- a/src/Microsoft.ML.OnnxTransform/OnnxTransform.cs +++ b/src/Microsoft.ML.OnnxTransform/OnnxTransform.cs @@ -282,12 +282,9 @@ private Delegate MakeGetter(IRow input) var outputTensors = _parent.Model.Run(inputTensors); Contracts.Assert(outputTensors.Count() > 0); - var values = dst.Values; - if (Utils.Size(values) < _outputColType.VectorSize) - values = new T[_outputColType.VectorSize]; - - OnnxUtils.CopyTo(outputTensors[0], values); - dst = new VBuffer(values.Length, values, dst.Indices); + var editor = VBufferEditor.Create(ref dst, _outputColType.VectorSize); + OnnxUtils.CopyTo(outputTensors[0], editor.Values); + dst = editor.Commit(); }; return valueGetter; diff --git a/src/Microsoft.ML.OnnxTransform/OnnxUtils.cs b/src/Microsoft.ML.OnnxTransform/OnnxUtils.cs index 05abdfb40a..92a4188966 100644 --- a/src/Microsoft.ML.OnnxTransform/OnnxUtils.cs +++ b/src/Microsoft.ML.OnnxTransform/OnnxUtils.cs @@ -99,19 +99,20 @@ private class TensorValueGetterVec : ITensorValueGetter private readonly ValueGetter> _srcgetter; private readonly OnnxShape _tensorShape; private VBuffer _vBuffer; - private VBuffer _vBufferDense; + private T[] _denseData; public TensorValueGetterVec(IRow input, int colIndex, OnnxShape tensorShape) { _srcgetter = input.GetGetter>(colIndex); _tensorShape = tensorShape; _vBuffer = default; - _vBufferDense = default; + _denseData = default; } public Tensor GetTensor() { _srcgetter(ref _vBuffer); - _vBuffer.CopyToDense(ref _vBufferDense); - return OnnxUtils.CreateTensor(_vBufferDense.Values, _tensorShape); + Utils.EnsureSize(ref _denseData, _vBuffer.Length, keepOld: false); + _vBuffer.CopyTo(_denseData); + return OnnxUtils.CreateTensor(_denseData, _tensorShape); } } } @@ -338,12 +339,18 @@ public static Tensor CreateTensor(T[] data, OnnxShape shape) /// Also Tensor.CopyTo(List<T> dst) requires a list 
input, whereas ML.NET /// provides array buffers to copy values to. This mismatch causes an extra copy. /// - public static void CopyTo(Tensor tensor, T[] dst) + public static unsafe void CopyTo(Tensor tensor, Span dst) { if (typeof(T) == typeof(System.Single)) { - var typedDst = (System.Single[])(object)dst; - tensor.CopyTo(typedDst); + DataType dataType = tensor.GetDataType(); + if (dataType != DataType.Type_Float) + { + throw new InvalidOperationException(string.Format("Cannot copy source tensor {0} to managed type System.Single (DataType.Type_Float).", dataType)); + } + + Span tensorSpan = new Span(tensor.UnsafeGetData().ToPointer(), tensor.GetSize()); + tensorSpan.CopyTo(dst); // TODO: the CopyTo() function is susceptible to GC reclaiming tensor // during the method call. Use KeepAlive for now, and remove // after permanent fix in CopyTo(). diff --git a/src/Microsoft.ML.StandardLearners/Optimizer/DifferentiableFunction.cs b/src/Microsoft.ML.StandardLearners/Optimizer/DifferentiableFunction.cs index ee928e5cda..3c8ba56724 100644 --- a/src/Microsoft.ML.StandardLearners/Optimizer/DifferentiableFunction.cs +++ b/src/Microsoft.ML.StandardLearners/Optimizer/DifferentiableFunction.cs @@ -247,7 +247,7 @@ public static Float Test(DifferentiableFunction f, in VBuffer x, bool qui /// /// /// - public static void TestAllCoords(DifferentiableFunction f, ref VBuffer x) + public static void TestAllCoords(DifferentiableFunction f, in VBuffer x) { // REVIEW: Delete this method? VBuffer grad = default(VBuffer); @@ -286,7 +286,7 @@ public static void TestAllCoords(DifferentiableFunction f, ref VBuffer x) /// Function to test /// Point at which to test /// List of coordinates to test - public static void TestCoords(DifferentiableFunction f, ref VBuffer x, IList coords) + public static void TestCoords(DifferentiableFunction f, in VBuffer x, IList coords) { // REVIEW: Delete this method? 
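// Editor's note (not part of this patch): TestAllCoords and TestCoords now take the point as
// "in VBuffer<Float> x" instead of "ref", making explicit that these testers only read x.
// A call site only needs to swap the modifier, e.g. (hypothetical point and coordinate list):
//     GradientTester.TestCoords(f, in point, coordsToCheck);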
VBuffer grad = default(VBuffer); diff --git a/src/Microsoft.ML.StandardLearners/Optimizer/LineSearch.cs b/src/Microsoft.ML.StandardLearners/Optimizer/LineSearch.cs index fb8e2a6520..6b905a8ef2 100644 --- a/src/Microsoft.ML.StandardLearners/Optimizer/LineSearch.cs +++ b/src/Microsoft.ML.StandardLearners/Optimizer/LineSearch.cs @@ -530,7 +530,7 @@ public static void Main(string[] argv) GDOptimizer gdo = new GDOptimizer(term, null, true); print = true; CreateWrapped(out init, 0, 0); - gdo.Minimize(QuadTest2D, ref init, ref ans); + gdo.Minimize(QuadTest2D, in init, ref ans); QuadTest2D(in ans, ref grad); Console.WriteLine(VectorUtils.Norm(grad)); } diff --git a/src/Microsoft.ML.StandardLearners/Optimizer/OptimizationMonitor.cs b/src/Microsoft.ML.StandardLearners/Optimizer/OptimizationMonitor.cs index 7b231bb027..705c9f8477 100644 --- a/src/Microsoft.ML.StandardLearners/Optimizer/OptimizationMonitor.cs +++ b/src/Microsoft.ML.StandardLearners/Optimizer/OptimizationMonitor.cs @@ -85,7 +85,7 @@ private Float Check(Optimizer.OptimizerState state) { Console.Error.Write(_checkingMessage); Console.Error.Flush(); - var x = state.X; + VBuffer x = state.X; var lastDir = state.LastDir; Float checkResult = GradientTester.Test(state.Function, in x, ref lastDir, true, ref _newGrad, ref _newX); for (int i = 0; i < _checkingMessage.Length; i++) diff --git a/src/Microsoft.ML.StandardLearners/Optimizer/Optimizer.cs b/src/Microsoft.ML.StandardLearners/Optimizer/Optimizer.cs index 4ec56d0eaa..914924d762 100644 --- a/src/Microsoft.ML.StandardLearners/Optimizer/Optimizer.cs +++ b/src/Microsoft.ML.StandardLearners/Optimizer/Optimizer.cs @@ -645,7 +645,7 @@ public void Minimize(DifferentiableFunction function, ref VBuffer initial double? improvement = null; double x; int end; - if (message != null && DoubleParser.TryParse(message.AsMemory().Span, out x, out end)) + if (message != null && DoubleParser.TryParse(message.AsSpan(), out x, out end)) improvement = x; pch.Checkpoint(state.Value, improvement, state.Iter); diff --git a/src/Microsoft.ML.StandardLearners/Optimizer/SgdOptimizer.cs b/src/Microsoft.ML.StandardLearners/Optimizer/SgdOptimizer.cs index 67fcf1c18b..2777cb077d 100644 --- a/src/Microsoft.ML.StandardLearners/Optimizer/SgdOptimizer.cs +++ b/src/Microsoft.ML.StandardLearners/Optimizer/SgdOptimizer.cs @@ -349,7 +349,7 @@ public void ChangeDir() /// Function to minimize /// Initial point /// Approximate minimum - public void Minimize(DifferentiableFunction function, ref VBuffer initial, ref VBuffer result) + public void Minimize(DifferentiableFunction function, in VBuffer initial, ref VBuffer result) { Contracts.Check(FloatUtils.IsFinite(initial.GetValues()), "The initial vector contains NaNs or infinite values."); LineFunc lineFunc = new LineFunc(function, in initial, UseCG); @@ -387,96 +387,102 @@ internal static bool ShouldTerminate(in VBuffer x, in VBuffer xpre Contracts.Assert(x.Length == xprev.Length, "Vectors must have the same dimensionality."); Contracts.Assert(FloatUtils.IsFinite(xprev.GetValues())); - if (!FloatUtils.IsFinite(x.GetValues())) + var xValues = x.GetValues(); + if (!FloatUtils.IsFinite(xValues)) return true; + var xprevValues = xprev.GetValues(); if (x.IsDense && xprev.IsDense) { - for (int i = 0; i < x.Length; i++) + for (int i = 0; i < xValues.Length; i++) { - if (x.Values[i] != xprev.Values[i]) + if (xValues[i] != xprevValues[i]) return false; } } else if (xprev.IsDense) { + var xIndices = x.GetIndices(); int j = 0; - for (int ii = 0; ii < x.Count; ii++) + for (int ii = 0; ii < 
xValues.Length; ii++) { - int i = x.Indices[ii]; + int i = xIndices[ii]; while (j < i) { - if (xprev.Values[j++] != 0) + if (xprevValues[j++] != 0) return false; } Contracts.Assert(i == j); - if (x.Values[ii] != xprev.Values[j++]) + if (xValues[ii] != xprevValues[j++]) return false; } - while (j < xprev.Length) + while (j < xprevValues.Length) { - if (xprev.Values[j++] != 0) + if (xprevValues[j++] != 0) return false; } } else if (x.IsDense) { + var xprevIndices = xprev.GetIndices(); int i = 0; - for (int jj = 0; jj < xprev.Count; jj++) + for (int jj = 0; jj < xprevValues.Length; jj++) { - int j = xprev.Indices[jj]; + int j = xprevIndices[jj]; while (i < j) { - if (x.Values[i++] != 0) + if (xValues[i++] != 0) return false; } Contracts.Assert(j == i); - if (x.Values[i++] != xprev.Values[jj]) + if (xValues[i++] != xprevValues[jj]) return false; } - while (i < x.Length) + while (i < xValues.Length) { - if (x.Values[i++] != 0) + if (xValues[i++] != 0) return false; } } else { // Both sparse. + var xIndices = x.GetIndices(); + var xprevIndices = xprev.GetIndices(); int ii = 0; int jj = 0; - while (ii < x.Count && jj < xprev.Count) + while (ii < xValues.Length && jj < xprevValues.Length) { - int i = x.Indices[ii]; - int j = xprev.Indices[jj]; + int i = xIndices[ii]; + int j = xprevIndices[jj]; if (i == j) { - if (x.Values[ii++] != xprev.Values[jj++]) + if (xValues[ii++] != xprevValues[jj++]) return false; } else if (i < j) { - if (x.Values[ii++] != 0) + if (xValues[ii++] != 0) return false; } else { - if (xprev.Values[jj++] != 0) + if (xprevValues[jj++] != 0) return false; } } - while (ii < x.Count) + while (ii < xValues.Length) { - if (x.Values[ii++] != 0) + if (xValues[ii++] != 0) return false; } - while (jj < xprev.Count) + while (jj < xprevValues.Length) { - if (xprev.Values[jj++] != 0) + if (xprevValues[jj++] != 0) return false; } } diff --git a/src/Microsoft.ML.StandardLearners/Standard/LinearPredictorUtils.cs b/src/Microsoft.ML.StandardLearners/Standard/LinearPredictorUtils.cs index 244019bcdd..69740fa7fe 100644 --- a/src/Microsoft.ML.StandardLearners/Standard/LinearPredictorUtils.cs +++ b/src/Microsoft.ML.StandardLearners/Standard/LinearPredictorUtils.cs @@ -51,7 +51,7 @@ public static void SaveAsCode(TextWriter writer, in VBuffer weights, Floa writer.Write(FloatUtils.ToRoundTripString(value)); writer.Write("*"); - if (featureNames.Count > 0) + if (featureNames.GetValues().Length > 0) writer.Write(FeatureNameAsCode(featureNames.GetItemOrDefault(idx).ToString(), idx)); else writer.Write("f_" + idx); @@ -118,7 +118,7 @@ public static string LinearModelAsIni(in VBuffer weights, Float bias, IPr var name = featureNames.GetItemOrDefault(idx); inputBuilder.AppendLine("[Input:" + numNonZeroWeights + "]"); - inputBuilder.AppendLine("Name=" + (featureNames.Count == 0 ? "Feature_" + idx : name.IsEmpty ? $"f{idx}" : name.ToString())); + inputBuilder.AppendLine("Name=" + (featureNames.GetValues().Length == 0 ? "Feature_" + idx : name.IsEmpty ? 
$"f{idx}" : name.ToString())); inputBuilder.AppendLine("Transform=linear"); inputBuilder.AppendLine("Slope=1"); inputBuilder.AppendLine("Intercept=0"); diff --git a/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/LbfgsPredictorBase.cs b/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/LbfgsPredictorBase.cs index 37d838d6b3..04f272683c 100644 --- a/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/LbfgsPredictorBase.cs +++ b/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/LbfgsPredictorBase.cs @@ -595,11 +595,15 @@ protected virtual float DifferentiableFunction(in VBuffer x, ref VBuffer< Contracts.AssertValueOrNull(progress); float scaleFactor = 1 / (float)WeightSum; - VBuffer xDense = default(VBuffer); + VBuffer xDense = default; if (x.IsDense) xDense = x; else - x.CopyToDense(ref xDense); + { + VBuffer xDenseTemp = default; + x.CopyToDense(ref xDenseTemp); + xDense = xDenseTemp; + } IProgressChannel pch = progress != null ? progress.StartProgressChannel("Gradient") : null; float loss; @@ -613,7 +617,7 @@ protected virtual float DifferentiableFunction(in VBuffer x, ref VBuffer< if (L2Weight > 0) { Contracts.Assert(xDense.IsDense); - var values = xDense.Values; + var values = xDense.GetValues(); Double r = 0; for (int i = BiasCount; i < values.Length; i++) { diff --git a/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/LogisticRegression.cs b/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/LogisticRegression.cs index 60c81b0ed1..382caf53e3 100644 --- a/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/LogisticRegression.cs +++ b/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/LogisticRegression.cs @@ -157,8 +157,9 @@ protected override float AccumulateOneGradient(in VBuffer feat, float lab VectorUtils.AddMultWithOffset(in feat, mult, ref grad, 1); // Note that 0th L-BFGS weight is for bias. // Add bias using this strange trick that has advantage of working well for dense and sparse arrays. // Due to the call to EnsureBiases, we know this region is dense. - Contracts.Assert(grad.Count >= BiasCount && (grad.IsDense || grad.Indices[BiasCount - 1] == BiasCount - 1)); - grad.Values[0] += mult; + var editor = VBufferEditor.CreateFromBuffer(ref grad); + Contracts.Assert(editor.Values.Length >= BiasCount && (grad.IsDense || editor.Indices[BiasCount - 1] == BiasCount - 1)); + editor.Values[0] += mult; return weight * datumLoss; } @@ -298,7 +299,7 @@ protected override void ComputeTrainingStatistics(IChannel ch, FloatLabelCursor. // Increment the first entry of hessian. hessian[0] += variance; - var values = cursor.Features.Values; + var values = cursor.Features.GetValues(); if (cursor.Features.IsDense) { int ioff = 1; @@ -324,8 +325,8 @@ protected override void ComputeTrainingStatistics(IChannel ch, FloatLabelCursor. 
} else { - var indices = cursor.Features.Indices; - for (int ii = 0; ii < cursor.Features.Count; ++ii) + var indices = cursor.Features.GetIndices(); + for (int ii = 0; ii < values.Length; ++ii) { int i = indices[ii]; int wi = i + 1; diff --git a/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/MulticlassLogisticRegression.cs b/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/MulticlassLogisticRegression.cs index 7d5840a776..4e4571b9c1 100644 --- a/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/MulticlassLogisticRegression.cs +++ b/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/MulticlassLogisticRegression.cs @@ -218,8 +218,9 @@ protected override float AccumulateOneGradient(in VBuffer feat, float lab float mult = weight * (modelProb - probLabel); VectorUtils.AddMultWithOffset(in feat, mult, ref grad, start); // Due to the call to EnsureBiases, we know this region is dense. - Contracts.Assert(grad.Count >= BiasCount && (grad.IsDense || grad.Indices[BiasCount - 1] == BiasCount - 1)); - grad.Values[c] += mult; + var editor = VBufferEditor.CreateFromBuffer(ref grad); + Contracts.Assert(editor.Values.Length >= BiasCount && (grad.IsDense || editor.Indices[BiasCount - 1] == BiasCount - 1)); + editor.Values[c] += mult; } Contracts.Check(FloatUtils.IsFinite(datumLoss), "Data contain bad values."); @@ -663,21 +664,22 @@ protected override void SaveCore(ModelSaveContext ctx) int count = 0; foreach (var fw in _weights) { + var fwValues = fw.GetValues(); if (fw.IsDense) { - for (int i = 0; i < fw.Length; i++) + for (int i = 0; i < fwValues.Length; i++) { - if (fw.Values[i] != 0) + if (fwValues[i] != 0) { - ctx.Writer.Write(fw.Values[i]); + ctx.Writer.Write(fwValues[i]); count++; } } } else { - ctx.Writer.WriteSinglesNoCount(fw.GetValues()); - count += fw.Count; + ctx.Writer.WriteSinglesNoCount(fwValues); + count += fwValues.Length; } } Host.Assert(count == numIndices); @@ -697,21 +699,11 @@ protected override void SaveCore(ModelSaveContext ctx) private static int NonZeroCount(in VBuffer vector) { int count = 0; - if (!vector.IsDense) + var values = vector.GetValues(); + for (int i = 0; i < values.Length; i++) { - for (int i = 0; i < vector.Count; i++) - { - if (vector.Values[i] != 0) - count++; - } - } - else - { - for (int i = 0; i < vector.Length; i++) - { - if (vector.Values[i] != 0) - count++; - } + if (values[i] != 0) + count++; } return count; } diff --git a/src/Microsoft.ML.StandardLearners/Standard/ModelStatistics.cs b/src/Microsoft.ML.StandardLearners/Standard/ModelStatistics.cs index 1eeb043c01..812e114091 100644 --- a/src/Microsoft.ML.StandardLearners/Standard/ModelStatistics.cs +++ b/src/Microsoft.ML.StandardLearners/Standard/ModelStatistics.cs @@ -106,7 +106,7 @@ internal LinearModelStatistics(IHostEnvironment env, long trainingExampleCount, internal LinearModelStatistics(IHostEnvironment env, long trainingExampleCount, int paramCount, Single deviance, Single nullDeviance, in VBuffer coeffStdError) : this(env, trainingExampleCount, paramCount, deviance, nullDeviance) { - _env.Assert(coeffStdError.Count == _paramCount); + _env.Assert(coeffStdError.GetValues().Length == _paramCount); _coeffStdError = coeffStdError; } diff --git a/src/Microsoft.ML.StandardLearners/Standard/MultiClass/MultiClassNaiveBayesTrainer.cs b/src/Microsoft.ML.StandardLearners/Standard/MultiClass/MultiClassNaiveBayesTrainer.cs index 8de4a4dd8e..756607deea 100644 --- a/src/Microsoft.ML.StandardLearners/Standard/MultiClass/MultiClassNaiveBayesTrainer.cs +++ 
b/src/Microsoft.ML.StandardLearners/Standard/MultiClass/MultiClassNaiveBayesTrainer.cs @@ -133,20 +133,22 @@ protected override MultiClassNaiveBayesPredictor TrainModelCore(TrainContext con labelHistogram[cursor.Label] += 1; labelCount = labelCount < size ? size : labelCount; + var featureValues = cursor.Features.GetValues(); if (cursor.Features.IsDense) { - for (int i = 0; i < cursor.Features.Count; i += 1) + for (int i = 0; i < featureValues.Length; i += 1) { - if (cursor.Features.Values[i] > 0) + if (featureValues[i] > 0) featureHistogram[cursor.Label][i] += 1; } } else { - for (int i = 0; i < cursor.Features.Count; i += 1) + var featureIndices = cursor.Features.GetIndices(); + for (int i = 0; i < featureValues.Length; i += 1) { - if (cursor.Features.Values[i] > 0) - featureHistogram[cursor.Label][cursor.Features.Indices[i]] += 1; + if (featureValues[i] > 0) + featureHistogram[cursor.Label][featureIndices[i]] += 1; } } @@ -374,7 +376,12 @@ private void ComputeLabelProbabilityFromFeature(double labelOccurrenceCount, int private void Map(in VBuffer src, ref VBuffer dst) { Host.Check(src.Length == _featureCount, "Invalid number of features passed."); - float[] labelScores = (dst.Length >= _labelCount) ? dst.Values : new float[_labelCount]; + + var srcValues = src.GetValues(); + var srcIndices = src.GetIndices(); + + var editor = VBufferEditor.Create(ref dst, _labelCount); + Span labelScores = editor.Values; for (int iLabel = 0; iLabel < _labelCount; iLabel += 1) { double labelOccurrenceCount = _labelHistogram[iLabel]; @@ -384,18 +391,18 @@ private void Map(in VBuffer src, ref VBuffer dst) { if (src.IsDense) { - for (int iFeature = 0; iFeature < src.Count; iFeature += 1) + for (int iFeature = 0; iFeature < srcValues.Length; iFeature += 1) { ComputeLabelProbabilityFromFeature(labelOccurrenceCount, iLabel, iFeature, - src.Values[iFeature], ref logProb, ref absentFeatureLogProb); + srcValues[iFeature], ref logProb, ref absentFeatureLogProb); } } else { - for (int iFeature = 0; iFeature < src.Count; iFeature += 1) + for (int iFeature = 0; iFeature < srcValues.Length; iFeature += 1) { - ComputeLabelProbabilityFromFeature(labelOccurrenceCount, iLabel, src.Indices[iFeature], - src.Values[iFeature], ref logProb, ref absentFeatureLogProb); + ComputeLabelProbabilityFromFeature(labelOccurrenceCount, iLabel, srcIndices[iFeature], + srcValues[iFeature], ref logProb, ref absentFeatureLogProb); } } } @@ -404,7 +411,7 @@ private void Map(in VBuffer src, ref VBuffer dst) (float)(logProb + (_absentFeaturesLogProb[iLabel] - absentFeatureLogProb)); } - dst = new VBuffer(_labelCount, labelScores, dst.Indices); + dst = editor.Commit(); } } } diff --git a/src/Microsoft.ML.StandardLearners/Standard/Online/LinearSvm.cs b/src/Microsoft.ML.StandardLearners/Standard/Online/LinearSvm.cs index 716bd1c4fc..09700c8184 100644 --- a/src/Microsoft.ML.StandardLearners/Standard/Online/LinearSvm.cs +++ b/src/Microsoft.ML.StandardLearners/Standard/Online/LinearSvm.cs @@ -119,7 +119,7 @@ private void BeginBatch() _batch++; _numBatchExamples = 0; _biasUpdate = 0; - _weightsUpdate = new VBuffer(_weightsUpdate.Length, 0, _weightsUpdate.Values, _weightsUpdate.Indices); + VBufferUtils.Resize(ref _weightsUpdate, _weightsUpdate.Length, 0); } private void FinishBatch(in VBuffer weightsUpdate, Float weightsUpdateScale) @@ -147,7 +147,7 @@ public override void ProcessDataInstance(IChannel ch, in VBuffer feat, Fl Float currentBiasUpdate = trueOutput * weight; _biasUpdate += currentBiasUpdate; // Only aggregate in the case where we're 
handling multiple instances. - if (_weightsUpdate.Count == 0) + if (_weightsUpdate.GetValues().Length == 0) { VectorUtils.ScaleInto(in feat, currentBiasUpdate, ref _weightsUpdate); _weightsUpdateScale = 1; @@ -160,7 +160,7 @@ public override void ProcessDataInstance(IChannel ch, in VBuffer feat, Fl { if (_batchSize == 1 && loss < 0) { - Contracts.Assert(_weightsUpdate.Count == 0); + Contracts.Assert(_weightsUpdate.GetValues().Length == 0); // If we aren't aggregating multiple instances, just use the instance's // vector directly. Float currentBiasUpdate = trueOutput * weight; diff --git a/src/Microsoft.ML.StandardLearners/Standard/PoissonRegression/PoissonRegression.cs b/src/Microsoft.ML.StandardLearners/Standard/PoissonRegression/PoissonRegression.cs index 606b09b341..cfa56e1e6f 100644 --- a/src/Microsoft.ML.StandardLearners/Standard/PoissonRegression/PoissonRegression.cs +++ b/src/Microsoft.ML.StandardLearners/Standard/PoissonRegression/PoissonRegression.cs @@ -138,8 +138,9 @@ protected override float AccumulateOneGradient(in VBuffer feat, float lab float mult = -(y - lambda) * weight; VectorUtils.AddMultWithOffset(in feat, mult, ref grad, 1); // Due to the call to EnsureBiases, we know this region is dense. - Contracts.Assert(grad.Count >= BiasCount && (grad.IsDense || grad.Indices[BiasCount - 1] == BiasCount - 1)); - grad.Values[0] += mult; + var editor = VBufferEditor.CreateFromBuffer(ref grad); + Contracts.Assert(editor.Values.Length >= BiasCount && (grad.IsDense || editor.Indices[BiasCount - 1] == BiasCount - 1)); + editor.Values[0] += mult; // From the computer's perspective exp(infinity)==infinity // so inf-inf=nan, but in reality, infinity is just a large // number we can't represent, and exp(X)-X for X=inf is just inf. diff --git a/src/Microsoft.ML.StandardLearners/Standard/SdcaBinary.cs b/src/Microsoft.ML.StandardLearners/Standard/SdcaBinary.cs index 27e4450f7f..b7b8c32804 100644 --- a/src/Microsoft.ML.StandardLearners/Standard/SdcaBinary.cs +++ b/src/Microsoft.ML.StandardLearners/Standard/SdcaBinary.cs @@ -772,7 +772,7 @@ protected virtual void TrainWithoutLock(IProgressChannelProvider progress, Float while (cursor.MoveNext()) { long idx = getIndexFromId(cursor.Id); - var features = cursor.Features; + VBuffer features = cursor.Features; var label = cursor.Label; float invariant; if (invariants != null) @@ -829,10 +829,11 @@ protected virtual void TrainWithoutLock(IProgressChannelProvider progress, Float : 0; } + var featureValues = features.GetValues(); if (features.IsDense) - CpuMathUtils.SdcaL1UpdateDense(primalUpdate, features.Count, features.Values, l1Threshold, l1IntermediateWeights[0].Values, weights[0].Values); - else if (features.Count > 0) - CpuMathUtils.SdcaL1UpdateSparse(primalUpdate, features.Count, features.Values, features.Indices, l1Threshold, l1IntermediateWeights[0].Values, weights[0].Values); + CpuMathUtils.SdcaL1UpdateDense(primalUpdate, featureValues.Length, featureValues, l1Threshold, l1IntermediateWeights[0].Values, weights[0].Values); + else if (featureValues.Length > 0) + CpuMathUtils.SdcaL1UpdateSparse(primalUpdate, featureValues.Length, featureValues, features.GetIndices(), l1Threshold, l1IntermediateWeights[0].Values, weights[0].Values); } break; @@ -919,6 +920,7 @@ protected virtual bool CheckConvergence( var lossSum = new CompensatedSum(); var dualLossSum = new CompensatedSum(); var biasTotal = biasReg[0] + biasUnreg[0]; + VBuffer firstWeights = weights[0]; using (var cursor = cursorFactory.Create()) { @@ -955,7 +957,7 @@ protected virtual bool 
CheckConvergence( var dualityGap = metrics[(int)MetricKind.DualityGap] = newLoss - newDualLoss; metrics[(int)MetricKind.BiasUnreg] = biasUnreg[0]; metrics[(int)MetricKind.BiasReg] = biasReg[0]; - metrics[(int)MetricKind.L1Sparsity] = Args.L1Threshold == 0 ? 1 : (Double)weights[0].Values.Count(w => w != 0) / weights.Length; + metrics[(int)MetricKind.L1Sparsity] = Args.L1Threshold == 0 ? 1 : (Double)firstWeights.GetValues().Count(w => w != 0) / weights.Length; bool converged = dualityGap / newLoss < Args.ConvergenceTolerance; @@ -964,7 +966,7 @@ protected virtual bool CheckConvergence( // Maintain a copy of weights and bias with best primal loss thus far. // This is some extra work and uses extra memory, but it seems worth doing it. // REVIEW: Sparsify bestWeights? - weights[0].CopyTo(ref bestWeights[0]); + firstWeights.CopyTo(ref bestWeights[0]); bestBiasReg[0] = biasReg[0]; bestBiasUnreg[0] = biasUnreg[0]; bestPrimalLoss = metrics[(int)MetricKind.Loss]; diff --git a/src/Microsoft.ML.StandardLearners/Standard/SdcaMultiClass.cs b/src/Microsoft.ML.StandardLearners/Standard/SdcaMultiClass.cs index 48a88d6009..b5158bcce6 100644 --- a/src/Microsoft.ML.StandardLearners/Standard/SdcaMultiClass.cs +++ b/src/Microsoft.ML.StandardLearners/Standard/SdcaMultiClass.cs @@ -167,7 +167,7 @@ protected override void TrainWithoutLock(IProgressChannelProvider progress, Floa } else { - normSquared = VectorUtils.NormSquared(features); + normSquared = VectorUtils.NormSquared(in features); if (Args.BiasLearningRate == 0) normSquared += 1; @@ -240,10 +240,11 @@ protected override void TrainWithoutLock(IProgressChannelProvider progress, Floa : 0; } + var featureValues = features.GetValues(); if (features.IsDense) - CpuMathUtils.SdcaL1UpdateDense(-primalUpdate, features.Count, features.Values, l1Threshold, l1IntermediateWeights[iClass].Values, weights[iClass].Values); - else if (features.Count > 0) - CpuMathUtils.SdcaL1UpdateSparse(-primalUpdate, features.Count, features.Values, features.Indices, l1Threshold, l1IntermediateWeights[iClass].Values, weights[iClass].Values); + CpuMathUtils.SdcaL1UpdateDense(-primalUpdate, featureValues.Length, featureValues, l1Threshold, l1IntermediateWeights[iClass].Values, weights[iClass].Values); + else if (featureValues.Length > 0) + CpuMathUtils.SdcaL1UpdateSparse(-primalUpdate, featureValues.Length, featureValues, features.GetIndices(), l1Threshold, l1IntermediateWeights[iClass].Values, weights[iClass].Values); } break; @@ -267,10 +268,11 @@ protected override void TrainWithoutLock(IProgressChannelProvider progress, Floa ? 
intermediateBias - Math.Sign(intermediateBias) * l1Threshold : 0; + var featureValues = features.GetValues(); if (features.IsDense) - CpuMathUtils.SdcaL1UpdateDense(labelPrimalUpdate, features.Count, features.Values, l1Threshold, l1IntermediateWeights[label].Values, weights[label].Values); - else if (features.Count > 0) - CpuMathUtils.SdcaL1UpdateSparse(labelPrimalUpdate, features.Count, features.Values, features.Indices, l1Threshold, l1IntermediateWeights[label].Values, weights[label].Values); + CpuMathUtils.SdcaL1UpdateDense(labelPrimalUpdate, featureValues.Length, featureValues, l1Threshold, l1IntermediateWeights[label].Values, weights[label].Values); + else if (featureValues.Length > 0) + CpuMathUtils.SdcaL1UpdateSparse(labelPrimalUpdate, featureValues.Length, featureValues, features.GetIndices(), l1Threshold, l1IntermediateWeights[label].Values, weights[label].Values); } rowCount++; diff --git a/src/Microsoft.ML.TensorFlow/TensorFlow/TensorflowUtils.cs b/src/Microsoft.ML.TensorFlow/TensorFlow/TensorflowUtils.cs index 8ac5e532a3..dbf080d9a1 100644 --- a/src/Microsoft.ML.TensorFlow/TensorFlow/TensorflowUtils.cs +++ b/src/Microsoft.ML.TensorFlow/TensorFlow/TensorflowUtils.cs @@ -115,8 +115,12 @@ public static ISchema GetModelSchema(IExceptionContext ectx, string modelFile) Contracts.Assert(metadataType.IsKnownSizeVector && metadataType.ItemType.IsText); schema.GetMetadata(TensorFlowUtils.InputOps, i, ref inputOps); } - yield return (name, opType.ToString(), type, - Utils.Size(inputOps.Values) > 0 ? inputOps.Values.Select(input => input.ToString()).ToArray() : new string[0]); + + string[] inputOpsResult = inputOps.DenseValues() + .Select(input => input.ToString()) + .ToArray(); + + yield return (name, opType.ToString(), type, inputOpsResult); } } @@ -328,16 +332,10 @@ internal static TFSession GetSession(IHostEnvironment env, string modelPath) return LoadTFSession(env, bytes, modelPath); } - internal static unsafe void FetchData(IntPtr data, T[] result) + internal static unsafe void FetchData(IntPtr data, Span result) { - var size = result.Length; - - GCHandle handle = GCHandle.Alloc(result, GCHandleType.Pinned); - IntPtr target = handle.AddrOfPinnedObject(); - - Int64 sizeInBytes = size * Marshal.SizeOf((typeof(T))); - Buffer.MemoryCopy(data.ToPointer(), target.ToPointer(), sizeInBytes, sizeInBytes); - handle.Free(); + var dataSpan = new Span(data.ToPointer(), result.Length); + dataSpan.CopyTo(result); } internal static bool IsTypeSupported(TFDataType tfoutput) diff --git a/src/Microsoft.ML.TensorFlow/TensorflowTransform.cs b/src/Microsoft.ML.TensorFlow/TensorflowTransform.cs index 8c0a079dd8..39157b0eba 100644 --- a/src/Microsoft.ML.TensorFlow/TensorflowTransform.cs +++ b/src/Microsoft.ML.TensorFlow/TensorflowTransform.cs @@ -927,12 +927,9 @@ private Delegate MakeGetter(IRow input, int iinfo, ITensorValueGetter[] srcTe var tensor = outputCache.Outputs[_parent.Outputs[iinfo]]; var tensorSize = tensor.Shape.Where(x => x > 0).Aggregate((x, y) => x * y); - var values = dst.Values; - if (Utils.Size(values) < tensorSize) - values = new T[tensorSize]; - - TensorFlowUtils.FetchData(tensor.Data, values); - dst = new VBuffer(values.Length, values, dst.Indices); + var editor = VBufferEditor.Create(ref dst, (int)tensorSize); + TensorFlowUtils.FetchData(tensor.Data, editor.Values); + dst = editor.Commit(); }; return valuegetter; } @@ -1058,7 +1055,7 @@ private class TensorValueGetterVec : ITensorValueGetter private readonly ValueGetter> _srcgetter; private readonly TFShape _tfShape; private VBuffer 
_vBuffer; - private VBuffer _vBufferDense; + private T[] _denseData; private readonly T[] _bufferedData; private int _position; @@ -1067,7 +1064,7 @@ public TensorValueGetterVec(IRow input, int colIndex, TFShape tfShape) _srcgetter = input.GetGetter>(colIndex); _tfShape = tfShape; _vBuffer = default; - _vBufferDense = default; + _denseData = default; long size = 0; _position = 0; @@ -1083,8 +1080,11 @@ public TensorValueGetterVec(IRow input, int colIndex, TFShape tfShape) public TFTensor GetTensor() { _srcgetter(ref _vBuffer); - _vBuffer.CopyToDense(ref _vBufferDense); - return TFTensor.Create(_vBufferDense.Values, _vBufferDense.Length, _tfShape); + + Utils.EnsureSize(ref _denseData, _vBuffer.Length, keepOld: false); + _vBuffer.CopyTo(_denseData); + + return TFTensor.Create(_denseData, _vBuffer.Length, _tfShape); } public void BufferTrainingData() diff --git a/src/Microsoft.ML.Transforms/GcnTransform.cs b/src/Microsoft.ML.Transforms/GcnTransform.cs index 9c09bf940f..62db703c06 100644 --- a/src/Microsoft.ML.Transforms/GcnTransform.cs +++ b/src/Microsoft.ML.Transforms/GcnTransform.cs @@ -483,8 +483,9 @@ protected override Delegate MakeGetter(IRow input, int iinfo, out Action dispose (ref VBuffer dst) => { getSrc(ref src); - var mean = Mean(src.Values, src.Count, src.Length); - var divisor = StdDev(src.Values, src.Count, src.Length, mean); + var srcValues = src.GetValues(); + var mean = Mean(srcValues, src.Length); + var divisor = StdDev(srcValues, src.Length, mean); FillValues(Host, in src, ref dst, divisor, scale, mean); }; return del; @@ -493,8 +494,9 @@ protected override Delegate MakeGetter(IRow input, int iinfo, out Action dispose (ref VBuffer dst) => { getSrc(ref src); - var mean = Mean(src.Values, src.Count, src.Length); - var divisor = L2Norm(src.Values, src.Count, mean); + var srcValues = src.GetValues(); + var mean = Mean(srcValues, src.Length); + var divisor = L2Norm(srcValues, mean); FillValues(Host, in src, ref dst, divisor, scale, mean); }; return del; @@ -503,8 +505,9 @@ protected override Delegate MakeGetter(IRow input, int iinfo, out Action dispose (ref VBuffer dst) => { getSrc(ref src); - var mean = Mean(src.Values, src.Count, src.Length); - var divisor = L1Norm(src.Values, src.Count, mean); + var srcValues = src.GetValues(); + var mean = Mean(srcValues, src.Length); + var divisor = L1Norm(srcValues, mean); FillValues(Host, in src, ref dst, divisor, scale, mean); }; return del; @@ -513,8 +516,9 @@ protected override Delegate MakeGetter(IRow input, int iinfo, out Action dispose (ref VBuffer dst) => { getSrc(ref src); - var mean = Mean(src.Values, src.Count, src.Length); - var divisor = LInfNorm(src.Values, src.Count, mean); + var srcValues = src.GetValues(); + var mean = Mean(srcValues, src.Length); + var divisor = LInfNorm(srcValues, mean); FillValues(Host, in src, ref dst, divisor, scale, mean); }; return del; @@ -531,7 +535,7 @@ protected override Delegate MakeGetter(IRow input, int iinfo, out Action dispose (ref VBuffer dst) => { getSrc(ref src); - var divisor = StdDev(src.Values, src.Count, src.Length); + var divisor = StdDev(src.GetValues(), src.Length); FillValues(Host, in src, ref dst, divisor, scale); }; return del; @@ -540,7 +544,7 @@ protected override Delegate MakeGetter(IRow input, int iinfo, out Action dispose (ref VBuffer dst) => { getSrc(ref src); - var divisor = L2Norm(src.Values, src.Count); + var divisor = L2Norm(src.GetValues()); FillValues(Host, in src, ref dst, divisor, scale); }; return del; @@ -549,7 +553,7 @@ protected override Delegate 
MakeGetter(IRow input, int iinfo, out Action dispose (ref VBuffer dst) => { getSrc(ref src); - var divisor = L1Norm(src.Values, src.Count); + var divisor = L1Norm(src.GetValues()); FillValues(Host, in src, ref dst, divisor, scale); }; return del; @@ -558,7 +562,7 @@ protected override Delegate MakeGetter(IRow input, int iinfo, out Action dispose (ref VBuffer dst) => { getSrc(ref src); - var divisor = LInfNorm(src.Values, src.Count); + var divisor = LInfNorm(src.GetValues()); FillValues(Host, in src, ref dst, divisor, scale); }; return del; @@ -570,14 +574,14 @@ protected override Delegate MakeGetter(IRow input, int iinfo, out Action dispose private static void FillValues(IExceptionContext ectx, in VBuffer src, ref VBuffer dst, float divisor, float scale, float offset = 0) { - int count = src.Count; + var srcValues = src.GetValues(); + int count = srcValues.Length; int length = src.Length; - ectx.Assert(Utils.Size(src.Values) >= count); ectx.Assert(divisor >= 0); if (count == 0) { - dst = new VBuffer(length, 0, dst.Values, dst.Indices); + VBufferUtils.Resize(ref dst, length, 0); return; } ectx.Assert(count > 0); @@ -591,21 +595,18 @@ private static void FillValues(IExceptionContext ectx, in VBuffer src, re if (normScale < MinScale) normScale = 1; + VBufferEditor editor; if (offset == 0) { - var dstValues = dst.Values; - if (Utils.Size(dstValues) < count) - dstValues = new float[count]; - var dstIndices = dst.Indices; + editor = VBufferEditor.Create(ref dst, length, count); + var dstValues = editor.Values; if (!src.IsDense) { - if (Utils.Size(dstIndices) < count) - dstIndices = new int[count]; - Array.Copy(src.Indices, dstIndices, count); + src.GetIndices().CopyTo(editor.Indices); } - CpuMathUtils.Scale(normScale, src.Values, dstValues, count); - dst = new VBuffer(length, count, dstValues, dstIndices); + CpuMathUtils.Scale(normScale, src.GetValues(), dstValues, count); + dst = editor.Commit(); return; } @@ -613,10 +614,11 @@ private static void FillValues(IExceptionContext ectx, in VBuffer src, re // Subtracting the mean requires a dense representation. src.CopyToDense(ref dst); + editor = VBufferEditor.CreateFromBuffer(ref dst); if (normScale != 1) - CpuMathUtils.ScaleAdd(normScale, -offset, dst.Values.AsSpan(0, length)); + CpuMathUtils.ScaleAdd(normScale, -offset, editor.Values); else - CpuMathUtils.Add(-offset, dst.Values.AsSpan(0, length)); + CpuMathUtils.Add(-offset, editor.Values); } /// @@ -624,21 +626,21 @@ private static void FillValues(IExceptionContext ectx, in VBuffer src, re /// based on centered values (i.e. after subtracting the mean). But since the centered /// values mean is approximately zero, we can use variance of non-centered values. /// - private static float StdDev(float[] values, int count, int length) + private static float StdDev(ReadOnlySpan values, int length) { - Contracts.Assert(0 <= count && count <= length); - if (count == 0) + Contracts.Assert(0 <= values.Length && values.Length <= length); + if (values.Length == 0) return 0; // We need a mean to compute variance. - var tmpMean = CpuMathUtils.Sum(values.AsSpan(0, count)) / length; + var tmpMean = CpuMathUtils.Sum(values) / length; float sumSq = 0; - if (count != length && tmpMean != 0) + if (values.Length != length && tmpMean != 0) { // Sparse representation. 
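// Editor's note (explanatory, not part of this patch): the slots a sparse vector does not store
// are implicit zeros, and each contributes (0 - mean)^2 = mean^2 to the sum of squares. With
// values.Length stored entries out of 'length' total slots, that correction is
// (length - values.Length) * mean^2, which the next two lines add before CpuMathUtils.SumSq
// accumulates the squared deviations of the stored values from the mean.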
float meanSq = tmpMean * tmpMean; - sumSq = (length - count) * meanSq; + sumSq = (length - values.Length) * meanSq; } - sumSq += CpuMathUtils.SumSq(tmpMean, values.AsSpan(0, count)); + sumSq += CpuMathUtils.SumSq(tmpMean, values); return MathUtils.Sqrt(sumSq / length); } @@ -646,19 +648,19 @@ private static float StdDev(float[] values, int count, int length) /// Compute Standard Deviation. /// We have two overloads of StdDev instead of one with mean for perf reasons. /// - private static float StdDev(float[] values, int count, int length, float mean) + private static float StdDev(ReadOnlySpan values, int length, float mean) { - Contracts.Assert(0 <= count && count <= length); - if (count == 0) + Contracts.Assert(0 <= values.Length && values.Length <= length); + if (values.Length == 0) return 0; float sumSq = 0; - if (count != length && mean != 0) + if (values.Length != length && mean != 0) { // Sparse representation. float meanSq = mean * mean; - sumSq = (length - count) * meanSq; + sumSq = (length - values.Length) * meanSq; } - sumSq += CpuMathUtils.SumSq(mean, values.AsSpan(0, count)); + sumSq += CpuMathUtils.SumSq(mean, values); return MathUtils.Sqrt(sumSq / length); } @@ -666,40 +668,40 @@ private static float StdDev(float[] values, int count, int length, float mean) /// Compute L2-norm. L2-norm computation doesn't subtract the mean from the source values. /// However, we substract the mean here in case subMean is true (if subMean is false, mean is zero). /// - private static float L2Norm(float[] values, int count, float mean = 0) + private static float L2Norm(ReadOnlySpan values, float mean = 0) { - if (count == 0) + if (values.Length == 0) return 0; - return MathUtils.Sqrt(CpuMathUtils.SumSq(mean, values.AsSpan(0, count))); + return MathUtils.Sqrt(CpuMathUtils.SumSq(mean, values)); } /// /// Compute L1-norm. L1-norm computation doesn't subtract the mean from the source values. /// However, we substract the mean here in case subMean is true (if subMean is false, mean is zero). /// - private static float L1Norm(float[] values, int count, float mean = 0) + private static float L1Norm(ReadOnlySpan values, float mean = 0) { - if (count == 0) + if (values.Length == 0) return 0; - return CpuMathUtils.SumAbs(mean, values.AsSpan(0, count)); + return CpuMathUtils.SumAbs(mean, values); } /// /// Compute LInf-norm. LInf-norm computation doesn't subtract the mean from the source values. /// However, we substract the mean here in case subMean is true (if subMean is false, mean is zero). 
/// - private static float LInfNorm(float[] values, int count, float mean = 0) + private static float LInfNorm(ReadOnlySpan values, float mean = 0) { - if (count == 0) + if (values.Length == 0) return 0; - return CpuMathUtils.MaxAbsDiff(mean, values.AsSpan(0, count)); + return CpuMathUtils.MaxAbsDiff(mean, values); } - private static float Mean(float[] src, int count, int length) + private static float Mean(ReadOnlySpan src, int length) { - if (length == 0 || count == 0) + if (length == 0 || src.Length == 0) return 0; - return CpuMathUtils.Sum(src.AsSpan(0, count)) / length; + return CpuMathUtils.Sum(src) / length; } } } diff --git a/src/Microsoft.ML.Transforms/KeyToBinaryVectorTransform.cs b/src/Microsoft.ML.Transforms/KeyToBinaryVectorTransform.cs index 7cc814b4ce..cf29a87feb 100644 --- a/src/Microsoft.ML.Transforms/KeyToBinaryVectorTransform.cs +++ b/src/Microsoft.ML.Transforms/KeyToBinaryVectorTransform.cs @@ -341,7 +341,7 @@ private void GetSlotNames(int iinfo, ref VBuffer> dst) sb.Append('.'); int len = sb.Length; - foreach (var key in bits.Values) + foreach (var key in bits.GetValues()) { sb.Length = len; sb.AppendMemory(key); diff --git a/src/Microsoft.ML.Transforms/LearnerFeatureSelection.cs b/src/Microsoft.ML.Transforms/LearnerFeatureSelection.cs index a67816f348..09a054be25 100644 --- a/src/Microsoft.ML.Transforms/LearnerFeatureSelection.cs +++ b/src/Microsoft.ML.Transforms/LearnerFeatureSelection.cs @@ -120,9 +120,10 @@ private static DropSlotsTransform.Column CreateDropSlotsColumn(Arguments args, i var col = new DropSlotsTransform.Column(); col.Source = args.FeatureColumn; selectedCount = 0; + var scoresValues = scores.GetValues(); // Degenerate case, dropping all slots. - if (scores.Count == 0) + if (scoresValues.Length == 0) { var range = new DropSlotsTransform.Range(); col.Slots = new DropSlotsTransform.Range[] { range }; @@ -139,13 +140,13 @@ private static DropSlotsTransform.Column CreateDropSlotsColumn(Arguments args, i else { Contracts.Assert(args.NumSlotsToKeep.HasValue); - threshold = ComputeThreshold(scores.Values, scores.Count, args.NumSlotsToKeep.Value, out tiedScoresToKeep); + threshold = ComputeThreshold(scoresValues, args.NumSlotsToKeep.Value, out tiedScoresToKeep); } var slots = new List(); - for (int i = 0; i < scores.Count; i++) + for (int i = 0; i < scoresValues.Length; i++) { - var score = Math.Abs(scores.Values[i]); + var score = Math.Abs(scoresValues[i]); if (score > threshold) { selectedCount++; @@ -160,9 +161,9 @@ private static DropSlotsTransform.Column CreateDropSlotsColumn(Arguments args, i var range = new DropSlotsTransform.Range(); range.Min = i; - while (++i < scores.Count) + while (++i < scoresValues.Length) { - score = Math.Abs(scores.Values[i]); + score = Math.Abs(scoresValues[i]); if (score > threshold) { selectedCount++; @@ -181,6 +182,7 @@ private static DropSlotsTransform.Column CreateDropSlotsColumn(Arguments args, i if (!scores.IsDense) { + var scoresIndices = scores.GetIndices(); int ii = 0; var count = slots.Count; for (int i = 0; i < count; i++) @@ -190,16 +192,16 @@ private static DropSlotsTransform.Column CreateDropSlotsColumn(Arguments args, i var min = range.Min; var max = range.Max.Value; Contracts.Assert(min <= max); - Contracts.Assert(max < scores.Count); + Contracts.Assert(max < scoresValues.Length); - range.Min = min == 0 ? 0 : scores.Indices[min - 1] + 1; - range.Max = max == scores.Count - 1 ? scores.Length - 1 : scores.Indices[max + 1] - 1; + range.Min = min == 0 ? 
0 : scoresIndices[min - 1] + 1; + range.Max = max == scoresIndices.Length - 1 ? scores.Length - 1 : scoresIndices[max + 1] - 1; // Add the gaps before this range. for (; ii < min; ii++) { - var gapMin = ii == 0 ? 0 : scores.Indices[ii - 1] + 1; - var gapMax = scores.Indices[ii] - 1; + var gapMin = ii == 0 ? 0 : scoresIndices[ii - 1] + 1; + var gapMax = scoresIndices[ii] - 1; if (gapMin <= gapMax) { var gap = new DropSlotsTransform.Range(); @@ -212,10 +214,10 @@ private static DropSlotsTransform.Column CreateDropSlotsColumn(Arguments args, i } // Add the gaps after the last range. - for (; ii <= scores.Count; ii++) + for (; ii <= scoresIndices.Length; ii++) { - var gapMin = ii == 0 ? 0 : scores.Indices[ii - 1] + 1; - var gapMax = ii == scores.Count ? scores.Length - 1 : scores.Indices[ii] - 1; + var gapMin = ii == 0 ? 0 : scoresIndices[ii - 1] + 1; + var gapMax = ii == scoresIndices.Length ? scores.Length - 1 : scoresIndices[ii] - 1; if (gapMin <= gapMax) { var gap = new DropSlotsTransform.Range(); @@ -240,12 +242,12 @@ private static DropSlotsTransform.Column CreateDropSlotsColumn(Arguments args, i return null; } - private static float ComputeThreshold(float[] scores, int count, int topk, out int tiedScoresToKeep) + private static float ComputeThreshold(ReadOnlySpan scores, int topk, out int tiedScoresToKeep) { // Use a min-heap for the topk elements var heap = new Heap((f1, f2) => f1 > f2, topk); - for (int i = 0; i < count; i++) + for (int i = 0; i < scores.Length; i++) { var score = Math.Abs(scores[i]); if (float.IsNaN(score)) diff --git a/src/Microsoft.ML.Transforms/Microsoft.ML.Transforms.csproj b/src/Microsoft.ML.Transforms/Microsoft.ML.Transforms.csproj index 11d96a6cfc..7ab146b21c 100644 --- a/src/Microsoft.ML.Transforms/Microsoft.ML.Transforms.csproj +++ b/src/Microsoft.ML.Transforms/Microsoft.ML.Transforms.csproj @@ -4,6 +4,7 @@ netstandard2.0 Microsoft.ML CORECLR + true diff --git a/src/Microsoft.ML.Transforms/MissingValueDroppingTransformer.cs b/src/Microsoft.ML.Transforms/MissingValueDroppingTransformer.cs index 910941e527..360ab5db82 100644 --- a/src/Microsoft.ML.Transforms/MissingValueDroppingTransformer.cs +++ b/src/Microsoft.ML.Transforms/MissingValueDroppingTransformer.cs @@ -229,110 +229,107 @@ private void DropNAsAndDefaults(ref VBuffer src, ref VBuffer d { Host.AssertValue(isNA); + var srcValues = src.GetValues(); int newCount = 0; - for (int i = 0; i < src.Count; i++) + for (int i = 0; i < srcValues.Length; i++) { - if (!isNA(in src.Values[i])) + if (!isNA(in srcValues[i])) newCount++; } - Host.Assert(newCount <= src.Count); + Host.Assert(newCount <= srcValues.Length); if (newCount == 0) { - dst = new VBuffer(0, dst.Values, dst.Indices); + VBufferUtils.Resize(ref dst, 0); return; } - if (newCount == src.Count) + if (newCount == srcValues.Length) { Utils.Swap(ref src, ref dst); if (!dst.IsDense) { - Host.Assert(dst.Count == newCount); - dst = new VBuffer(dst.Count, dst.Values, dst.Indices); + Host.Assert(dst.GetValues().Length == newCount); + VBufferUtils.Resize(ref dst, newCount); } return; } int iDst = 0; - var values = dst.Values; - if (Utils.Size(values) < newCount) - values = new TDst[newCount]; // Densifying sparse vectors since default value equals NA and hence should be dropped. 
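// Editor's note (explanatory, not part of this patch): because the default value also counts as NA
// here, the implicit zeros of a sparse input are dropped along with the explicit NAs, so the
// surviving values are packed into a dense result of length newCount; the editor below is created
// with values only (no indices) and committed as a dense buffer.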
- for (int i = 0; i < src.Count; i++) + var editor = VBufferEditor.Create(ref dst, newCount); + for (int i = 0; i < srcValues.Length; i++) { - if (!isNA(in src.Values[i])) - values[iDst++] = src.Values[i]; + if (!isNA(in srcValues[i])) + editor.Values[iDst++] = srcValues[i]; } Host.Assert(iDst == newCount); - dst = new VBuffer(newCount, values, dst.Indices); + dst = editor.Commit(); } private void DropNAs(ref VBuffer src, ref VBuffer dst, InPredicate isNA) { Host.AssertValue(isNA); + var srcValues = src.GetValues(); int newCount = 0; - for (int i = 0; i < src.Count; i++) + for (int i = 0; i < srcValues.Length; i++) { - if (!isNA(in src.Values[i])) + if (!isNA(in srcValues[i])) newCount++; } - Host.Assert(newCount <= src.Count); + Host.Assert(newCount <= srcValues.Length); if (newCount == 0) { - dst = new VBuffer(src.Length - src.Count, 0, dst.Values, dst.Indices); + VBufferUtils.Resize(ref dst, src.Length - srcValues.Length, 0); return; } - if (newCount == src.Count) + if (newCount == srcValues.Length) { Utils.Swap(ref src, ref dst); return; } - var values = dst.Values; - if (Utils.Size(values) < newCount) - values = new TDst[newCount]; - int iDst = 0; if (src.IsDense) { - for (int i = 0; i < src.Count; i++) + var editor = VBufferEditor.Create(ref dst, newCount); + for (int i = 0; i < srcValues.Length; i++) { - if (!isNA(in src.Values[i])) + if (!isNA(in srcValues[i])) { - values[iDst] = src.Values[i]; + editor.Values[iDst] = srcValues[i]; iDst++; } } Host.Assert(iDst == newCount); - dst = new VBuffer(newCount, values, dst.Indices); + dst = editor.Commit(); } else { - var indices = dst.Indices; - if (Utils.Size(indices) < newCount) - indices = new int[newCount]; + var newLength = src.Length - srcValues.Length - newCount; + var editor = VBufferEditor.Create(ref dst, newLength, newCount); + var srcIndices = src.GetIndices(); int offset = 0; - for (int i = 0; i < src.Count; i++) + for (int i = 0; i < srcValues.Length; i++) { - if (!isNA(in src.Values[i])) + if (!isNA(in srcValues[i])) { - values[iDst] = src.Values[i]; - indices[iDst] = src.Indices[i] - offset; + editor.Values[iDst] = srcValues[i]; + editor.Indices[iDst] = srcIndices[i] - offset; iDst++; } else offset++; } Host.Assert(iDst == newCount); - Host.Assert(offset == src.Count - newCount); - dst = new VBuffer(src.Length - offset, newCount, values, indices); + Host.Assert(offset == srcValues.Length - newCount); + dst = editor.Commit(); } } } diff --git a/src/Microsoft.ML.Transforms/MissingValueIndicatorTransform.cs b/src/Microsoft.ML.Transforms/MissingValueIndicatorTransform.cs index 5e42acfedb..6ebc10459b 100644 --- a/src/Microsoft.ML.Transforms/MissingValueIndicatorTransform.cs +++ b/src/Microsoft.ML.Transforms/MissingValueIndicatorTransform.cs @@ -274,32 +274,25 @@ protected override Delegate GetGetterCore(IChannel ch, IRow input, int iinfo, ou private static void FillValues(Float input, ref VBuffer result) { - var values = result.Values; - var indices = result.Indices; - if (input == 0) { - result = new VBuffer(2, 0, values, indices); + VBufferUtils.Resize(ref result, 2, 0); return; } - if (Utils.Size(values) < 1) - values = new Float[1]; - if (Utils.Size(indices) < 1) - indices = new int[1]; - + var editor = VBufferEditor.Create(ref result, 2, 1); if (Float.IsNaN(input)) { - values[0] = 1; - indices[0] = 1; + editor.Values[0] = 1; + editor.Indices[0] = 1; } else { - values[0] = input; - indices[0] = 0; + editor.Values[0] = input; + editor.Indices[0] = 0; } - result = new VBuffer(2, 1, values, indices); + result = editor.Commit(); 
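// Editor's note (not part of this patch): Commit() here produces a sparse buffer of logical length 2
// with a single stored value, replacing the old "new VBuffer(2, 1, values, indices)" construction
// while reusing the backing storage the editor took over from the previous result where it was
// large enough.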
} // This converts in place. @@ -308,18 +301,14 @@ private static void FillValues(IExceptionContext ectx, ref VBuffer buffer int size = buffer.Length; ectx.Check(0 <= size & size < int.MaxValue / 2); - int count = buffer.Count; - var values = buffer.Values; - var indices = buffer.Indices; + var values = buffer.GetValues(); + var editor = VBufferEditor.Create(ref buffer, size * 2, values.Length); int iivDst = 0; - if (count >= size) + if (buffer.IsDense) { // Currently, it's dense. We always produce sparse. - ectx.Assert(Utils.Size(values) >= size); - if (Utils.Size(indices) < size) - indices = new int[size]; - for (int ivSrc = 0; ivSrc < count; ivSrc++) + for (int ivSrc = 0; ivSrc < values.Length; ivSrc++) { ectx.Assert(iivDst <= ivSrc); var val = values[ivSrc]; @@ -327,13 +316,13 @@ private static void FillValues(IExceptionContext ectx, ref VBuffer buffer continue; if (Float.IsNaN(val)) { - values[iivDst] = 1; - indices[iivDst] = 2 * ivSrc + 1; + editor.Values[iivDst] = 1; + editor.Indices[iivDst] = 2 * ivSrc + 1; } else { - values[iivDst] = val; - indices[iivDst] = 2 * ivSrc; + editor.Values[iivDst] = val; + editor.Indices[iivDst] = 2 * ivSrc; } iivDst++; } @@ -341,11 +330,10 @@ private static void FillValues(IExceptionContext ectx, ref VBuffer buffer else { // Currently, it's sparse. - ectx.Assert(Utils.Size(values) >= count); - ectx.Assert(Utils.Size(indices) >= count); + var indices = buffer.GetIndices(); int ivPrev = -1; - for (int iivSrc = 0; iivSrc < count; iivSrc++) + for (int iivSrc = 0; iivSrc < values.Length; iivSrc++) { ectx.Assert(iivDst <= iivSrc); var val = values[iivSrc]; @@ -356,20 +344,20 @@ private static void FillValues(IExceptionContext ectx, ref VBuffer buffer ivPrev = iv; if (Float.IsNaN(val)) { - values[iivDst] = 1; - indices[iivDst] = 2 * iv + 1; + editor.Values[iivDst] = 1; + editor.Indices[iivDst] = 2 * iv + 1; } else { - values[iivDst] = val; - indices[iivDst] = 2 * iv; + editor.Values[iivDst] = val; + editor.Indices[iivDst] = 2 * iv; } iivDst++; } } - ectx.Assert(0 <= iivDst & iivDst <= count); - buffer = new VBuffer(size * 2, iivDst, values, indices); + ectx.Assert(0 <= iivDst & iivDst <= values.Length); + buffer = editor.CommitTruncated(iivDst); } } } diff --git a/src/Microsoft.ML.Transforms/MissingValueIndicatorTransformer.cs b/src/Microsoft.ML.Transforms/MissingValueIndicatorTransformer.cs index 440f71d0be..305196c743 100644 --- a/src/Microsoft.ML.Transforms/MissingValueIndicatorTransformer.cs +++ b/src/Microsoft.ML.Transforms/MissingValueIndicatorTransformer.cs @@ -294,8 +294,8 @@ private void FindNAs(in VBuffer src, InPredicate isNA, bool defaultIsNA // Find the indices of all of the NAs. indices.Clear(); - var srcValues = src.Values; - var srcCount = src.Count; + var srcValues = src.GetValues(); + var srcCount = srcValues.Length; if (src.IsDense) { for (int i = 0; i < srcCount; i++) @@ -307,7 +307,7 @@ private void FindNAs(in VBuffer src, InPredicate isNA, bool defaultIsNA } else if (!defaultIsNA) { - var srcIndices = src.Indices; + var srcIndices = src.GetIndices(); for (int ii = 0; ii < srcCount; ii++) { if (isNA(in srcValues[ii])) @@ -318,7 +318,7 @@ private void FindNAs(in VBuffer src, InPredicate isNA, bool defaultIsNA else { // Note that this adds non-NAs to indices -- this is indicated by sense being false. 
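// Editor's aside (illustrative sketch, not part of this patch): the read side of this change is
// uniform: callers take read-only span views instead of touching the backing arrays, as the lines
// below do. For any VBuffer<T> buf (Consume is a hypothetical callback):
//     ReadOnlySpan<T> vals = buf.GetValues();   // only the explicitly stored values
//     ReadOnlySpan<int> idx = buf.GetIndices(); // empty when buf.IsDense
//     for (int i = 0; i < vals.Length; i++)
//         Consume(buf.IsDense ? i : idx[i], vals[i]);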
- var srcIndices = src.Indices; + var srcIndices = src.GetIndices(); for (int ii = 0; ii < srcCount; ii++) { if (!isNA(in srcValues[ii])) diff --git a/src/Microsoft.ML.Transforms/MissingValueReplacing.cs b/src/Microsoft.ML.Transforms/MissingValueReplacing.cs index c4e39b877f..b70f02f6ad 100644 --- a/src/Microsoft.ML.Transforms/MissingValueReplacing.cs +++ b/src/Microsoft.ML.Transforms/MissingValueReplacing.cs @@ -734,10 +734,8 @@ private void FillValues(in VBuffer src, ref VBuffer dst, InPredicate Host.AssertValue(isNA); int srcSize = src.Length; - int srcCount = src.Count; - var srcValues = src.Values; - Host.Assert(Utils.Size(srcValues) >= srcCount); - var srcIndices = src.Indices; + var srcValues = src.GetValues(); + int srcCount = srcValues.Length; var dstValues = dst.Values; var dstIndices = dst.Indices; @@ -768,8 +766,8 @@ private void FillValues(in VBuffer src, ref VBuffer dst, InPredicate else { // The source vector is sparse. - Host.Assert(Utils.Size(srcIndices) >= srcCount); Host.Assert(srcCount < srcSize); + var srcIndices = src.GetIndices(); // Allocate more space if necessary. // REVIEW: One thing that changing the code to simply ensure that there are srcCount indices in the arrays @@ -818,10 +816,8 @@ private void FillValues(in VBuffer src, ref VBuffer dst, InPredicate Host.AssertValue(isNA); int srcSize = src.Length; - int srcCount = src.Count; - var srcValues = src.Values; - Host.Assert(Utils.Size(srcValues) >= srcCount); - var srcIndices = src.Indices; + var srcValues = src.GetValues(); + int srcCount = srcValues.Length; var dstValues = dst.Values; var dstIndices = dst.Indices; @@ -830,7 +826,6 @@ private void FillValues(in VBuffer src, ref VBuffer dst, InPredicate Utils.EnsureSize(ref dstValues, srcCount, srcSize, keepOld: false); int iivDst = 0; - Host.Assert(Utils.Size(srcValues) >= srcCount); if (src.IsDense) { // The source vector is dense. @@ -852,8 +847,8 @@ private void FillValues(in VBuffer src, ref VBuffer dst, InPredicate else { // The source vector is sparse. - Host.Assert(Utils.Size(srcIndices) >= srcCount); Host.Assert(srcCount < srcSize); + var srcIndices = src.GetIndices(); // Allocate more space if necessary. // REVIEW: One thing that changing the code to simply ensure that there are srcCount indices in the arrays diff --git a/src/Microsoft.ML.Transforms/MissingValueReplacingUtils.cs b/src/Microsoft.ML.Transforms/MissingValueReplacingUtils.cs index 921cd4292e..8466d1b5ef 100644 --- a/src/Microsoft.ML.Transforms/MissingValueReplacingUtils.cs +++ b/src/Microsoft.ML.Transforms/MissingValueReplacingUtils.cs @@ -185,9 +185,8 @@ protected StatAggregatorAcrossSlots(IChannel ch, IRowCursor cursor, int col) protected sealed override void ProcessRow(in VBuffer src) { - var srcCount = src.Count; - var srcValues = src.Values; - Ch.Assert(Utils.Size(srcValues) >= srcCount); + var srcValues = src.GetValues(); + var srcCount = srcValues.Length; for (int slot = 0; slot < srcCount; slot++) ProcessValue(in srcValues[slot]); @@ -210,9 +209,8 @@ protected StatAggregatorBySlot(IChannel ch, ColumnType type, IRowCursor cursor, protected sealed override void ProcessRow(in VBuffer src) { - var srcCount = src.Count; - var srcValues = src.Values; - Ch.Assert(Utils.Size(srcValues) >= srcCount); + var srcValues = src.GetValues(); + var srcCount = srcValues.Length; if (src.IsDense) { // The src vector is dense. @@ -222,8 +220,7 @@ protected sealed override void ProcessRow(in VBuffer src) else { // The src vector is sparse. 
- var srcIndices = src.Indices; - Ch.Assert(Utils.Size(srcIndices) >= srcCount); + var srcIndices = src.GetIndices(); for (int islot = 0; islot < srcCount; islot++) ProcessValue(in srcValues[islot], srcIndices[islot]); } diff --git a/src/Microsoft.ML.Transforms/MutualInformationFeatureSelection.cs b/src/Microsoft.ML.Transforms/MutualInformationFeatureSelection.cs index c98ccfab04..906e7447ad 100644 --- a/src/Microsoft.ML.Transforms/MutualInformationFeatureSelection.cs +++ b/src/Microsoft.ML.Transforms/MutualInformationFeatureSelection.cs @@ -406,28 +406,28 @@ private void GetLabels(Transposer trans, ColumnType labelType, int labelCol) { var tmp = default(VBuffer); trans.GetSingleSlotValue(labelCol, ref tmp); - BinInts(ref tmp, ref labels, _numBins, out min, out lim); + BinInts(in tmp, ref labels, _numBins, out min, out lim); _numLabels = lim - min; } else if (labelType == NumberType.R4) { var tmp = default(VBuffer); trans.GetSingleSlotValue(labelCol, ref tmp); - BinSingles(ref tmp, ref labels, _numBins, out min, out lim); + BinSingles(in tmp, ref labels, _numBins, out min, out lim); _numLabels = lim - min; } else if (labelType == NumberType.R8) { var tmp = default(VBuffer); trans.GetSingleSlotValue(labelCol, ref tmp); - BinDoubles(ref tmp, ref labels, _numBins, out min, out lim); + BinDoubles(in tmp, ref labels, _numBins, out min, out lim); _numLabels = lim - min; } else if (labelType.IsBool) { var tmp = default(VBuffer); trans.GetSingleSlotValue(labelCol, ref tmp); - BinBools(ref tmp, ref labels); + BinBools(in tmp, ref labels); _numLabels = 3; min = -1; lim = 2; @@ -485,7 +485,7 @@ private Single[] ComputeMutualInformation(Transposer trans, int col) return ComputeMutualInformation(trans, col, (ref VBuffer src, ref VBuffer dst, out int min, out int lim) => { - BinInts(ref src, ref dst, _numBins, out min, out lim); + BinInts(in src, ref dst, _numBins, out min, out lim); }); } if (type.ItemType == NumberType.R4) @@ -493,7 +493,7 @@ private Single[] ComputeMutualInformation(Transposer trans, int col) return ComputeMutualInformation(trans, col, (ref VBuffer src, ref VBuffer dst, out int min, out int lim) => { - BinSingles(ref src, ref dst, _numBins, out min, out lim); + BinSingles(in src, ref dst, _numBins, out min, out lim); }); } if (type.ItemType == NumberType.R8) @@ -501,7 +501,7 @@ private Single[] ComputeMutualInformation(Transposer trans, int col) return ComputeMutualInformation(trans, col, (ref VBuffer src, ref VBuffer dst, out int min, out int lim) => { - BinDoubles(ref src, ref dst, _numBins, out min, out lim); + BinDoubles(in src, ref dst, _numBins, out min, out lim); }); } if (type.ItemType.IsBool) @@ -511,7 +511,7 @@ private Single[] ComputeMutualInformation(Transposer trans, int col) { min = -1; lim = 2; - BinBools(ref src, ref dst); + BinBools(in src, ref dst); }); } Contracts.Assert(0 < type.ItemType.KeyCount && type.ItemType.KeyCount < Utils.ArrayMaxSize); @@ -610,12 +610,13 @@ private Single ComputeMutualInformation(in VBuffer features, int numFeature private void FillTable(in VBuffer features, int offset, int numFeatures) { Contracts.Assert(_labels.Length == features.Length); + var featureValues = features.GetValues(); if (features.IsDense) { for (int i = 0; i < _labels.Length; i++) { var label = _labels[i]; - var feature = features.Values[i] - offset; + var feature = featureValues[i] - offset; Contracts.Assert(0 <= label && label < _numLabels); Contracts.Assert(0 <= feature && feature < numFeatures); _contingencyTable[label][feature]++; @@ -623,23 +624,24 @@ private void 
FillTable(in VBuffer features, int offset, int numFeatures) return; } + var featureIndices = features.GetIndices(); int ii = 0; for (int i = 0; i < _labels.Length; i++) { var label = _labels[i]; int feature; - if (ii == features.Count || i < features.Indices[ii]) + if (ii == featureIndices.Length || i < featureIndices[ii]) feature = -offset; else { - feature = features.Values[ii] - offset; + feature = featureValues[ii] - offset; ii++; } Contracts.Assert(0 <= label && label < _numLabels); Contracts.Assert(0 <= feature && feature < numFeatures); _contingencyTable[label][feature]++; } - Contracts.Assert(ii == features.Count); + Contracts.Assert(ii == featureIndices.Length); } /// @@ -673,12 +675,12 @@ private static ValueMapper, VBuffer> BinKeys(ColumnType colTy /// /// Maps Ints. /// - private void BinInts(ref VBuffer input, ref VBuffer output, + private void BinInts(in VBuffer input, ref VBuffer output, int numBins, out int min, out int lim) { Contracts.Assert(_singles.Count == 0); - var bounds = _binFinder.FindBins(numBins, _singles, input.Length - input.Count); + var bounds = _binFinder.FindBins(numBins, _singles, input.Length - input.GetValues().Length); min = -1 - bounds.FindIndexSorted(0); lim = min + bounds.Length + 1; int offset = min; @@ -692,21 +694,19 @@ private void BinInts(ref VBuffer input, ref VBuffer output, /// /// Maps from Singles to ints. NaNs (and only NaNs) are mapped to the first bin. /// - private void BinSingles(ref VBuffer input, ref VBuffer output, + private void BinSingles(in VBuffer input, ref VBuffer output, int numBins, out int min, out int lim) { Contracts.Assert(_singles.Count == 0); - if (input.Values != null) + var inputValues = input.GetValues(); + for (int i = 0; i < inputValues.Length; i++) { - for (int i = 0; i < input.Count; i++) - { - var val = input.Values[i]; - if (!Single.IsNaN(val)) - _singles.Add(val); - } + var val = inputValues[i]; + if (!Single.IsNaN(val)) + _singles.Add(val); } - var bounds = _binFinder.FindBins(numBins, _singles, input.Length - input.Count); + var bounds = _binFinder.FindBins(numBins, _singles, input.Length - inputValues.Length); min = -1 - bounds.FindIndexSorted(0); lim = min + bounds.Length + 1; int offset = min; @@ -720,21 +720,19 @@ private void BinSingles(ref VBuffer input, ref VBuffer output, /// /// Maps from Doubles to ints. NaNs (and only NaNs) are mapped to the first bin. 
/// - private void BinDoubles(ref VBuffer input, ref VBuffer output, + private void BinDoubles(in VBuffer input, ref VBuffer output, int numBins, out int min, out int lim) { Contracts.Assert(_doubles.Count == 0); - if (input.Values != null) + var inputValues = input.GetValues(); + for (int i = 0; i < inputValues.Length; i++) { - for (int i = 0; i < input.Count; i++) - { - var val = input.Values[i]; - if (!Double.IsNaN(val)) - _doubles.Add(val); - } + var val = inputValues[i]; + if (!Double.IsNaN(val)) + _doubles.Add(val); } - var bounds = _binFinder.FindBins(numBins, _doubles, input.Length - input.Count); + var bounds = _binFinder.FindBins(numBins, _doubles, input.Length - inputValues.Length); var offset = min = -1 - bounds.FindIndexSorted(0); lim = min + bounds.Length + 1; ValueMapper mapper = @@ -744,7 +742,7 @@ private void BinDoubles(ref VBuffer input, ref VBuffer output, _doubles.Clear(); } - private void BinBools(ref VBuffer input, ref VBuffer output) + private void BinBools(in VBuffer input, ref VBuffer output) { if (_boolMapper == null) _boolMapper = CreateVectorMapper(BinOneBool); @@ -775,24 +773,20 @@ private static ValueMapper, VBuffer> CreateVectorMapper(this ValueMapper map, in VBuffer input, ref VBuffer output) { - var values = output.Values; - if (Utils.Size(values) < input.Count) - values = new TDst[input.Count]; - for (int i = 0; i < input.Count; i++) + var inputValues = input.GetValues(); + var editor = VBufferEditor.Create(ref output, input.Length, inputValues.Length); + for (int i = 0; i < inputValues.Length; i++) { - TSrc val = input.Values[i]; - map(in val, ref values[i]); + TSrc val = inputValues[i]; + map(in val, ref editor.Values[i]); } - var indices = output.Indices; - if (!input.IsDense && input.Count > 0) + if (!input.IsDense && inputValues.Length > 0) { - if (Utils.Size(indices) < input.Count) - indices = new int[input.Count]; - Array.Copy(input.Indices, indices, input.Count); + input.GetIndices().CopyTo(editor.Indices); } - output = new VBuffer(input.Length, input.Count, values, indices); + output = editor.Commit(); } } } diff --git a/src/Microsoft.ML.Transforms/RandomFourierFeaturizing.cs b/src/Microsoft.ML.Transforms/RandomFourierFeaturizing.cs index d3865c303e..fb8207c5fb 100644 --- a/src/Microsoft.ML.Transforms/RandomFourierFeaturizing.cs +++ b/src/Microsoft.ML.Transforms/RandomFourierFeaturizing.cs @@ -615,9 +615,11 @@ private void TransformFeatures(in VBuffer src, ref VBuffer dst, Tr { // This overload of MatTimesSrc ignores the values in slots that are not in src.Indices, so there is // no need to zero them out. 
- featuresAligned.CopyFrom(src.Indices, src.Values, 0, 0, src.Count, zeroItems: false); - CpuMathUtils.MatrixTimesSource(transformInfo.RndFourierVectors, src.Indices, featuresAligned, 0, 0, - src.Count, productAligned, transformInfo.NewDim); + var srcValues = src.GetValues(); + var srcIndices = src.GetIndices(); + featuresAligned.CopyFrom(srcIndices, srcValues, 0, 0, srcValues.Length, zeroItems: false); + CpuMathUtils.MatrixTimesSource(transformInfo.RndFourierVectors, srcIndices, featuresAligned, 0, 0, + srcValues.Length, productAligned, transformInfo.NewDim); } for (int i = 0; i < transformInfo.NewDim; i++) diff --git a/src/Microsoft.ML.Transforms/Text/CharTokenizeTransform.cs b/src/Microsoft.ML.Transforms/Text/CharTokenizeTransform.cs index 53f99405d7..f885d6722d 100644 --- a/src/Microsoft.ML.Transforms/Text/CharTokenizeTransform.cs +++ b/src/Microsoft.ML.Transforms/Text/CharTokenizeTransform.cs @@ -457,39 +457,37 @@ private ValueGetter> MakeGetterVec(IRow input, int iinfo) getSrc(ref src); int len = 0; - for (int i = 0; i < src.Count; i++) + var srcValues = src.GetValues(); + for (int i = 0; i < srcValues.Length; i++) { - if (!src.Values[i].IsEmpty) + if (!srcValues[i].IsEmpty) { - len += src.Values[i].Length; + len += srcValues[i].Length; if (_parent._useMarkerChars) len += TextMarkersCount; } } - var values = dst.Values; + var editor = VBufferEditor.Create(ref dst, len); if (len > 0) { - if (Utils.Size(values) < len) - values = new ushort[len]; - int index = 0; - for (int i = 0; i < src.Count; i++) + for (int i = 0; i < srcValues.Length; i++) { - if (src.Values[i].IsEmpty) + if (srcValues[i].IsEmpty) continue; if (_parent._useMarkerChars) - values[index++] = TextStartMarker; - var span = src.Values[i].Span; - for (int ich = 0; ich < src.Values[i].Length; ich++) - values[index++] = span[ich]; + editor.Values[index++] = TextStartMarker; + var span = srcValues[i].Span; + for (int ich = 0; ich < srcValues[i].Length; ich++) + editor.Values[index++] = span[ich]; if (_parent._useMarkerChars) - values[index++] = TextEndMarker; + editor.Values[index++] = TextEndMarker; } Contracts.Assert(index == len); } - dst = new VBuffer(len, values, dst.Indices); + dst = editor.Commit(); }; ValueGetter> getterWithUnitSep = (ref VBuffer dst) => @@ -498,11 +496,12 @@ private ValueGetter> MakeGetterVec(IRow input, int iinfo) int len = 0; - for (int i = 0; i < src.Count; i++) + var srcValues = src.GetValues(); + for (int i = 0; i < srcValues.Length; i++) { - if (!src.Values[i].IsEmpty) + if (!srcValues[i].IsEmpty) { - len += src.Values[i].Length; + len += srcValues[i].Length; if (i > 0) len += 1; // add UnitSeparator character to len that will be added @@ -512,12 +511,9 @@ private ValueGetter> MakeGetterVec(IRow input, int iinfo) if (_parent._useMarkerChars) len += TextMarkersCount; - var values = dst.Values; + var editor = VBufferEditor.Create(ref dst, len); if (len > 0) { - if (Utils.Size(values) < len) - values = new ushort[len]; - int index = 0; // ReadOnlyMemory can be a result of either concatenating text columns together @@ -527,33 +523,32 @@ private ValueGetter> MakeGetterVec(IRow input, int iinfo) // Therefore, prepend and append start and end markers only once i.e. at the start and at end of vector. // Insert UnitSeparator after every piece of text in the vector. 
if (_parent._useMarkerChars) - values[index++] = TextStartMarker; + editor.Values[index++] = TextStartMarker; - for (int i = 0; i < src.Count; i++) + for (int i = 0; i < srcValues.Length; i++) { - if (src.Values[i].IsEmpty) + if (srcValues[i].IsEmpty) continue; if (i > 0) - values[index++] = UnitSeparator; + editor.Values[index++] = UnitSeparator; - var span = src.Values[i].Span; - for (int ich = 0; ich < src.Values[i].Length; ich++) - values[index++] = span[ich]; + var span = srcValues[i].Span; + for (int ich = 0; ich < srcValues[i].Length; ich++) + editor.Values[index++] = span[ich]; } if (_parent._useMarkerChars) - values[index++] = TextEndMarker; + editor.Values[index++] = TextEndMarker; Contracts.Assert(index == len); } - dst = new VBuffer(len, values, dst.Indices); + dst = editor.Commit(); }; return _parent._isSeparatorStartEnd ? getterWithStartEndSep : getterWithUnitSep; } } - } /// diff --git a/src/Microsoft.ML.Transforms/Text/LdaSingleBox.cs b/src/Microsoft.ML.Transforms/Text/LdaSingleBox.cs index 4a9ef780ca..af643a2907 100644 --- a/src/Microsoft.ML.Transforms/Text/LdaSingleBox.cs +++ b/src/Microsoft.ML.Transforms/Text/LdaSingleBox.cs @@ -181,7 +181,7 @@ public void SetAlphaSum(float averageDocLength) LdaInterface.SetAlphaSum(_engine, averageDocLength); } - public int LoadDoc(int[] termID, double[] termVal, int termNum, int numVocab) + public int LoadDoc(ReadOnlySpan termID, ReadOnlySpan termVal, int termNum, int numVocab) { Contracts.Check(numVocab == NumVocab); Contracts.Check(termNum > 0); @@ -189,12 +189,14 @@ public int LoadDoc(int[] termID, double[] termVal, int termNum, int numVocab) Contracts.Check(termVal.Length >= termNum); int[] pID = new int[termNum]; - int[] pVal = termVal.Select(item => (int)item).ToArray(); - Array.Copy(termID, pID, termNum); + int[] pVal = new int[termVal.Length]; + for (int i = 0; i < termVal.Length; i++) + pVal[i] = (int)termVal[i]; + termID.Slice(0, termNum).CopyTo(pID); return LdaInterface.FeedInData(_engine, pID, pVal, termNum, NumVocab); } - public int LoadDocDense(double[] termVal, int termNum, int numVocab) + public int LoadDocDense(ReadOnlySpan termVal, int termNum, int numVocab) { Contracts.Check(numVocab == NumVocab); Contracts.Check(termNum > 0); @@ -202,9 +204,10 @@ public int LoadDocDense(double[] termVal, int termNum, int numVocab) Contracts.Check(termVal.Length >= termNum); int[] pID = new int[termNum]; - int[] pVal = termVal.Select(item => (int)item).ToArray(); + int[] pVal = new int[termVal.Length]; + for (int i = 0; i < termVal.Length; i++) + pVal[i] = (int)termVal[i]; return LdaInterface.FeedInDataDense(_engine, pVal, termNum, NumVocab); - } public List> GetDocTopicVector(int docID) @@ -244,17 +247,19 @@ public List> GetDocTopicVector(int docID) return topicRet; } - public List> TestDoc(int[] termID, double[] termVal, int termNum, int numBurninIter, bool reset) + public List> TestDoc(ReadOnlySpan termID, ReadOnlySpan termVal, int termNum, int numBurninIter, bool reset) { Contracts.Check(termNum > 0); Contracts.Check(termVal.Length >= termNum); Contracts.Check(termID.Length >= termNum); int[] pID = new int[termNum]; - int[] pVal = termVal.Select(item => (int)item).ToArray(); + int[] pVal = new int[termVal.Length]; + for (int i = 0; i < termVal.Length; i++) + pVal[i] = (int)termVal[i]; int[] pTopic = new int[NumTopic]; int[] pProb = new int[NumTopic]; - Array.Copy(termID, pID, termNum); + termID.Slice(0, termNum).CopyTo(pID); int numTopicReturn = NumTopic; @@ -273,12 +278,14 @@ public List> TestDoc(int[] termID, double[] termVal, 
in return topicRet; } - public List> TestDocDense(double[] termVal, int termNum, int numBurninIter, bool reset) + public List> TestDocDense(ReadOnlySpan termVal, int termNum, int numBurninIter, bool reset) { Contracts.Check(termNum > 0); Contracts.Check(numBurninIter > 0); Contracts.Check(termVal.Length >= termNum); - int[] pVal = termVal.Select(item => (int)item).ToArray(); + int[] pVal = new int[termVal.Length]; + for (int i = 0; i < termVal.Length; i++) + pVal[i] = (int)termVal[i]; int[] pTopic = new int[NumTopic]; int[] pProb = new int[NumTopic]; diff --git a/src/Microsoft.ML.Transforms/Text/LdaTransform.cs b/src/Microsoft.ML.Transforms/Text/LdaTransform.cs index 3f697a8478..0c18f713d3 100644 --- a/src/Microsoft.ML.Transforms/Text/LdaTransform.cs +++ b/src/Microsoft.ML.Transforms/Text/LdaTransform.cs @@ -505,9 +505,10 @@ private void Train(IChannel ch, IDataView trainingData, LdaState[] states) getters[i](ref src); // compute term, doc instance#. - for (int termID = 0; termID < src.Count; termID++) + var srcValues = src.GetValues(); + for (int termID = 0; termID < srcValues.Length; termID++) { - int termFreq = GetFrequency(src.Values[termID]); + int termFreq = GetFrequency(srcValues[termID]); if (termFreq < 0) { // Ignore this row. @@ -792,9 +793,10 @@ public int FeedTrain(IExceptionContext ectx, in VBuffer input) int docSize = 0; int termNum = 0; - for (int i = 0; i < input.Count; i++) + var inputValues = input.GetValues(); + for (int i = 0; i < inputValues.Length; i++) { - int termFreq = GetFrequency(input.Values[i]); + int termFreq = GetFrequency(inputValues[i]); if (termFreq < 0) { // Ignore this row. @@ -814,9 +816,9 @@ public int FeedTrain(IExceptionContext ectx, in VBuffer input) int actualSize = 0; if (input.IsDense) - actualSize = _ldaTrainer.LoadDocDense(input.Values, termNum, input.Length); + actualSize = _ldaTrainer.LoadDocDense(inputValues, termNum, input.Length); else - actualSize = _ldaTrainer.LoadDoc(input.Indices, input.Values, termNum, input.Length); + actualSize = _ldaTrainer.LoadDoc(input.GetIndices(), inputValues, termNum, input.Length); ectx.Assert(actualSize == 2 * docSize + 1, string.Format("The doc size are distinct. Actual: {0}, Expected: {1}", actualSize, 2 * docSize + 1)); return actualSize; @@ -849,30 +851,29 @@ public void Output(in VBuffer src, ref VBuffer dst, int numBurnin } int len = InfoEx.NumTopic; - var values = dst.Values; - var indices = dst.Indices; - if (src.Count == 0) + var srcValues = src.GetValues(); + if (srcValues.Length == 0) { - dst = new VBuffer(len, 0, values, indices); + VBufferUtils.Resize(ref dst, len, 0); return; } + VBufferEditor editor; // Make sure all the frequencies are valid and truncate if the sum gets too large. int docSize = 0; int termNum = 0; - for (int i = 0; i < src.Count; i++) + for (int i = 0; i < srcValues.Length; i++) { - int termFreq = GetFrequency(src.Values[i]); + int termFreq = GetFrequency(srcValues[i]); if (termFreq < 0) { // REVIEW: Should this log a warning message? And what should it produce? // It currently produces a vbuffer of all NA values. // REVIEW: Need a utility method to do this... - if (Utils.Size(values) < len) - values = new Float[len]; + editor = VBufferEditor.Create(ref dst, len); for (int k = 0; k < len; k++) - values[k] = Float.NaN; - dst = new VBuffer(len, values, indices); + editor.Values[k] = Float.NaN; + dst = editor.Commit(); return; } @@ -886,17 +887,14 @@ public void Output(in VBuffer src, ref VBuffer dst, int numBurnin // REVIEW: Too much memory allocation here on each prediction. 
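// Illustrative sketch (not part of this patch) of why the LdaSingleBox entry points above now
// take ReadOnlySpan<T>: callers such as FeedTrain and Output can hand over a VBuffer's backing
// data directly instead of the old Take(...).ToArray() copies. HeaviestTerm is a hypothetical
// helper; only GetValues()/GetIndices()/IsDense come from the patch.
private static int HeaviestTerm(ReadOnlySpan<int> termIds, ReadOnlySpan<double> termValues)
{
    int best = -1;
    double bestWeight = double.NegativeInfinity;
    for (int i = 0; i < termValues.Length; i++)
    {
        if (termValues[i] > bestWeight)
        {
            bestWeight = termValues[i];
            // Dense callers pass an empty id span, so the position doubles as the term id.
            best = termIds.IsEmpty ? i : termIds[i];
        }
    }
    return best;
}

private static int HeaviestTerm(in VBuffer<double> doc)
{
    return doc.IsDense
        ? HeaviestTerm(ReadOnlySpan<int>.Empty, doc.GetValues())
        : HeaviestTerm(doc.GetIndices(), doc.GetValues());
}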
List> retTopics; if (src.IsDense) - retTopics = _ldaTrainer.TestDocDense(src.Values, termNum, numBurninIter, reset); + retTopics = _ldaTrainer.TestDocDense(srcValues, termNum, numBurninIter, reset); else - retTopics = _ldaTrainer.TestDoc(src.Indices.Take(src.Count).ToArray(), src.Values.Take(src.Count).ToArray(), termNum, numBurninIter, reset); + retTopics = _ldaTrainer.TestDoc(src.GetIndices(), srcValues, termNum, numBurninIter, reset); int count = retTopics.Count; Contracts.Assert(count <= len); - if (Utils.Size(values) < count) - values = new Float[count]; - if (count < len && Utils.Size(indices) < count) - indices = new int[count]; + editor = VBufferEditor.Create(ref dst, len, count); double normalizer = 0; for (int i = 0; i < count; i++) { @@ -906,22 +904,22 @@ public void Output(in VBuffer src, ref VBuffer dst, int numBurnin Contracts.Assert(0 <= index && index < len); if (count < len) { - Contracts.Assert(i == 0 || indices[i - 1] < index); - indices[i] = index; + Contracts.Assert(i == 0 || editor.Indices[i - 1] < index); + editor.Indices[i] = index; } else Contracts.Assert(index == i); - values[i] = value; + editor.Values[i] = value; normalizer += value; } if (normalizer > 0) { for (int i = 0; i < count; i++) - values[i] = (Float)(values[i] / normalizer); + editor.Values[i] = (Float)(editor.Values[i] / normalizer); } - dst = new VBuffer(len, count, values, indices); + dst = editor.Commit(); } public void Dispose() diff --git a/src/Microsoft.ML.Transforms/Text/NgramTransform.cs b/src/Microsoft.ML.Transforms/Text/NgramTransform.cs index a443b9ecdd..deaa7855cd 100644 --- a/src/Microsoft.ML.Transforms/Text/NgramTransform.cs +++ b/src/Microsoft.ML.Transforms/Text/NgramTransform.cs @@ -315,14 +315,15 @@ public override void Save(ModelSaveContext ctx) if (_slotNamesTypes[i] != null) { GetSlotNames(i, ref ngramsNames); - Host.Assert(_ngramMaps[i].Count == ngramsNames.Count); + Host.Assert(_ngramMaps[i].Count == ngramsNames.GetValues().Length); Host.Assert(ngramsNames.IsDense); ctx.SaveTextStream(string.Format("{0}-ngrams.txt", Infos[i].Name), writer => { - writer.WriteLine("# Number of Ngrams terms = {0}", ngramsNames.Count); - for (int j = 0; j < ngramsNames.Count; j++) - writer.WriteLine("{0}\t{1}", j, ngramsNames.Values[j]); + var explicitNgramNames = ngramsNames.GetValues(); + writer.WriteLine("# Number of Ngrams terms = {0}", explicitNgramNames.Length); + for (int j = 0; j < explicitNgramNames.Length; j++) + writer.WriteLine("{0}\t{1}", j, explicitNgramNames[j]); }); } } diff --git a/src/Microsoft.ML.Transforms/Text/NgramUtils.cs b/src/Microsoft.ML.Transforms/Text/NgramUtils.cs index 7a0db6d8bd..38bc6333e8 100644 --- a/src/Microsoft.ML.Transforms/Text/NgramUtils.cs +++ b/src/Microsoft.ML.Transforms/Text/NgramUtils.cs @@ -73,12 +73,13 @@ public bool AddNgrams(in VBuffer src, int icol, uint keyMax) Contracts.Assert(icol >= 0); Contracts.Assert(keyMax > 0); + var srcValues = src.GetValues(); uint curKey = 0; if (src.IsDense) { for (int i = 0; i < src.Length; i++) { - curKey = src.Values[i]; + curKey = srcValues[i]; if (curKey > keyMax) curKey = 0; @@ -92,13 +93,14 @@ public bool AddNgrams(in VBuffer src, int icol, uint keyMax) else { var queueSize = _queue.Capacity; + var srcIndices = src.GetIndices(); int iindex = 0; for (int i = 0; i < src.Length; i++) { - if (iindex < src.Count && i == src.Indices[iindex]) + if (iindex < srcIndices.Length && i == srcIndices[iindex]) { - curKey = src.Values[iindex]; + curKey = srcValues[iindex]; if (curKey > keyMax) curKey = 0; iindex++; diff --git 
a/src/Microsoft.ML.Transforms/Text/StopWordsRemoverTransform.cs b/src/Microsoft.ML.Transforms/Text/StopWordsRemoverTransform.cs index 98300ebb08..625f5503f9 100644 --- a/src/Microsoft.ML.Transforms/Text/StopWordsRemoverTransform.cs +++ b/src/Microsoft.ML.Transforms/Text/StopWordsRemoverTransform.cs @@ -464,16 +464,17 @@ protected override Delegate GetGetterCore(IChannel ch, IRow input, int iinfo, ou getSrc(ref src); list.Clear(); - for (int i = 0; i < src.Count; i++) + var srcValues = src.GetValues(); + for (int i = 0; i < srcValues.Length; i++) { - if (src.Values[i].IsEmpty) + if (srcValues[i].IsEmpty) continue; buffer.Clear(); - ReadOnlyMemoryUtils.AddLowerCaseToStringBuilder(src.Values[i].Span, buffer); + ReadOnlyMemoryUtils.AddLowerCaseToStringBuilder(srcValues[i].Span, buffer); // REVIEW nihejazi: Consider using a trie for string matching (Aho-Corasick, etc.) if (StopWords[(int)langToUse].Get(buffer) == null) - list.Add(src.Values[i]); + list.Add(srcValues[i]); } VBufferUtils.Copy(list, ref dst, list.Count); @@ -936,16 +937,17 @@ protected override Delegate GetGetterCore(IChannel ch, IRow input, int iinfo, ou getSrc(ref src); list.Clear(); - for (int i = 0; i < src.Count; i++) + var srcValues = src.GetValues(); + for (int i = 0; i < srcValues.Length; i++) { - if (src.Values[i].IsEmpty) + if (srcValues[i].IsEmpty) continue; buffer.Clear(); - ReadOnlyMemoryUtils.AddLowerCaseToStringBuilder(src.Values[i].Span, buffer); + ReadOnlyMemoryUtils.AddLowerCaseToStringBuilder(srcValues[i].Span, buffer); // REVIEW nihejazi: Consider using a trie for string matching (Aho-Corasick, etc.) if (_stopWordsMap.Get(buffer) == null) - list.Add(src.Values[i]); + list.Add(srcValues[i]); } VBufferUtils.Copy(list, ref dst, list.Count); diff --git a/src/Microsoft.ML.Transforms/Text/TextNormalizerTransform.cs b/src/Microsoft.ML.Transforms/Text/TextNormalizerTransform.cs index 60de96701b..6bad1cd87f 100644 --- a/src/Microsoft.ML.Transforms/Text/TextNormalizerTransform.cs +++ b/src/Microsoft.ML.Transforms/Text/TextNormalizerTransform.cs @@ -308,7 +308,7 @@ private ValueGetter> MakeGetterOne(IRow input, int iinfo) (ref ReadOnlyMemory dst) => { getSrc(ref src); - NormalizeSrc(ref src, ref dst, buffer); + NormalizeSrc(in src, ref dst, buffer); }; } @@ -325,9 +325,10 @@ private ValueGetter>> MakeGetterVec(IRow input, int { getSrc(ref src); list.Clear(); - for (int i = 0; i < src.Count; i++) + var srcValues = src.GetValues(); + for (int i = 0; i < srcValues.Length; i++) { - NormalizeSrc(ref src.Values[i], ref temp, buffer); + NormalizeSrc(in srcValues[i], ref temp, buffer); if (!temp.IsEmpty) list.Add(temp); } @@ -336,7 +337,7 @@ private ValueGetter>> MakeGetterVec(IRow input, int }; } - private void NormalizeSrc(ref ReadOnlyMemory src, ref ReadOnlyMemory dst, StringBuilder buffer) + private void NormalizeSrc(in ReadOnlyMemory src, ref ReadOnlyMemory dst, StringBuilder buffer) { Host.AssertValue(buffer); diff --git a/src/Microsoft.ML.Transforms/Text/WordEmbeddingsTransform.cs b/src/Microsoft.ML.Transforms/Text/WordEmbeddingsTransform.cs index c32c716e32..57d23d3b74 100644 --- a/src/Microsoft.ML.Transforms/Text/WordEmbeddingsTransform.cs +++ b/src/Microsoft.ML.Transforms/Text/WordEmbeddingsTransform.cs @@ -118,7 +118,7 @@ public void AddWordVector(IChannel ch, string word, float[] wordVector) } } - public bool GetWordVector(ref ReadOnlyMemory word, float[] wordVector) + public bool GetWordVector(in ReadOnlyMemory word, float[] wordVector) { NormStr str = _pool.Get(word); if (str != null) @@ -583,38 +583,37 @@ private 
ValueGetter> GetGetterVec(IRow input, int iinfo) { int deno = 0; srcGetter(ref src); - var values = dst.Values; - if (Utils.Size(values) != 3 * dimension) - values = new float[3 * dimension]; + var editor = VBufferEditor.Create(ref dst, 3 * dimension); int offset = 2 * dimension; for (int i = 0; i < dimension; i++) { - values[i] = float.MaxValue; - values[i + dimension] = 0; - values[i + offset] = float.MinValue; + editor.Values[i] = float.MaxValue; + editor.Values[i + dimension] = 0; + editor.Values[i + offset] = float.MinValue; } - for (int word = 0; word < src.Count; word++) + var srcValues = src.GetValues(); + for (int word = 0; word < srcValues.Length; word++) { - if (_parent._currentVocab.GetWordVector(ref src.Values[word], wordVector)) + if (_parent._currentVocab.GetWordVector(in srcValues[word], wordVector)) { deno++; for (int i = 0; i < dimension; i++) { float currentTerm = wordVector[i]; - if (values[i] > currentTerm) - values[i] = currentTerm; - values[dimension + i] += currentTerm; - if (values[offset + i] < currentTerm) - values[offset + i] = currentTerm; + if (editor.Values[i] > currentTerm) + editor.Values[i] = currentTerm; + editor.Values[dimension + i] += currentTerm; + if (editor.Values[offset + i] < currentTerm) + editor.Values[offset + i] = currentTerm; } } } if (deno != 0) for (int index = 0; index < dimension; index++) - values[index + dimension] /= deno; + editor.Values[index + dimension] /= deno; - dst = new VBuffer(values.Length, values, dst.Indices); + dst = editor.Commit(); }; } } diff --git a/src/Microsoft.ML.Transforms/Text/WordTokenizeTransform.cs b/src/Microsoft.ML.Transforms/Text/WordTokenizeTransform.cs index 05288d7135..af3979792b 100644 --- a/src/Microsoft.ML.Transforms/Text/WordTokenizeTransform.cs +++ b/src/Microsoft.ML.Transforms/Text/WordTokenizeTransform.cs @@ -316,18 +316,14 @@ private ValueGetter>> MakeGetterVec(IRow input, int getSrc(ref src); terms.Clear(); - for (int i = 0; i < src.Count; i++) - AddTerms(src.Values[i], separators, terms); - - var values = dst.Values; - if (terms.Count > 0) - { - if (Utils.Size(values) < terms.Count) - values = new ReadOnlyMemory[terms.Count]; - terms.CopyTo(values); - } - - dst = new VBuffer>(terms.Count, values, dst.Indices); + var srcValues = src.GetValues(); + for (int i = 0; i < srcValues.Length; i++) + AddTerms(srcValues[i], separators, terms); + + var editor = VBufferEditor.Create(ref dst, terms.Count); + for (int i = 0; i < terms.Count; i++) + editor.Values[i] = terms[i]; + dst = editor.Commit(); }; } diff --git a/src/Microsoft.ML.Transforms/UngroupTransform.cs b/src/Microsoft.ML.Transforms/UngroupTransform.cs index c34100da37..e8182215e8 100644 --- a/src/Microsoft.ML.Transforms/UngroupTransform.cs +++ b/src/Microsoft.ML.Transforms/UngroupTransform.cs @@ -631,18 +631,20 @@ private ValueGetter MakeGetter(int col, PrimitiveType itemType) cachedIndex = 0; } + var rowValues = row.GetValues(); if (_pivotColPosition >= row.Length) value = naValue; else if (row.IsDense) - value = row.Values[_pivotColPosition]; + value = rowValues[_pivotColPosition]; else { // The row is sparse. 
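// Illustrative sketch (not part of this patch) of the sparse-slot lookup idiom the ungroup getter
// in this hunk relies on: scan GetIndices() for the logical slot and fall back to the default
// value when the slot is implicit. GetSlotValue is a hypothetical helper; 0 <= slot < row.Length
// is assumed.
private static T GetSlotValue<T>(in VBuffer<T> row, int slot)
{
    var values = row.GetValues();
    if (row.IsDense)
        return values[slot];
    var indices = row.GetIndices();
    for (int i = 0; i < indices.Length; i++)
    {
        if (indices[i] == slot)
            return values[i];
        if (indices[i] > slot)
            break;  // indices are sorted ascending, so the slot is implicit
    }
    return default(T);
}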
- while (cachedIndex < row.Count && _pivotColPosition > row.Indices[cachedIndex]) + var rowIndices = row.GetIndices(); + while (cachedIndex < rowIndices.Length && _pivotColPosition > rowIndices[cachedIndex]) cachedIndex++; - if (cachedIndex < row.Count && _pivotColPosition == row.Indices[cachedIndex]) - value = row.Values[cachedIndex]; + if (cachedIndex < rowIndices.Length && _pivotColPosition == rowIndices[cachedIndex]) + value = rowValues[cachedIndex]; else value = default(T); } diff --git a/src/Microsoft.ML.Transforms/VectorWhitening.cs b/src/Microsoft.ML.Transforms/VectorWhitening.cs index b60098d572..1bf5cbc861 100644 --- a/src/Microsoft.ML.Transforms/VectorWhitening.cs +++ b/src/Microsoft.ML.Transforms/VectorWhitening.cs @@ -613,10 +613,19 @@ public enum SvdJob : byte MinOvr = (byte)'O', } + public static unsafe void Gemv(Layout layout, Transpose trans, int m, int n, float alpha, + float[] a, int lda, ReadOnlySpan x, int incx, float beta, Span y, int incy) + { + fixed (float* pA = a) + fixed (float* pX = x) + fixed (float* pY = y) + Gemv(layout, trans, m, n, alpha, pA, lda, pX, incx, beta, pY, incy); + } + // See: https://software.intel.com/en-us/node/520750 [DllImport(DllName, EntryPoint = "cblas_sgemv")] - public static extern void Gemv(Layout layout, Transpose trans, int m, int n, float alpha, - float[] a, int lda, float[] x, int incx, float beta, float[] y, int incy); + private static unsafe extern void Gemv(Layout layout, Transpose trans, int m, int n, float alpha, + float* a, int lda, float* x, int incx, float beta, float* y, int incy); // See: https://software.intel.com/en-us/node/520775 [DllImport(DllName, EntryPoint = "cblas_sgemm")] @@ -715,36 +724,34 @@ private ValueGetter GetSrcGetter(IRow input, int iinfo) private static void FillValues(float[] model, ref VBuffer src, ref VBuffer dst, int cdst) { - int count = src.Count; + var values = src.GetValues(); + int count = values.Length; int length = src.Length; - var values = src.Values; - var indices = src.Indices; - Contracts.Assert(Utils.Size(values) >= count); // Since the whitening process produces dense vector, always use dense representation of dst. - var a = Utils.Size(dst.Values) >= cdst ? dst.Values : new float[cdst]; + var editor = VBufferEditor.Create(ref dst, cdst); if (src.IsDense) { Mkl.Gemv(Mkl.Layout.RowMajor, Mkl.Transpose.NoTrans, cdst, length, - 1, model, length, values, 1, 0, a, 1); + 1, model, length, values, 1, 0, editor.Values, 1); } else { - Contracts.Assert(Utils.Size(indices) >= count); + var indices = src.GetIndices(); int offs = 0; for (int i = 0; i < cdst; i++) { // Returns a dot product of dense vector 'model' starting from offset 'offs' and sparse vector 'values' // with first 'count' valid elements and their corresponding 'indices'. 
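// Illustrative sketch (not part of this patch) of the span-pinning wrapper pattern used for Gemv
// above: expose a safe Span-based overload and pin the spans with "fixed" before calling the raw
// pointer P/Invoke. The cblas_sscal import is shown only to illustrate the shape and is an
// assumption; the patch itself only adds the Gemv wrapper.
[DllImport(DllName, EntryPoint = "cblas_sscal")]
private static unsafe extern void Scal(int n, float alpha, float* x, int incx);

public static unsafe void Scal(int n, float alpha, Span<float> x, int incx)
{
    fixed (float* pX = x)
        Scal(n, alpha, pX, incx);
}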
- a[i] = CpuMathUtils.DotProductSparse(model.AsSpan(offs), values, indices, count); + editor.Values[i] = CpuMathUtils.DotProductSparse(model.AsSpan(offs), values, indices, count); offs += length; } } - dst = new VBuffer(cdst, a, dst.Indices); + dst = editor.Commit(); } - private static float DotProduct(float[] a, int aOffset, float[] b, int[] indices, int count) + private static float DotProduct(float[] a, int aOffset, ReadOnlySpan b, ReadOnlySpan indices, int count) { Contracts.Assert(count <= indices.Length); return CpuMathUtils.DotProductSparse(a.AsSpan(aOffset), b, indices, count); diff --git a/test/Microsoft.ML.Predictor.Tests/TestTransposer.cs b/test/Microsoft.ML.Predictor.Tests/TestTransposer.cs index 9517e82a55..a9f969a7f1 100644 --- a/test/Microsoft.ML.Predictor.Tests/TestTransposer.cs +++ b/test/Microsoft.ML.Predictor.Tests/TestTransposer.cs @@ -143,7 +143,7 @@ private static T[] GenerateHelper(int rowCount, Double density, Random rgen, return values; } - [Fact(Skip = "Need CoreTLC specific baseline update")] + [Fact] [TestCategory("Transposer")] public void TransposerTest() {