From 7df7664b2792f12585714935c16228d2552da062 Mon Sep 17 00:00:00 2001 From: Tom Finley Date: Mon, 1 Apr 2019 12:52:48 -0700 Subject: [PATCH] Update VBuffer documentation (#3136) * Improve VBuffer documentation. * Improve the XML documentation for VBuffer/VBufferEditor. * Update the "best practices" documentation to reflect recent changes. * Update parameter names to avoid abbreviations, e.g., src => source and suchlike. --- docs/code/VBufferCareFeeding.md | 331 ++++++++++----------- src/Microsoft.ML.DataView/VBuffer.cs | 267 ++++++++++++----- src/Microsoft.ML.DataView/VBufferEditor.cs | 35 ++- 3 files changed, 379 insertions(+), 254 deletions(-) diff --git a/docs/code/VBufferCareFeeding.md b/docs/code/VBufferCareFeeding.md index 91918c5e9b..03fc38494e 100644 --- a/docs/code/VBufferCareFeeding.md +++ b/docs/code/VBufferCareFeeding.md @@ -10,43 +10,34 @@ A `VBuffer` is a generic type that supports both dense and sparse vectors over items of type `T`. This is the representation type for all [`VectorType`](IDataViewTypeSystem.md#vector-representations) instances in the `IDataView` ecosystem. When an instance of this is passed to a row cursor -getter, the callee is free to take ownership of and re-use the arrays -(`Values` and `Indices`). +getter, the callee is free to take ownership of and re-use the buffers, which +internally are arrays, and accessed externally as `ReadOnlySpan`s through the +`GetValues` and `GetIndices` methods. -A `VBuffer` is a struct, and has the following `readonly` fields: +A `VBuffer` is a struct, and has the following most important members: * `int Length`: The logical length of the buffer. -* `int Count`: The number of items explicitly represented. This equals `Length` -when the representation is dense and is less than `Length` when sparse. +* `bool IsDense`: Whether the vector is dense (if `true`) or sparse (if + `false`). -* `T[] Values`: The values. Only the first `Count` of these are valid. +* `ReadOnlySpan GetValues()`: The values. If `IsDense` is `true`, then this + is of length exactly equal to `Length`. Otherwise, it will have length less + than `Length`. -* `int[] Indices`: The indices. For a dense representation, this array is not - used, and may be `null`. For a sparse representation it is parallel to - values and specifies the logical indices for the corresponding values. Only - the first `Count` of these are valid. +* `ReadOnlySpan GetIndices()`: For a dense representation, this span will + have length of `0`, but for a sparse representation, this will be of the + same length as the span returned from `GetValues()`, and be parallel to it. + All indices must be between `0` inclusive and `Length` exclusive, and all + values in this span should be in strictly increasing order. -`Values` must have length equal to at least `Count`. If the representation is -sparse, that is, `Count < Length`, then `Indices` must have length also -greater than or equal to `Count`. If `Count == 0`, then it is entirely legal -for `Values` or `Indices` to be `null`, and if dense then `Indices` can always -be `null`. - -On the subject of `Count == 0`, note that having no valid values in `Indices` -and `Values` merely means that no values are explicitly defined, and the -vector should be treated, logically, as being filled with `default(T)`. - -For sparse vectors, `Indices` must have length equal to at least `Count`, and -the first `Count` indices must be increasing, with all indices between `0` -inclusive and `Length` exclusive. - -Regarding the generic type parameter `T`, the only real assumption made about -this type is that assignment (that is, using `=`) is sufficient to create an -*independent* copy of that item. All representation types of the [primitive -types](IDataViewTypeSystem.md#standard-column-types) have this property (for example, -`DvText`, `DvInt4`, `Single`, `Double`, etc.), but for example, `VBuffer<>` -itself does not have this property. So, no `VBuffer` of `VBuffer`s for you. +Regarding the generic type parameter `T`, the basic assumption made about this +type is that assignment (that is, using `=`) is sufficient to create an +*independent* copy of that item. All representation types of the [primitive +types](IDataViewTypeSystem.md#standard-column-types) have this property (for +example, `ReadOnlyMemory`, `int`, `float`, `double`, etc.), but for +example, `VBuffer<>` itself does not have this property. So, no `VBuffer` of +`VBuffer`s is possible. ## Sparse Values as `default(T)` @@ -73,7 +64,7 @@ As a corollary to the above note about equivalence of sparse and dense representations, since they are equivalent it follows that any code consuming `VBuffer`s must work equally well with *both*. That is, there must never be a condition where data is read and assumed to be either sparse, or dense, since -implementers of `IDataView` and related interfaces are perfectly free to +implementors of `IDataView` and related interfaces are perfectly free to produce either. The only "exception" to this rule is a necessary acknowledgment of the reality @@ -83,57 +74,62 @@ not commutative, operations over sparse `VBuffer` or `VBuffer` vectors can sometimes result in modestly different results than the "same" operation over dense values. -## Why Buffer Reuse +There is also another point about sparsity, that is, about the increasing +indices. While it is a *requirement*, it is regrettably not one we can +practically enforce without a heavy performance cost. So, if you are creating +your own `VBuffer`s, be careful that you are creating indices that are in +strictly increasing order, and in the right range (non-negative and less than +`Length`). We cannot check, but we *do* rely on it! + +# Buffer Reuse The question is often asked by people new to this codebase: why bother with buffer reuse at all? Without going into too many details, we used to not and -suffered for it. We had a far simpler system where examples were yielded -through an -[`IEnumerable<>`](https://msdn.microsoft.com/en-us/library/9eekhta0.aspx), and -our vector type at the time had `Indices` and `Values` arrays as well, but -their sizes were there actual sizes, and being returned through an -`IEnumerable<>` there was no plausible way to "recycle" the buffers. - -Also: who "owned" a fetched example (the caller, or callee) was not clear. -Because it was not clear, code was inevitably written and checked in that made +suffered for it. Long ago we had a far simpler system where examples were +yielded through an [`IEnumerable<>`][1], and our vector type at the time had +`Indices` and `Values`, but exposed as arrays, and their sizes were the actual +"logical" sizes, and being returned through an `IEnumerable<>` there was no +plausible way to reuse buffers. + +Also: who "owned" a fetched example (the caller, or callee) was not clear; +code was at one time written under both assumptions, which was a mess. Because +it was not clear, code was inevitably written and checked in that made *either* assumption, which meant, ultimately, that everything that touched -these would try to duplicate everything by default, because doing anything -else would fail in some case. - -The reason why this becomes important is because [garbage -collection](https://msdn.microsoft.com/en-us/library/0xy59wtx.aspx) in the -.NET framework is not free. Creating and destroying these arrays *can* be -cheap, provided that they are sufficiently small, short lived, and only ever -exist in a single thread. But, violate any of these, there is a possibility -these arrays could be allocated on the large object heap, or promoted to gen-2 -collection. The results could be disastrous: in one particularly memorable -incident regarding neural net training, the move to `IDataView` and its -`VBuffer`s resulted in a more than tenfold decrease in runtime performance, -because under the old regime the garbage collection of the feature vectors was -just taking so much time. - -This is somewhat unfortunate: a joke-that's-not-really-a-joke on the team was -that we were writing C# as though it were C code. Be that as it may, buffer -reuse is essential to our performance, especially on larger problems. +these would try to duplicate everything, because doing anything else would +fail in *some* case. + +The reason why this becomes important is because [garbage collection][2] in +.NET is not free. Creating and destroying these arrays *can* be cheap, +provided that they are sufficiently small, short lived, and only ever exist in +a single thread. But, when that does not hold, there is a possibility these +arrays could be allocated on the large object heap, or promoted to gen-2 +collection. The results then are sometimes disastrous: in one particularly +memorable incident regarding neural net training, the move to `IDataView` and +its `VBuffer`s resulted in a more than tenfold decrease in runtime +performance, because under the old regime the garbage collection of the +feature vectors was just taking so much time. This design requirement of buffer reuse has deeper implications for the ecosystem merely than the type here. For example, it is one crucial reason why so many value accessors in the `IDataView` ecosystem fill in values passed in through a `ref` parameter, rather than, say, being a return value. +[1]: https://docs.microsoft.com/en-us/dotnet/api/system.collections.generic.ienumerable-1?view=netstandard-2.0 +[2]: https://docs.microsoft.com/en-us/dotnet/standard/garbage-collection/index + ## Buffer Re-use as a User -Let's imagine we have an `IDataView` in a variable `dataview`, and we just so +Let's imagine we have an `IDataView` in a variable `data`, and we just so happen to know that the column with index 5 has representation type `VBuffer`. (In real code, this would presumably we achieved through -more complicated involving an inspection of `dataview.Schema`, but we omit -such details here.) +more complicated involving an inspection of `data.Schema`, but we omit such +details here.) ```csharp -using (IRowCursor cursor = dataview.GetRowCursor(col => col == 5)) +using (DataViewRowCursor cursor = data.GetRowCursor(data.Schema[5])) { ValueGetter> getter = cursor.GetGetter>(5); - var value = default(VBuffer); + VBuffer value = default; while (cursor.MoveNext()) { getter(ref value); @@ -142,129 +138,130 @@ using (IRowCursor cursor = dataview.GetRowCursor(col => col == 5)) } ``` -In this example, we open a cursor (telling it to make only column 5 active), -then get the "getter" over this column. What enables buffer re-use for this is -that, as we go row by row over the data with the `while` loop, we pass in the -same `value` variable in to the `getter` delegate, again and again. Presumably -the first time, or several, memory is allocated. Initially `value = -default(VBuffer)`, that is, it has zero `Length` and `Count` and `null` -`Indices` and `Values`. Presumably at some point, probably the first call, -`value` is replaced with a `VBuffer` that has actual values allocated. -In subsequent calls, perhaps these are judged as insufficiently large, and new -arrays are allocated, but we would expect at some point the arrays would -become "large enough" to accommodate many values, so reallocations would -become increasingly rare. +To be explicit, a `ValueGetter` is defined this way: + +```csharp +public delegate void ValueGetter(ref TValue value); +``` + +Anyway, in this example, we open a cursor (telling it to make only column 5 +active), then get the "getter" over this column. What enables buffer re-use +for this is that, as we go row by row over the data with the `while` loop, we +pass in the same `value` variable in to the `getter` delegate, again and +again. Presumably the first time, or several, memory is allocated (though, +internally, and invisibly to us). Initially `VBuffer value = default`, +that is, it has zero `Length` and similarly empty spans. Presumably at some +point, probably the first call, `value` is replaced with a `VBuffer` +that has actual values, stored in freshly allocated buffers. In subsequent +calls, *perhaps* these are judged as insufficiently large, and new arrays are +internally allocated, but we would expect at some point the arrays would +become "large enough," and we would no longer have to allocate new buffers and +garbage collect old ones after that point. A common mistake made by first time users is to do something like move the `var value` declaration inside the `while` loop, thus dooming `getter` to have to allocate the arrays every single time, completely defeating the purpose of buffer reuse. -## Buffer Re-use as a Developer - -Nearly all methods in ML.NET that "return" a `VBuffer` do not really return -a `VBuffer` *at all*, but instead have a parameter `ref VBuffer dst`, -where they are expected to put the result. See the above example, with the -`getter`. A `ValueGetter` is defined: - -```csharp -public delegate void ValueGetter(ref TValue value); -``` - -Let's describe the typical practice of "returning" a `VBuffer` in, say, a -`ref` parameter named `dst`: if `dst.Indices` and `dst.Values` are -sufficiently large to contain the result, they are used, and the value is -calculated, or sometimes copied, into them. If either is insufficiently large, -then a new array is allocated in its place. After all the calculation happens, -a *new* `VBuffer` is constructed and assigned to `dst`. (And possibly, if they -were large enough, using the same `Indices` and `Values` arrays as were passed -in, albeit with different values.) - -`VBuffer`s can be either sparse or dense. However, even when returning a dense -`VBuffer`, you would not discard the `Indices` array of the passed in buffer, -assuming there was one. The `Indices` array was merely larger than necessary -to store *this* result: that you happened to not need it this call does not -justify throwing it away. We don't care about buffer re-use just for a single -call, after all! The dense constructor for the `VBuffer` accepts an `Indices` -array for precisely this reason! - -Also note: when you return a `VBuffer` in this fashion, the caller is assumed -to *own* it at that point. This means they can do whatever they like to it, -like pass the same variable into some other getter, or modify its values. +## Buffer Re-use as a Developer + +So we've seen what it looks like from the user's point of view, but some words +on what a (well implemented) `getter` delegate is doing is also worthwhile, to +understand what is going on here. + +`VBuffer` by itself may appear to be a strictly immutable structure, but +"buffer reuse" implies it is in some way mutable. This mutability is primarily +accomplished with the `VBufferEditor`. The `VBufferEditor` actually +strongly resembles a `VBuffer`, *except* instead of being `ReadOnlySpan` +for values and indices they are `Span`, so, it is mutable. + +First, one creates the `VBufferEditor` structure using one of the +`VBufferEditor.Create` methods. You pass in a `VBuffer` as a `ref` +parameter. (For this purpose, it is useful to remember that a +`default(VBuffer)` is itself a completely valid, though completely empty, +`VBuffer`.) The editor is, past that statement, considered to "own" the +internal structure of the `VBuffer` you passed in. (So, correct code should +not continue to use the input `VBuffer` structure from that point onwards.) + +Then, one puts new values into `Span` and, if in a sparse vector is the +desired result, puts new indices into `Span`, on that editor, in whatever +way is appropriate and necessary for the task. + +Last, one calls one of the `Commit` methods to get another `VBuffer`, with the +values and indices accessible through the `VBuffer` methods the same as +those that had been set in the editor structure. + +Similar to how creating a `VBufferEditor` out of a `VBuffer` renders the +passed in `VBuffer` invalid, likewise, getting the `VBuffer` out of the +editor out of the `VBufferEditor` through one of the `Commit` methods +renders the *editor* invalid. In both cases, "ownership" of the internal +buffers is passed along to the successor structure, rendering the original +structure invalid, in some sense. + +Internally, these buffers are backed by arrays that are reallocated *if +needed* by the editor upon its creation, but reused if they are large enough. + +## Ownership of `VBuffer` + +Nonetheless, this does not mean that bugs are impossible, because the concept +of buffer reuse does carry the implication that we must be quite clear about +who "owns" a `VBuffer` at any given time, for which we have developed this +convention: + +Unless clearly specified in documentation, a caller is assumed to always own a +`VBuffer` as returned by `ref`. This means they can do whatever they like to +it, like pass the same variable into some other getter, or modify its values. Indeed, this is quite common: normalizers in ML.NET get values from their source, then immediately scale the contents of `Values` appropriately. This would hardly be possible if the callee was considered to have some stake in that result. +The `ValueGetter` indicated above is the most important example of this +principle. By passing in an existing `VBuffer` into that delegate by +reference, they are giving the implementation control over it to use (or, +reallocate) as necessary to store the resulting value. But, once the delegate +returns with the value returned, the caller is now considered to own that +buffer. + There is a corollary on this point: because the caller owns any `VBuffer`, -then you shouldn't do anything that irrevocably destroys their usefulness to -the caller. For example, consider this method that takes a vector `src`, and -stores the scaled result in `dst`. +then code should not do anything that irrevocably destroys its usefulness to +the caller. For example, consider this method that takes a vector `source`, +and stores the scaled result in `destination`. ```csharp -VectorUtils.ScaleBy(ref VBuffer src, ref VBuffer dst, float c) +ScaleBy(in VBuffer source, ref VBuffer destination, float c) ``` -What this does is, copy the values from `src` to `dst`, while scaling each -value seen by `c`. +What this does is, copy the values from `source` to `destination` while +scaling the values we are copying by the factor `c`. One possible alternate (wrong) implementation of this would be to just say -`dst=src` then scale all contents of `dst.Values` by `c`. But, then `dst` and -`src` would share references to their internal arrays, completely compromising -the caller's ability to do anything useful with them: if the caller were to -pass `dst` into some other method that modified it, this could easily -(silently!) modify the contents of `src`. The point is: if you are writing -code *anywhere* whose end result is that two distinct `VBuffer` structs share -references to their internal arrays, you've almost certainly introduced a -**nasty** pernicious bug for your users. - -## Utilities for Working with `VBuffer`s - -ML.NET's runtime code has a number of utilities for operating over `VBuffer`s -that we have written to be generally useful. We will not treat on these in -detail here, but: - -* `Microsoft.ML.Data.VBuffer` itself contains a few methods for - accessing and iterating over its values. - -* `Microsoft.ML.Internal.Utilities.VBufferUtils` contains utilities +`destination=source` then edit `destination` and scale each value by `c`. But, +then `destination` and `source` would share references to their internal +arrays, completely compromising the caller's ability to do anything useful +with `source`: if the caller were to pass `destination` into some other method +that modified it, this could easily (silently!) modify the contents of +`source`. The point is: if you are writing code *anywhere* whose end result is +that two distinct `VBuffer`s share references to their internal arrays, you've +almost certainly introduced a **nasty** pernicious bug. + +# Internal Utilities for Working with `VBuffer`s + +In addition to the public utilities around `VBuffer` and `VBufferEditor` +already mentioned, ML.NET's internal infrastructure and implementation code +has a number of utilities for operating over `VBuffer`s that we have written +to be generally useful. We will not treat on these in detail here since they +are internal and not part of the public API, but: + +* `VBufferUtils` contains utilities mainly for non-numeric manipulation of `VBuffer`s. -* `Microsoft.ML.Numeric.VectorUtils` contains math operations - over `VBuffer` and `float[]`, like computing norms, dot-products, and - whatnot. +* `VectorUtils` contains math operations over `VBuffer` and `float[]`, + like computing norms, dot-products, and whatnot. -* `Microsoft.ML.Data.BufferBuilder` is an abstract class whose - concrete implementations are used throughout ML.NET to build up `VBuffer` +* `BufferBuilder` is a class for iteratively building up `VBuffer` instances. Note that if one *can* simply build a `VBuffer` oneself easily and do not need the niceties provided by the buffer builder, you should - probably just do it yourself. - -* `Microsoft.MachineLearning.Internal.Utilities.EnsureSize` is often useful to -ensure that the arrays are of the right size. - -## Golden Rules - -Here are some golden rules to remember: - -Remember the conditions under which `Indices` and `Values` can be `null`! A -developer forgetting that `null` values for these fields are legal is probably -the most common error in our code. (And unfortunately one that sometimes takes -a while to pop up: most users don't feed in empty inputs to our trainers.) - -In terms of accessing anything in `Values` or `Indices`, remember, treat -`Count` as the real length of these arrays, not the actual length of the -arrays. - -If you write code that results in two distinct `VBuffer`s sharing references -to their internal arrays, (for example, there are two `VBuffer`s `a` and `b`, with -`a.Indices == b.Indices` with `a.Indices != null`, or `a.Values == b.Values` -with `a.Values != null`) then you've almost certainly done something wrong. - -Structure your code so that `VBuffer`s have their buffers re-used as much as -possible. If you have code called repeatedly where you are passing in some -`default(VBuffer)`, there's almost certainly an opportunity there. - -When re-using a `VBuffer` that's been passed to you, remember that even when -constructing a dense vector, you should still re-use the `Indices` array that -was passed in. \ No newline at end of file + probably just do it yourself, since there is a great deal of additional + facilities here for combining multiple results at the same indices that many + applications will not need. diff --git a/src/Microsoft.ML.DataView/VBuffer.cs b/src/Microsoft.ML.DataView/VBuffer.cs index 64ff504715..6c2a94e991 100644 --- a/src/Microsoft.ML.DataView/VBuffer.cs +++ b/src/Microsoft.ML.DataView/VBuffer.cs @@ -11,47 +11,75 @@ namespace Microsoft.ML.Data { /// - /// A buffer that supports both dense and sparse representations. This is the - /// representation type for all VectorType instances. When an instance of this - /// is passed to a row cursor getter, the callee is free to take ownership of - /// and re-use the arrays (Values and Indices). + /// A buffer that supports both dense and sparse representations. This is the representation type for all + /// instances. The explicitly defined values of this vector are exposed through + /// and, if not dense, . /// + /// + /// This structure is by itself immutable, but to enable buffer editing including re-use of the internal buffers, + /// a mutable variant can be accessed through . + /// + /// Throughout the code, we make the assumption that a sparse is logically equivalent to + /// a dense with the default value for filling in the default values. + /// + /// The type of the vector. There are no compile-time restrictions on what this could be, but + /// this code and practically all code that uses makes the assumption that an assignment of + /// a value is sufficient to make a completely independent copy of it. So, for example, this means that a buffer of + /// buffers is not possible. But, things like , , and , are totally fine. public readonly struct VBuffer { + /// + /// The internal re-usable array of values. + /// private readonly T[] _values; + + /// + /// The internal re-usable array of indices. + /// private readonly int[] _indices; /// - /// The number of items explicitly represented. This is == Length when the representation - /// is dense and < Length when sparse. + /// The number of items explicitly represented. This equals when the representation + /// is dense and less than when sparse. /// private readonly int _count; /// /// The logical length of the buffer. /// + /// + /// Note that if this vector , then this will be the same as the + /// as returned from , since all values are explicitly represented in a dense representation. If + /// this is a sparse representation, then that will be somewhat shorter, as this + /// field contains the number of both explicit and implicit entries. + /// public readonly int Length; /// - /// The explicitly represented values. + /// The explicitly represented values. When this , the + /// of the returned value will equal , and otherwise will have length less than + /// . /// public ReadOnlySpan GetValues() => _values.AsSpan(0, _count); /// - /// The indices. For a dense representation, this array is not used. For a sparse representation - /// it is parallel to values and specifies the logical indices for the corresponding values. + /// The indices. For a dense representation, this array is not used, and will return the default "empty" span. + /// For a sparse representation it is parallel to that returned from and specifies the + /// logical indices for the corresponding values, in increasing order, between 0 inclusive and + /// exclusive, corresponding to all explicitly defined values. All values at unspecified + /// indices should be treated as being implicitly defined with the default value of . /// /// - /// For example, if GetIndices() returns [3, 5] and GetValues() produces [98, 76], this VBuffer - /// stands for a vector with: - /// - non-zeros values 98 and 76 respectively at the 4th and 6th coordinates - /// - zeros at all other coordinates + /// To give one example, if returns [3, 5] and () produces [98, 76], + /// this stands for a vector with non-zero values 98 and 76 respectively at the 4th and 6th + /// coordinates, and zeros at all other indices. (Zero, because that is the default value for all .NET numeric + /// types.) /// public ReadOnlySpan GetIndices() => IsDense ? default : _indices.AsSpan(0, _count); /// - /// Gets a value indicating whether every logical element is explicitly - /// represented in the buffer. + /// Gets a value indicating whether every logical element is explicitly represented in the buffer. /// public bool IsDense { @@ -63,8 +91,26 @@ public bool IsDense } /// - /// Construct a dense representation with unused Indices array. + /// Construct a dense representation. The array is often unspecified, but if + /// specified it should be considered a buffer to be held on to, to be possibly used. /// + /// The logical length of the resulting instance. + /// + /// The values to be used. This must be at least as long as . If + /// is 0, it is legal for this to be . The constructed buffer + /// takes ownership of this array. + /// + /// + /// The internal indices buffer. Because this constructor is for dense representations + /// this will not be immediately useful, but it does provide a buffer to be potentially reused to avoid + /// allocation. This is mostly non-null in situations where you want to produce a dense + /// , but you happen to have an indices array "left over" and you don't want to + /// needlessly lose. + /// + /// + /// The resulting structure takes ownership of the passed in arrays, so they should not be used for + /// other purposes in the future. + /// public VBuffer(int length, T[] values, int[] indices = null) { Contracts.CheckParam(length >= 0, nameof(length)); @@ -77,8 +123,21 @@ public VBuffer(int length, T[] values, int[] indices = null) } /// - /// Construct a possibly sparse representation. + /// Construct a possibly sparse vector representation. /// + /// The length of the constructed buffer. + /// The count of explicit entries. This must be between 0 and , both + /// inclusive. If it equals the result is a dense vector, and if less this will be a + /// sparse vector. + /// + /// The values to be used. This must be at least as long as . If + /// is 0, it is legal for this to be . + /// + /// The indices to be used. If we are constructing a dense representation, or + /// is 0, this can be . Otherwise, this must be at least as long + /// as . + /// The resulting structure takes ownership of the passed in arrays, so they should not be used for + /// other purposes in the future. public VBuffer(int length, int count, T[] values, int[] indices) { Contracts.CheckParam(length >= 0, nameof(length)); @@ -86,7 +145,7 @@ public VBuffer(int length, int count, T[] values, int[] indices) Contracts.CheckParam(ArrayUtils.Size(values) >= count, nameof(values)); Contracts.CheckParam(count == length || ArrayUtils.Size(indices) >= count, nameof(indices)); -#if DEBUG // REVIEW: This validation should really use "Checks" and be in release code, but it is not cheap. +#if DEBUG // REVIEW: This validation should really use "Checks" and be in release code, but it is not cheap, so for practical reasons we must forego it. if (0 < count && count < length) { int cur = indices[0]; @@ -111,32 +170,36 @@ public VBuffer(int length, int count, T[] values, int[] indices) /// /// Copy from this buffer to the given destination, forcing a dense representation. /// - public void CopyToDense(ref VBuffer dst) + /// The destination buffer. After the copy, this will have + /// of . + public void CopyToDense(ref VBuffer destination) { // create a dense editor - var editor = VBufferEditor.Create(ref dst, Length); + var editor = VBufferEditor.Create(ref destination, Length); if (!IsDense) CopyTo(editor.Values); else if (Length > 0) _values.AsSpan(0, Length).CopyTo(editor.Values); - dst = editor.Commit(); + destination = editor.Commit(); } /// /// Copy from this buffer to the given destination. /// - public void CopyTo(ref VBuffer dst) + /// The destination buffer. After the copy, this will have + /// of . + public void CopyTo(ref VBuffer destination) { - var editor = VBufferEditor.Create(ref dst, Length, _count); + var editor = VBufferEditor.Create(ref destination, Length, _count); if (IsDense) { if (Length > 0) { _values.AsSpan(0, Length).CopyTo(editor.Values); } - dst = editor.Commit(); - Debug.Assert(dst.IsDense); + destination = editor.Commit(); + Debug.Assert(destination.IsDense); } else { @@ -145,80 +208,95 @@ public void CopyTo(ref VBuffer dst) _values.AsSpan(0, _count).CopyTo(editor.Values); _indices.AsSpan(0, _count).CopyTo(editor.Indices); } - dst = editor.Commit(); + destination = editor.Commit(); } } /// /// Copy a range of values from this buffer to the given destination. /// - public void CopyTo(ref VBuffer dst, int srcMin, int length) + /// The destination buffer. After the copy, this will have + /// of . + /// The minimum inclusive index to start copying from this vector. + /// The logical number of values to copy from this vector into . + public void CopyTo(ref VBuffer destination, int sourceIndex, int length) { - Contracts.Check(0 <= srcMin && srcMin <= Length, "srcMin"); - Contracts.Check(0 <= length && srcMin <= Length - length, "length"); + Contracts.CheckParam(0 <= sourceIndex && sourceIndex <= Length, nameof(sourceIndex)); + Contracts.CheckParam(0 <= length && sourceIndex <= Length - length, nameof(length)); if (IsDense) { - var editor = VBufferEditor.Create(ref dst, length, length); + var editor = VBufferEditor.Create(ref destination, length, length); if (length > 0) { - _values.AsSpan(srcMin, length).CopyTo(editor.Values); + _values.AsSpan(sourceIndex, length).CopyTo(editor.Values); } - dst = editor.Commit(); - Debug.Assert(dst.IsDense); + destination = editor.Commit(); + Debug.Assert(destination.IsDense); } else { int copyCount = 0; if (_count > 0) { - int copyMin = ArrayUtils.FindIndexSorted(_indices, 0, _count, srcMin); - int copyLim = ArrayUtils.FindIndexSorted(_indices, copyMin, _count, srcMin + length); + int copyMin = ArrayUtils.FindIndexSorted(_indices, 0, _count, sourceIndex); + int copyLim = ArrayUtils.FindIndexSorted(_indices, copyMin, _count, sourceIndex + length); Debug.Assert(copyMin <= copyLim); copyCount = copyLim - copyMin; - var editor = VBufferEditor.Create(ref dst, length, copyCount); + var editor = VBufferEditor.Create(ref destination, length, copyCount); if (copyCount > 0) { _values.AsSpan(copyMin, copyCount).CopyTo(editor.Values); if (copyCount < length) { for (int i = 0; i < copyCount; ++i) - editor.Indices[i] = _indices[i + copyMin] - srcMin; + editor.Indices[i] = _indices[i + copyMin] - sourceIndex; } } - dst = editor.Commit(); + destination = editor.Commit(); } else { - var editor = VBufferEditor.Create(ref dst, length, copyCount); - dst = editor.Commit(); + var editor = VBufferEditor.Create(ref destination, length, copyCount); + destination = editor.Commit(); } } } /// - /// Copy from this buffer to the given destination array. This "densifies". + /// Copy from this buffer to the given destination span. This "densifies." /// - public void CopyTo(Span dst) + /// The destination buffer. This must have least . + public void CopyTo(Span destination) { - CopyTo(dst, 0); + CopyTo(destination, 0); } - public void CopyTo(Span dst, int ivDst, T defaultValue = default(T)) + /// + /// Copy from this buffer to the given destination span, starting at the specified index. This "densifies." + /// + /// The destination buffer. This must be at least + /// plus . + /// The starting index of at which to start copying. + /// The value to fill in for the implicit sparse entries. This is a potential exception to + /// general expectation of sparse that the implicit sparse entries have the default value + /// of . + public void CopyTo(Span destination, int destinationIndex, T defaultValue = default(T)) { - Contracts.CheckParam(0 <= ivDst && ivDst <= dst.Length - Length, nameof(dst), "dst is not large enough"); + Contracts.CheckParam(0 <= destinationIndex && destinationIndex <= destination.Length - Length, + nameof(destination), "Not large enough to hold these values."); if (Length == 0) return; if (IsDense) { - _values.AsSpan(0, Length).CopyTo(dst.Slice(ivDst)); + _values.AsSpan(0, Length).CopyTo(destination.Slice(destinationIndex)); return; } if (_count == 0) { - dst.Slice(ivDst, Length).Clear(); + destination.Slice(destinationIndex, Length).Clear(); return; } @@ -228,61 +306,97 @@ public void CopyTo(Span dst) int slot = _indices[islot]; Debug.Assert(slot >= iv); while (iv < slot) - dst[ivDst + iv++] = defaultValue; + destination[destinationIndex + iv++] = defaultValue; Debug.Assert(iv == slot); - dst[ivDst + iv++] = _values[islot]; + destination[destinationIndex + iv++] = _values[islot]; } while (iv < Length) - dst[ivDst + iv++] = defaultValue; + destination[destinationIndex + iv++] = defaultValue; } /// /// Copy from a section of a source array to the given destination. /// - public static void Copy(T[] src, int srcIndex, ref VBuffer dst, int length) + public static void Copy(T[] source, int sourceIndex, ref VBuffer destination, int length) { - Contracts.CheckParam(0 <= length && length <= ArrayUtils.Size(src), nameof(length)); - Contracts.CheckParam(0 <= srcIndex && srcIndex <= ArrayUtils.Size(src) - length, nameof(srcIndex)); - var editor = VBufferEditor.Create(ref dst, length, length); + Contracts.CheckParam(0 <= length && length <= ArrayUtils.Size(source), nameof(length)); + Contracts.CheckParam(0 <= sourceIndex && sourceIndex <= ArrayUtils.Size(source) - length, nameof(sourceIndex)); + var editor = VBufferEditor.Create(ref destination, length, length); if (length > 0) { - src.AsSpan(srcIndex, length).CopyTo(editor.Values); + source.AsSpan(sourceIndex, length).CopyTo(editor.Values); } - dst = editor.Commit(); + destination = editor.Commit(); } + /// + /// Returns the joint list of all index/value pairs. + /// + /// + /// If all pairs, even those implicit values of a sparse representation, + /// will be returned, with the implicit values having the default value, as is appropriate. If left + /// then only explicitly defined values are returned. + /// + /// The index/value pairs. public IEnumerable> Items(bool all = false) { return Items(_values, _indices, Length, _count, all); } + /// + /// Returns an enumerable with items, representing the values. + /// public IEnumerable DenseValues() { return DenseValues(_values, _indices, Length, _count); } - public void GetItemOrDefault(int slot, ref T dst) + /// + /// Gets the item stored in this structure. In the case of a dense vector this is a simple lookup. + /// In the case of a sparse vector, it will try to find the entry with that index, and set + /// to that stored value, or if no such value was found, assign it the default value. + /// + /// + /// In the case where is , this will take constant time since it an + /// directly lookup. For sparse vectors, however, because it must perform a bisection search on the indices to + /// find the appropriate value, that takes logarithmic time with respect to the number of explicitly represented + /// items, which is to say, the of the return value of . + /// + /// For that reason, a single completely isolated lookup, since constructing as + /// does is not a free operation, it may be more efficient to use this method. However + /// if one is doing a more involved computation involving many operations, it may be faster to utiltize + /// and, if appropriate, directly. + /// + /// The index, which must be a non-negative number less than . + /// The value stored at that index, or if this is a sparse vector where this is an implicit + /// entry, the default value for . + public void GetItemOrDefault(int index, ref T destination) { - Contracts.CheckParam(0 <= slot && slot < Length, nameof(slot)); + Contracts.CheckParam(0 <= index && index < Length, nameof(index)); - int index; if (IsDense) - dst = _values[slot]; - else if (_count > 0 && ArrayUtils.TryFindIndexSorted(_indices, 0, _count, slot, out index)) - dst = _values[index]; + destination = _values[index]; + else if (_count > 0 && ArrayUtils.TryFindIndexSorted(_indices, 0, _count, index, out int bufferIndex)) + destination = _values[bufferIndex]; else - dst = default(T); + destination = default(T); } - public T GetItemOrDefault(int slot) + /// + /// A variant of that returns the value instead of passing it + /// back using a reference parameter. + /// + /// The index, which must be a non-negative number less than . + /// The value stored at that index, or if this is a sparse vector where this is an implicit + /// entry, the default value for . + public T GetItemOrDefault(int index) { - Contracts.CheckParam(0 <= slot && slot < Length, nameof(slot)); + Contracts.CheckParam(0 <= index && index < Length, nameof(index)); - int index; if (IsDense) - return _values[slot]; - if (_count > 0 && ArrayUtils.TryFindIndexSorted(_indices, 0, _count, slot, out index)) return _values[index]; + if (_count > 0 && ArrayUtils.TryFindIndexSorted(_indices, 0, _count, index, out int bufferIndex)) + return _values[bufferIndex]; return default(T); } @@ -301,19 +415,22 @@ internal VBufferEditor GetEditor( bool keepOldOnResize = false, bool requireIndicesOnDense = false) { - Contracts.CheckParam(newLogicalLength >= 0, nameof(newLogicalLength)); - Contracts.CheckParam(valuesCount == null || valuesCount.Value <= newLogicalLength, nameof(valuesCount)); + Contracts.CheckParam(newLogicalLength >= 0, nameof(newLogicalLength), "Must be non-negative."); + Contracts.CheckParam(valuesCount == null || valuesCount.Value >= 0, nameof(valuesCount), + "If specified, must be non-negative."); + Contracts.CheckParam(valuesCount == null || valuesCount.Value <= newLogicalLength, nameof(valuesCount), + "If specified, must be no greater than " + nameof(newLogicalLength)); - valuesCount = valuesCount ?? newLogicalLength; + int logicalValuesCount = valuesCount ?? newLogicalLength; int maxCapacity = maxValuesCapacity ?? ArrayUtils.ArrayMaxSize; T[] values = _values; bool createdNewValues; - ArrayUtils.EnsureSize(ref values, valuesCount.Value, maxCapacity, keepOldOnResize, out createdNewValues); + ArrayUtils.EnsureSize(ref values, logicalValuesCount, maxCapacity, keepOldOnResize, out createdNewValues); int[] indices = _indices; - bool isDense = newLogicalLength == valuesCount.Value; + bool isDense = newLogicalLength == logicalValuesCount; bool createdNewIndices; if (isDense && !requireIndicesOnDense) { @@ -321,12 +438,12 @@ internal VBufferEditor GetEditor( } else { - ArrayUtils.EnsureSize(ref indices, valuesCount.Value, maxCapacity, keepOldOnResize, out createdNewIndices); + ArrayUtils.EnsureSize(ref indices, logicalValuesCount, maxCapacity, keepOldOnResize, out createdNewIndices); } return new VBufferEditor( newLogicalLength, - valuesCount.Value, + logicalValuesCount, values, indices, requireIndicesOnDense, diff --git a/src/Microsoft.ML.DataView/VBufferEditor.cs b/src/Microsoft.ML.DataView/VBufferEditor.cs index 4359d2b950..3c221af54a 100644 --- a/src/Microsoft.ML.DataView/VBufferEditor.cs +++ b/src/Microsoft.ML.DataView/VBufferEditor.cs @@ -16,6 +16,9 @@ public static class VBufferEditor /// Creates a with the same shape /// (length and density) as the . /// + /// The destination buffer. Note that the resulting is assumed to take ownership + /// of this passed in object, and so whatever was passed in as this parameter should not be used again, since its + /// underlying buffers are being potentially reused. public static VBufferEditor CreateFromBuffer( ref VBuffer destination) { @@ -27,7 +30,9 @@ public static VBufferEditor CreateFromBuffer( /// 's values and indices buffers. /// /// - /// The destination buffer. + /// The destination buffer. Note that the resulting is assumed to take ownership + /// of this passed in object, and so whatever was passed in as this parameter should not be used again, since its + /// underlying buffers are being potentially reused. /// /// /// The logical length of the new buffer being edited. @@ -68,6 +73,12 @@ public static VBufferEditor Create( /// An object capable of editing a by filling out /// (and if the buffer is not dense). /// + /// + /// The structure by itself is immutable. However, the purpose of + /// is to enable buffer re-use we can edit them through this structure, as created through + /// or + /// . + /// public readonly ref struct VBufferEditor { private readonly int _logicalLength; @@ -85,12 +96,12 @@ public readonly ref struct VBufferEditor public readonly Span Indices; /// - /// Gets a value indicating whether a new Values array was allocated. + /// Gets a value indicating whether a new array was allocated. /// public bool CreatedNewValues { get; } /// - /// Gets a value indicating whether a new Indices array was allocated. + /// Gets a value indicating whether a new array was allocated. /// public bool CreatedNewIndices { get; } @@ -116,12 +127,10 @@ internal VBufferEditor(int logicalLength, } /// - /// Commits the edits and creates a new using - /// the current Values and Indices. + /// Commits the edits and creates a new using the current and . + /// Note that this structure and its properties should not be used once this is called. /// - /// - /// The newly created . - /// + /// The newly created . public VBuffer Commit() { return new VBuffer(_logicalLength, Values.Length, _values, _indices); @@ -130,7 +139,8 @@ public VBuffer Commit() /// /// Commits the edits and creates a new using /// the current Values and Indices, while allowing to truncate the length - /// of Values and Indices. + /// of and, if sparse, . + /// Like , this structure and its properties should not be used once this is called. /// /// /// The new number of physical values to be represented in the created buffer. @@ -139,15 +149,16 @@ public VBuffer Commit() /// The newly created . /// /// - /// CommitTruncated allows to modify the length of the explicitly - /// defined values. + /// This method allows to modify the length of the explicitly defined values. /// This is useful in sparse situations where the /// was created with a larger physical value count than was needed /// because the final value count was not known at creation time. /// public VBuffer CommitTruncated(int physicalValuesCount) { - Contracts.CheckParam(physicalValuesCount <= Values.Length, nameof(physicalValuesCount), "Updating physicalValuesCount during CommitTruncated cannot be greater than the original physicalValuesCount value used in Create."); + Contracts.CheckParam(physicalValuesCount <= Values.Length, nameof(physicalValuesCount), + "Updating " + nameof(physicalValuesCount) + " during " + nameof(CommitTruncated) + + " cannot be greater than the original physicalValuesCount value used in " + nameof(VBufferEditor.Create) + "."); return new VBuffer(_logicalLength, physicalValuesCount, _values, _indices); }