-
Notifications
You must be signed in to change notification settings - Fork 3.6k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
GH-35344: [Go][Format] Implementation of the LIST_VIEW and LARGE_LIST_VIEW array formats #37468
Changes from all commits
1c8f181
c575b61
a86f3e6
b3591a3
45386b4
178067d
71a4a94
7c58a72
69d657e
b2f25c5
9b53720
69fe50a
22ce90d
a651320
2058b5c
d337d61
7f25473
2387981
300f792
eac4b32
4c62c20
eac653e
5b92d58
1e3ae75
9daeb81
9824669
6852a2e
650f359
af2813a
9897360
9dbc99a
bc47993
72c563a
4aaa8bf
f0fa0d7
83bbd6a
a5ccd73
b83beff
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -21,6 +21,7 @@ import ( | |
"fmt" | ||
"math" | ||
"math/bits" | ||
"unsafe" | ||
|
||
"github.com/apache/arrow/go/v14/arrow" | ||
"github.com/apache/arrow/go/v14/arrow/bitutil" | ||
|
@@ -355,6 +356,164 @@ func concatOffsets(buffers []*memory.Buffer, byteWidth int, mem memory.Allocator | |
} | ||
} | ||
|
||
func sumArraySizes(data []arrow.ArrayData) int { | ||
outSize := 0 | ||
for _, arr := range data { | ||
outSize += arr.Len() | ||
} | ||
return outSize | ||
} | ||
|
||
func getListViewBufferValues[T int32 | int64](data arrow.ArrayData, i int) []T { | ||
bytes := data.Buffers()[i].Bytes() | ||
base := (*T)(unsafe.Pointer(&bytes[0])) | ||
ret := unsafe.Slice(base, data.Offset()+data.Len()) | ||
return ret[data.Offset():] | ||
} | ||
|
||
func putListViewOffsets32(in arrow.ArrayData, displacement int32, out *memory.Buffer, outOff int) { | ||
debug.Assert(in.DataType().ID() == arrow.LIST_VIEW, "putListViewOffsets32: expected LIST_VIEW data") | ||
inOff, inLen := in.Offset(), in.Len() | ||
if inLen == 0 { | ||
return | ||
} | ||
bitmap := in.Buffers()[0] | ||
srcOffsets := getListViewBufferValues[int32](in, 1) | ||
srcSizes := getListViewBufferValues[int32](in, 2) | ||
isValidAndNonEmpty := func(i int) bool { | ||
return (bitmap == nil || bitutil.BitIsSet(bitmap.Bytes(), inOff+i)) && srcSizes[i] > 0 | ||
} | ||
|
||
dstOffsets := arrow.Int32Traits.CastFromBytes(out.Bytes()) | ||
for i, offset := range srcOffsets { | ||
if isValidAndNonEmpty(i) { | ||
// This is guaranteed by RangeOfValuesUsed returning the smallest offset | ||
// of valid and non-empty list-views. | ||
debug.Assert(offset+displacement >= 0, "putListViewOffsets32: offset underflow while concatenating arrays") | ||
dstOffsets[outOff+i] = offset + displacement | ||
} else { | ||
dstOffsets[outOff+i] = 0 | ||
} | ||
} | ||
} | ||
|
||
func putListViewOffsets64(in arrow.ArrayData, displacement int64, out *memory.Buffer, outOff int) { | ||
debug.Assert(in.DataType().ID() == arrow.LARGE_LIST_VIEW, "putListViewOffsets64: expected LARGE_LIST_VIEW data") | ||
inOff, inLen := in.Offset(), in.Len() | ||
if inLen == 0 { | ||
return | ||
} | ||
bitmap := in.Buffers()[0] | ||
srcOffsets := getListViewBufferValues[int64](in, 1) | ||
srcSizes := getListViewBufferValues[int64](in, 2) | ||
isValidAndNonEmpty := func(i int) bool { | ||
return (bitmap == nil || bitutil.BitIsSet(bitmap.Bytes(), inOff+i)) && srcSizes[i] > 0 | ||
} | ||
|
||
dstOffsets := arrow.Int64Traits.CastFromBytes(out.Bytes()) | ||
for i, offset := range srcOffsets { | ||
if isValidAndNonEmpty(i) { | ||
// This is guaranteed by RangeOfValuesUsed returning the smallest offset | ||
// of valid and non-empty list-views. | ||
debug.Assert(offset+displacement >= 0, "putListViewOffsets64: offset underflow while concatenating arrays") | ||
dstOffsets[outOff+i] = offset + displacement | ||
} else { | ||
dstOffsets[outOff+i] = 0 | ||
} | ||
} | ||
} | ||
|
||
// Concatenate buffers holding list-view offsets into a single buffer of offsets | ||
// | ||
// valueRanges contains the relevant ranges of values in the child array actually | ||
// referenced to by the views. Most commonly, these ranges will start from 0, | ||
// but when that is not the case, we need to adjust the displacement of offsets. | ||
// The concatenated child array does not contain values from the beginning | ||
// if they are not referenced to by any view. | ||
func concatListViewOffsets(data []arrow.ArrayData, byteWidth int, valueRanges []rng, mem memory.Allocator) (*memory.Buffer, error) { | ||
outSize := sumArraySizes(data) | ||
if byteWidth == 4 && outSize > math.MaxInt32 { | ||
return nil, fmt.Errorf("%w: offset overflow while concatenating arrays", arrow.ErrInvalid) | ||
} | ||
out := memory.NewResizableBuffer(mem) | ||
out.Resize(byteWidth * outSize) | ||
|
||
numChildValues, elementsLength := 0, 0 | ||
for i, arr := range data { | ||
displacement := numChildValues - valueRanges[i].offset | ||
if byteWidth == 4 { | ||
putListViewOffsets32(arr, int32(displacement), out, elementsLength) | ||
} else { | ||
putListViewOffsets64(arr, int64(displacement), out, elementsLength) | ||
} | ||
elementsLength += arr.Len() | ||
numChildValues += valueRanges[i].len | ||
} | ||
debug.Assert(elementsLength == outSize, "implementation error") | ||
|
||
return out, nil | ||
} | ||
|
||
func zeroNullListViewSizes[T int32 | int64](data arrow.ArrayData) { | ||
if data.Len() == 0 || data.Buffers()[0] == nil { | ||
return | ||
} | ||
validity := data.Buffers()[0].Bytes() | ||
sizes := getListViewBufferValues[T](data, 2) | ||
|
||
for i := 0; i < data.Len(); i++ { | ||
if !bitutil.BitIsSet(validity, data.Offset()+i) { | ||
sizes[i] = 0 | ||
} | ||
} | ||
} | ||
|
||
func concatListView(data []arrow.ArrayData, offsetType arrow.FixedWidthDataType, out *Data, mem memory.Allocator) (err error) { | ||
// Calculate the ranges of values that each list-view array uses | ||
valueRanges := make([]rng, len(data)) | ||
for i, input := range data { | ||
offset, len := rangeOfValuesUsed(input) | ||
valueRanges[i].offset = offset | ||
valueRanges[i].len = len | ||
} | ||
|
||
// Gather the children ranges of each input array | ||
childData := gatherChildrenRanges(data, 0, valueRanges) | ||
for _, c := range childData { | ||
defer c.Release() | ||
} | ||
|
||
// Concatenate the values | ||
values, err := concat(childData, mem) | ||
if err != nil { | ||
return err | ||
} | ||
|
||
// Concatenate the offsets | ||
offsetBuffer, err := concatListViewOffsets(data, offsetType.Bytes(), valueRanges, mem) | ||
if err != nil { | ||
return err | ||
} | ||
|
||
// Concatenate the sizes | ||
sizeBuffers := gatherBuffersFixedWidthType(data, 2, offsetType) | ||
sizeBuffer := concatBuffers(sizeBuffers, mem) | ||
|
||
out.childData = []arrow.ArrayData{values} | ||
out.buffers[1] = offsetBuffer | ||
out.buffers[2] = sizeBuffer | ||
|
||
// To make sure the sizes don't reference values that are not in the new | ||
// concatenated values array, we zero the sizes of null list-view values. | ||
if offsetType.ID() == arrow.INT32 { | ||
zeroNullListViewSizes[int32](out) | ||
} else { | ||
zeroNullListViewSizes[int64](out) | ||
} | ||
|
||
return nil | ||
} | ||
|
||
// concat is the implementation for actually performing the concatenation of the arrow.ArrayData | ||
// objects that we can call internally for nested types. | ||
func concat(data []arrow.ArrayData, mem memory.Allocator) (arr arrow.ArrayData, err error) { | ||
|
@@ -483,6 +642,18 @@ func concat(data []arrow.ArrayData, mem memory.Allocator) (arr arrow.ArrayData, | |
if err != nil { | ||
return nil, err | ||
} | ||
case *arrow.ListViewType: | ||
offsetType := arrow.PrimitiveTypes.Int32.(arrow.FixedWidthDataType) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. you shouldn't have to explicitly cast to the interface type here, since There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. offsetType := arrow.PrimitiveTypes.Int32
err := concatListView(data, offsetType, out, mem) I get an error when I do this. Maybe the cast is not even valid (?) in the first place?
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ah! darn me only declaring it as a I'd consider using the more specific interface type in the declared vars but I don't want to potentially break anyone who's already doing this type assertion.... |
||
err := concatListView(data, offsetType, out, mem) | ||
if err != nil { | ||
return nil, err | ||
} | ||
case *arrow.LargeListViewType: | ||
offsetType := arrow.PrimitiveTypes.Int64.(arrow.FixedWidthDataType) | ||
err := concatListView(data, offsetType, out, mem) | ||
if err != nil { | ||
return nil, err | ||
} | ||
case *arrow.FixedSizeListType: | ||
childData := gatherChildrenMultiplier(data, 0, int(dt.Len())) | ||
for _, c := range childData { | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
could this be an issue if the first array has an offset?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It's subtle, but the answer is no.
Because this offset (returned by `rangeOfValuesUsed`) is the smallest offset covered by the view ranges, even if it is non-zero and the displacement becomes negative, adding that negative `displacement` to the offsets can never produce a negative offset.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Please make sure that there's at least one test case which hits this edge case, just for my own sanity 😛
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The concatenation tests exercise this a lot since they concatenate slices of list-views which always start from offsets > 0. I added asserts now which should make the invariant more trustworthy.