Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

GH-35344: [Go][Format] Implementation of the LIST_VIEW and LARGE_LIST_VIEW array formats #37468

Merged
merged 38 commits into from
Oct 10, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
38 commits
Select commit Hold shift + click to select a range
1c8f181
datatype.go: Add LIST_VIEW and LARGE_LIST_VIEW
felipecrv Aug 23, 2023
c575b61
list.go: Add ListView and LargeListView array structs
felipecrv Aug 22, 2023
a86f3e6
datatype_nested.go: Add ListViewType
felipecrv Aug 25, 2023
b3591a3
datatype_nested.go: Add LargeListViewType
felipecrv Aug 25, 2023
45386b4
datatype_nested.go: Introduce VarLenListLikeType
felipecrv Sep 13, 2023
178067d
builder.go: Add ListViewBuilder and LargeListViewBuilder
felipecrv Aug 28, 2023
71a4a94
list.go: Reconcile differences between list and list-view builders
felipecrv Aug 29, 2023
7c58a72
compare.go: Add list-view and large-list-view cases
felipecrv Aug 29, 2023
69d657e
list_test.go: Expand test to include list-views as well
felipecrv Aug 25, 2023
b2f25c5
arrjson.go: Implement string conversion for list-views
felipecrv Aug 29, 2023
9b53720
list.go: Fix style changes
felipecrv Aug 31, 2023
69fe50a
map.go: Fix style issues
felipecrv Aug 31, 2023
22ce90d
list.go: Add debug.Assert checks to list-like setData()
felipecrv Sep 5, 2023
a651320
list.go: Use IsNull() instead of !isValid() in String()
felipecrv Sep 15, 2023
2058b5c
list.go: Remove bogus assert from baseListViewBuilder::newData()
felipecrv Sep 19, 2023
d337d61
list_test.go: Add tests of list-views with out-of-order offsets
felipecrv Sep 5, 2023
7f25473
list_test.go: Unify all string roundtrip tests
felipecrv Sep 6, 2023
2387981
list_test.go: Add a string roundtrip test for list-views with out-of-…
felipecrv Sep 6, 2023
300f792
list.go: Add validation for ListView and LargeListView
felipecrv Sep 13, 2023
eac4b32
random_array_gen.go: Add ability to generate random ListView arrays
felipecrv Sep 12, 2023
4c62c20
list.go: Add rangeOfValuesUsed() to support concatenation of list-views
felipecrv Sep 6, 2023
eac653e
concat.go: Implement concatenation of list-views
felipecrv Sep 12, 2023
5b92d58
random_array_gen.go: Extend the code to generate large list-views
felipecrv Sep 20, 2023
1e3ae75
concat.go: Enable concatenation tests of large list-views
felipecrv Sep 20, 2023
9daeb81
arrdata.go: Add list-views to arrdata.json
felipecrv Sep 20, 2023
9824669
arrow/ipc/write.go: Remove unnecessary error from getZeroBasedValueOf…
felipecrv Sep 21, 2023
6852a2e
arrow/ipc: Add IPC support for list-views
felipecrv Sep 21, 2023
650f359
arrjson.go: The fixes sent by Matt Topol
felipecrv Sep 27, 2023
af2813a
random_array_gen.go: Simplify random generation and follow new spec r…
felipecrv Oct 9, 2023
9897360
list.go: Don't skip nulls when validating (according to spec changes)
felipecrv Oct 9, 2023
9dbc99a
concat.go: Preserve invariants when concatenating
felipecrv Oct 9, 2023
bc47993
fixup! concat.go: Preserve invariants when concatenating
felipecrv Oct 10, 2023
72c563a
concat.go: Extract a generic function to zero the sizes
felipecrv Oct 10, 2023
4aaa8bf
concat.go: Use getListViewOffsets in more places
felipecrv Oct 10, 2023
f0fa0d7
fixup! concat.go: Preserve invariants when concatenating
felipecrv Oct 10, 2023
83bbd6a
concat.go: Rename getListView{Offsets->BufferValues}
felipecrv Oct 10, 2023
a5ccd73
datatype.go: Add STRING_VIEW/BINARY_VIEW and re-generate type_string.go
felipecrv Oct 10, 2023
b83beff
fixup! list.go: Add validation for ListView and LargeListView
felipecrv Oct 10, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions go/arrow/array/array.go
Original file line number Diff line number Diff line change
Expand Up @@ -176,6 +176,8 @@ func init() {
arrow.LARGE_LIST: func(data arrow.ArrayData) arrow.Array { return NewLargeListData(data) },
arrow.INTERVAL_MONTH_DAY_NANO: func(data arrow.ArrayData) arrow.Array { return NewMonthDayNanoIntervalData(data) },
arrow.RUN_END_ENCODED: func(data arrow.ArrayData) arrow.Array { return NewRunEndEncodedData(data) },
arrow.LIST_VIEW: func(data arrow.ArrayData) arrow.Array { return NewListViewData(data) },
arrow.LARGE_LIST_VIEW: func(data arrow.ArrayData) arrow.Array { return NewLargeListViewData(data) },

// invalid data types to fill out array to size 2^6 - 1
63: invalidDataType,
Expand Down
6 changes: 6 additions & 0 deletions go/arrow/array/builder.go
Original file line number Diff line number Diff line change
Expand Up @@ -342,6 +342,12 @@ func NewBuilder(mem memory.Allocator, dtype arrow.DataType) Builder {
case arrow.MAP:
typ := dtype.(*arrow.MapType)
return NewMapBuilderWithType(mem, typ)
case arrow.LIST_VIEW:
typ := dtype.(*arrow.ListViewType)
return NewListViewBuilderWithField(mem, typ.ElemField())
case arrow.LARGE_LIST_VIEW:
typ := dtype.(*arrow.LargeListViewType)
return NewLargeListViewBuilderWithField(mem, typ.ElemField())
case arrow.EXTENSION:
typ := dtype.(arrow.ExtensionType)
bldr := NewExtensionBuilder(mem, typ)
Expand Down
50 changes: 50 additions & 0 deletions go/arrow/array/compare.go
Original file line number Diff line number Diff line change
Expand Up @@ -292,6 +292,12 @@ func Equal(left, right arrow.Array) bool {
case *LargeList:
r := right.(*LargeList)
return arrayEqualLargeList(l, r)
case *ListView:
r := right.(*ListView)
return arrayEqualListView(l, r)
case *LargeListView:
r := right.(*LargeListView)
return arrayEqualLargeListView(l, r)
case *FixedSizeList:
r := right.(*FixedSizeList)
return arrayEqualFixedSizeList(l, r)
Expand Down Expand Up @@ -536,6 +542,12 @@ func arrayApproxEqual(left, right arrow.Array, opt equalOption) bool {
case *LargeList:
r := right.(*LargeList)
return arrayApproxEqualLargeList(l, r, opt)
case *ListView:
r := right.(*ListView)
return arrayApproxEqualListView(l, r, opt)
case *LargeListView:
r := right.(*LargeListView)
return arrayApproxEqualLargeListView(l, r, opt)
case *FixedSizeList:
r := right.(*FixedSizeList)
return arrayApproxEqualFixedSizeList(l, r, opt)
Expand Down Expand Up @@ -682,6 +694,44 @@ func arrayApproxEqualLargeList(left, right *LargeList, opt equalOption) bool {
return true
}

func arrayApproxEqualListView(left, right *ListView, opt equalOption) bool {
for i := 0; i < left.Len(); i++ {
if left.IsNull(i) {
continue
}
o := func() bool {
l := left.newListValue(i)
defer l.Release()
r := right.newListValue(i)
defer r.Release()
return arrayApproxEqual(l, r, opt)
}()
if !o {
return false
}
}
return true
}

func arrayApproxEqualLargeListView(left, right *LargeListView, opt equalOption) bool {
for i := 0; i < left.Len(); i++ {
if left.IsNull(i) {
continue
}
o := func() bool {
l := left.newListValue(i)
defer l.Release()
r := right.newListValue(i)
defer r.Release()
return arrayApproxEqual(l, r, opt)
}()
if !o {
return false
}
}
return true
}

func arrayApproxEqualFixedSizeList(left, right *FixedSizeList, opt equalOption) bool {
for i := 0; i < left.Len(); i++ {
if left.IsNull(i) {
Expand Down
171 changes: 171 additions & 0 deletions go/arrow/array/concat.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ import (
"fmt"
"math"
"math/bits"
"unsafe"

"github.com/apache/arrow/go/v14/arrow"
"github.com/apache/arrow/go/v14/arrow/bitutil"
Expand Down Expand Up @@ -355,6 +356,164 @@ func concatOffsets(buffers []*memory.Buffer, byteWidth int, mem memory.Allocator
}
}

func sumArraySizes(data []arrow.ArrayData) int {
outSize := 0
for _, arr := range data {
outSize += arr.Len()
}
return outSize
}

func getListViewBufferValues[T int32 | int64](data arrow.ArrayData, i int) []T {
bytes := data.Buffers()[i].Bytes()
base := (*T)(unsafe.Pointer(&bytes[0]))
ret := unsafe.Slice(base, data.Offset()+data.Len())
return ret[data.Offset():]
}

func putListViewOffsets32(in arrow.ArrayData, displacement int32, out *memory.Buffer, outOff int) {
debug.Assert(in.DataType().ID() == arrow.LIST_VIEW, "putListViewOffsets32: expected LIST_VIEW data")
inOff, inLen := in.Offset(), in.Len()
if inLen == 0 {
return
}
bitmap := in.Buffers()[0]
srcOffsets := getListViewBufferValues[int32](in, 1)
srcSizes := getListViewBufferValues[int32](in, 2)
isValidAndNonEmpty := func(i int) bool {
return (bitmap == nil || bitutil.BitIsSet(bitmap.Bytes(), inOff+i)) && srcSizes[i] > 0
}

dstOffsets := arrow.Int32Traits.CastFromBytes(out.Bytes())
for i, offset := range srcOffsets {
if isValidAndNonEmpty(i) {
// This is guaranteed by RangeOfValuesUsed returning the smallest offset
// of valid and non-empty list-views.
debug.Assert(offset+displacement >= 0, "putListViewOffsets32: offset underflow while concatenating arrays")
dstOffsets[outOff+i] = offset + displacement
} else {
dstOffsets[outOff+i] = 0
}
}
}

func putListViewOffsets64(in arrow.ArrayData, displacement int64, out *memory.Buffer, outOff int) {
debug.Assert(in.DataType().ID() == arrow.LARGE_LIST_VIEW, "putListViewOffsets64: expected LARGE_LIST_VIEW data")
inOff, inLen := in.Offset(), in.Len()
if inLen == 0 {
return
}
bitmap := in.Buffers()[0]
srcOffsets := getListViewBufferValues[int64](in, 1)
srcSizes := getListViewBufferValues[int64](in, 2)
isValidAndNonEmpty := func(i int) bool {
return (bitmap == nil || bitutil.BitIsSet(bitmap.Bytes(), inOff+i)) && srcSizes[i] > 0
}

dstOffsets := arrow.Int64Traits.CastFromBytes(out.Bytes())
for i, offset := range srcOffsets {
if isValidAndNonEmpty(i) {
// This is guaranteed by RangeOfValuesUsed returning the smallest offset
// of valid and non-empty list-views.
debug.Assert(offset+displacement >= 0, "putListViewOffsets64: offset underflow while concatenating arrays")
dstOffsets[outOff+i] = offset + displacement
} else {
dstOffsets[outOff+i] = 0
}
}
}

// Concatenate buffers holding list-view offsets into a single buffer of offsets
//
// valueRanges contains the relevant ranges of values in the child array actually
// referenced to by the views. Most commonly, these ranges will start from 0,
// but when that is not the case, we need to adjust the displacement of offsets.
// The concatenated child array does not contain values from the beginning
// if they are not referenced to by any view.
func concatListViewOffsets(data []arrow.ArrayData, byteWidth int, valueRanges []rng, mem memory.Allocator) (*memory.Buffer, error) {
outSize := sumArraySizes(data)
if byteWidth == 4 && outSize > math.MaxInt32 {
return nil, fmt.Errorf("%w: offset overflow while concatenating arrays", arrow.ErrInvalid)
}
out := memory.NewResizableBuffer(mem)
out.Resize(byteWidth * outSize)

numChildValues, elementsLength := 0, 0
for i, arr := range data {
displacement := numChildValues - valueRanges[i].offset
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

could this be an issue if the first array has an offset?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

tt's subtle, but the answer is no.

Because this offset (returned by rangeOfValuesUsed) is the smallest offset covered by view ranges, so if it's non-zero and displacement becomes negative it will never lead to a negative offset when offsets are transformed by the negative displacement.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please make sure that there's at least one test case which hits this edge case, just for my own sanity 😛

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The concatenation tests exercise this a lot since they concatenate slices of list-views which always start from offsets > 0. I added asserts now which should make the invariant more trustworthy.

if byteWidth == 4 {
putListViewOffsets32(arr, int32(displacement), out, elementsLength)
} else {
putListViewOffsets64(arr, int64(displacement), out, elementsLength)
}
elementsLength += arr.Len()
numChildValues += valueRanges[i].len
}
debug.Assert(elementsLength == outSize, "implementation error")

return out, nil
}

func zeroNullListViewSizes[T int32 | int64](data arrow.ArrayData) {
if data.Len() == 0 || data.Buffers()[0] == nil {
return
}
validity := data.Buffers()[0].Bytes()
sizes := getListViewBufferValues[T](data, 2)

for i := 0; i < data.Len(); i++ {
if !bitutil.BitIsSet(validity, data.Offset()+i) {
sizes[i] = 0
}
}
}

func concatListView(data []arrow.ArrayData, offsetType arrow.FixedWidthDataType, out *Data, mem memory.Allocator) (err error) {
// Calculate the ranges of values that each list-view array uses
valueRanges := make([]rng, len(data))
for i, input := range data {
offset, len := rangeOfValuesUsed(input)
valueRanges[i].offset = offset
valueRanges[i].len = len
}

// Gather the children ranges of each input array
childData := gatherChildrenRanges(data, 0, valueRanges)
for _, c := range childData {
defer c.Release()
}

// Concatenate the values
values, err := concat(childData, mem)
if err != nil {
return err
}

// Concatenate the offsets
offsetBuffer, err := concatListViewOffsets(data, offsetType.Bytes(), valueRanges, mem)
if err != nil {
return err
}

// Concatenate the sizes
sizeBuffers := gatherBuffersFixedWidthType(data, 2, offsetType)
sizeBuffer := concatBuffers(sizeBuffers, mem)

out.childData = []arrow.ArrayData{values}
out.buffers[1] = offsetBuffer
out.buffers[2] = sizeBuffer

// To make sure the sizes don't reference values that are not in the new
// concatenated values array, we zero the sizes of null list-view values.
if offsetType.ID() == arrow.INT32 {
zeroNullListViewSizes[int32](out)
} else {
zeroNullListViewSizes[int64](out)
}

return nil
}

// concat is the implementation for actually performing the concatenation of the arrow.ArrayData
// objects that we can call internally for nested types.
func concat(data []arrow.ArrayData, mem memory.Allocator) (arr arrow.ArrayData, err error) {
Expand Down Expand Up @@ -483,6 +642,18 @@ func concat(data []arrow.ArrayData, mem memory.Allocator) (arr arrow.ArrayData,
if err != nil {
return nil, err
}
case *arrow.ListViewType:
offsetType := arrow.PrimitiveTypes.Int32.(arrow.FixedWidthDataType)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

you shouldn't have to explicitly cast to the interface type here, since arrow.PrimitiveTypes.Int32 implements the FixedWidthDataType interface you should be able to just pass it as is.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

		offsetType := arrow.PrimitiveTypes.Int32
		err := concatListView(data, offsetType, out, mem)

I get an error when I do this. Maybe the cast is not even valid (?) in the first place?

cannot use offsetType (variable of type arrow.DataType) as arrow.FixedWidthDataType value in argument to concatListView: arrow.DataType does not implement arrow.FixedWidthDataType (missing method BitWidth)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah! darn me only declaring it as a DataType instead of actually as a FixedWidthDataType despite it implementing it. sigh Essentially you could do &arrow.Int32Type{} but that would be creating a new instance rather than using the existing one.

I'd consider using the more specific interface type in the declared vars but I don't want to potentially break anyone who's already doing this type assertion....

err := concatListView(data, offsetType, out, mem)
if err != nil {
return nil, err
}
case *arrow.LargeListViewType:
offsetType := arrow.PrimitiveTypes.Int64.(arrow.FixedWidthDataType)
err := concatListView(data, offsetType, out, mem)
if err != nil {
return nil, err
}
case *arrow.FixedSizeListType:
childData := gatherChildrenMultiplier(data, 0, int(dt.Len()))
for _, c := range childData {
Expand Down
21 changes: 21 additions & 0 deletions go/arrow/array/concat_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,8 @@ func TestConcatenate(t *testing.T) {
{arrow.BinaryTypes.LargeString},
{arrow.ListOf(arrow.PrimitiveTypes.Int8)},
{arrow.LargeListOf(arrow.PrimitiveTypes.Int8)},
{arrow.ListViewOf(arrow.PrimitiveTypes.Int8)},
{arrow.LargeListViewOf(arrow.PrimitiveTypes.Int8)},
{arrow.FixedSizeListOf(3, arrow.PrimitiveTypes.Int8)},
{arrow.StructOf()},
{arrow.MapOf(arrow.PrimitiveTypes.Uint16, arrow.PrimitiveTypes.Int8)},
Expand Down Expand Up @@ -200,6 +202,16 @@ func (cts *ConcatTestSuite) generateArr(size int64, nullprob float64) arrow.Arra
}
}
return bldr.NewArray()
case arrow.LIST_VIEW:
arr := cts.rng.ListView(cts.dt.(arrow.VarLenListLikeType), size, 0, 20, nullprob)
err := arr.ValidateFull()
cts.NoError(err)
return arr
case arrow.LARGE_LIST_VIEW:
arr := cts.rng.LargeListView(cts.dt.(arrow.VarLenListLikeType), size, 0, 20, nullprob)
err := arr.ValidateFull()
cts.NoError(err)
return arr
case arrow.FIXED_SIZE_LIST:
const listsize = 3
valuesSize := size * listsize
Expand Down Expand Up @@ -317,11 +329,20 @@ func (cts *ConcatTestSuite) TestCheckConcat() {

slices := cts.slices(arr, offsets)
for _, s := range slices {
if s.DataType().ID() == arrow.LIST_VIEW {
err := s.(*array.ListView).ValidateFull()
cts.NoError(err)
}
defer s.Release()
}

actual, err := array.Concatenate(slices, cts.mem)
cts.NoError(err)
if arr.DataType().ID() == arrow.LIST_VIEW {
lv := actual.(*array.ListView)
err := lv.ValidateFull()
cts.NoError(err)
}
defer actual.Release()

cts.Truef(array.Equal(expected, actual), "expected: %s\ngot: %s\n", expected, actual)
Expand Down
Loading