From 1c8f1810a5b11e212c2c2e20c640c9f0c5265521 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Wed, 23 Aug 2023 18:38:05 +0200 Subject: [PATCH 01/38] datatype.go: Add LIST_VIEW and LARGE_LIST_VIEW --- go/arrow/datatype.go | 8 +++++++- go/arrow/type_string.go | 6 ++++-- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/go/arrow/datatype.go b/go/arrow/datatype.go index d784c2bfe0767..2588bf430fcb2 100644 --- a/go/arrow/datatype.go +++ b/go/arrow/datatype.go @@ -152,6 +152,12 @@ const ( RUN_END_ENCODED + // LIST_VIEW is a list of some logical data type represented with offsets and sizes + LIST_VIEW + + // like LIST but with 64-bit offsets + LARGE_LIST_VIEW + // Alias to ensure we do not break any consumers DECIMAL = DECIMAL128 ) @@ -384,7 +390,7 @@ func IsListLike(t Type) bool { // IsNested returns true for List, LargeList, FixedSizeList, Map, Struct, and Unions func IsNested(t Type) bool { switch t { - case LIST, LARGE_LIST, FIXED_SIZE_LIST, MAP, STRUCT, SPARSE_UNION, DENSE_UNION: + case LIST, LARGE_LIST, FIXED_SIZE_LIST, MAP, LIST_VIEW, LARGE_LIST_VIEW, STRUCT, SPARSE_UNION, DENSE_UNION: return true } return false diff --git a/go/arrow/type_string.go b/go/arrow/type_string.go index 41a407386357a..a79ea7919908a 100644 --- a/go/arrow/type_string.go +++ b/go/arrow/type_string.go @@ -47,11 +47,13 @@ func _() { _ = x[LARGE_LIST-36] _ = x[INTERVAL_MONTH_DAY_NANO-37] _ = x[RUN_END_ENCODED-38] + _ = x[LIST_VIEW-39] + _ = x[LARGE_LIST_VIEW-40] } -const _Type_name = "NULLBOOLUINT8INT8UINT16INT16UINT32INT32UINT64INT64FLOAT16FLOAT32FLOAT64STRINGBINARYFIXED_SIZE_BINARYDATE32DATE64TIMESTAMPTIME32TIME64INTERVAL_MONTHSINTERVAL_DAY_TIMEDECIMAL128DECIMAL256LISTSTRUCTSPARSE_UNIONDENSE_UNIONDICTIONARYMAPEXTENSIONFIXED_SIZE_LISTDURATIONLARGE_STRINGLARGE_BINARYLARGE_LISTINTERVAL_MONTH_DAY_NANORUN_END_ENCODED" +const _Type_name = 
"NULLBOOLUINT8INT8UINT16INT16UINT32INT32UINT64INT64FLOAT16FLOAT32FLOAT64STRINGBINARYFIXED_SIZE_BINARYDATE32DATE64TIMESTAMPTIME32TIME64INTERVAL_MONTHSINTERVAL_DAY_TIMEDECIMAL128DECIMAL256LISTSTRUCTSPARSE_UNIONDENSE_UNIONDICTIONARYMAPEXTENSIONFIXED_SIZE_LISTDURATIONLARGE_STRINGLARGE_BINARYLARGE_LISTINTERVAL_MONTH_DAY_NANORUN_END_ENCODEDLIST_VIEWLARGE_LIST_VIEW" -var _Type_index = [...]uint16{0, 4, 8, 13, 17, 23, 28, 34, 39, 45, 50, 57, 64, 71, 77, 83, 100, 106, 112, 121, 127, 133, 148, 165, 175, 185, 189, 195, 207, 218, 228, 231, 240, 255, 263, 275, 287, 297, 320, 335} +var _Type_index = [...]uint16{0, 4, 8, 13, 17, 23, 28, 34, 39, 45, 50, 57, 64, 71, 77, 83, 100, 106, 112, 121, 127, 133, 148, 165, 175, 185, 189, 195, 207, 218, 228, 231, 240, 255, 263, 275, 287, 297, 320, 335, 344, 359} func (i Type) String() string { if i < 0 || i >= Type(len(_Type_index)-1) { From c575b61746bbbac101e1bfadcdb54a7d96b708f3 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Wed, 23 Aug 2023 00:38:15 +0200 Subject: [PATCH 02/38] list.go: Add ListView and LargeListView array structs --- go/arrow/array/array.go | 2 + go/arrow/array/list.go | 299 +++++++++++++++++++++++++++++++++++++++- 2 files changed, 299 insertions(+), 2 deletions(-) diff --git a/go/arrow/array/array.go b/go/arrow/array/array.go index 418f67034583d..1ee04c7aa2bcc 100644 --- a/go/arrow/array/array.go +++ b/go/arrow/array/array.go @@ -176,6 +176,8 @@ func init() { arrow.LARGE_LIST: func(data arrow.ArrayData) arrow.Array { return NewLargeListData(data) }, arrow.INTERVAL_MONTH_DAY_NANO: func(data arrow.ArrayData) arrow.Array { return NewMonthDayNanoIntervalData(data) }, arrow.RUN_END_ENCODED: func(data arrow.ArrayData) arrow.Array { return NewRunEndEncodedData(data) }, + arrow.LIST_VIEW: func(data arrow.ArrayData) arrow.Array { return NewListViewData(data) }, + arrow.LARGE_LIST_VIEW: func(data arrow.ArrayData) arrow.Array { return NewLargeListViewData(data) }, // invalid data types to fill out array to size 2^6 
- 1 63: invalidDataType, diff --git a/go/arrow/array/list.go b/go/arrow/array/list.go index 36035dd2f01a8..9dee53782cca5 100644 --- a/go/arrow/array/list.go +++ b/go/arrow/array/list.go @@ -618,16 +618,311 @@ func (b *baseListBuilder) UnmarshalJSON(data []byte) error { return b.Unmarshal(dec) } +// ListView represents an immutable sequence of array values defined by an +// offset into a child array and a length. +type ListView struct { + array + values arrow.Array + offsets []int32 + sizes []int32 +} + +var _ ListLike = (*ListView)(nil) + +func NewListViewData(data arrow.ArrayData) *ListView { + a := &ListView{} + a.refCount = 1 + a.setData(data.(*Data)) + return a +} + +func (a *ListView) ListValues() arrow.Array { return a.values } + +func (a *ListView) ValueStr(i int) string { + if !a.IsValid(i) { + return NullValueStr + } + return string(a.GetOneForMarshal(i).(json.RawMessage)) +} + +func (a *ListView) String() string { + o := new(strings.Builder) + o.WriteString("[") + for i := 0; i < a.Len(); i++ { + if i > 0 { + o.WriteString(" ") + } + if !a.IsValid(i) { + o.WriteString(NullValueStr) + continue + } + sub := a.newListValue(i) + fmt.Fprintf(o, "%v", sub) + sub.Release() + } + o.WriteString("]") + return o.String() +} + +func (a *ListView) newListValue(i int) arrow.Array { + beg, end := a.ValueOffsets(i) + return NewSlice(a.values, beg, end) +} + +func (a *ListView) setData(data *Data) { + a.array.setData(data) + offsets := data.buffers[1] + if offsets != nil { + a.offsets = arrow.Int32Traits.CastFromBytes(offsets.Bytes()) + } + sizes := data.buffers[2] + if sizes != nil { + a.sizes = arrow.Int32Traits.CastFromBytes(sizes.Bytes()) + } + a.values = MakeFromData(data.childData[0]) +} + +func (a *ListView) GetOneForMarshal(i int) interface{} { + if a.IsNull(i) { + return nil + } + + slice := a.newListValue(i) + defer slice.Release() + v, err := json.Marshal(slice) + if err != nil { + panic(err) + } + return json.RawMessage(v) +} + +func (a *ListView) 
MarshalJSON() ([]byte, error) { + var buf bytes.Buffer + enc := json.NewEncoder(&buf) + + buf.WriteByte('[') + for i := 0; i < a.Len(); i++ { + if i != 0 { + buf.WriteByte(',') + } + if err := enc.Encode(a.GetOneForMarshal(i)); err != nil { + return nil, err + } + } + buf.WriteByte(']') + return buf.Bytes(), nil +} + +func arrayEqualListView(left, right *ListView) bool { + for i := 0; i < left.Len(); i++ { + if left.IsNull(i) { + continue + } + o := func() bool { + l := left.newListValue(i) + defer l.Release() + r := right.newListValue(i) + defer r.Release() + return Equal(l, r) + }() + if !o { + return false + } + } + return true +} + +// Len returns the number of elements in the array. +func (a *ListView) Len() int { return a.array.Len() } + +func (a *ListView) Offsets() []int32 { return a.offsets } + +func (a *ListView) Sizes() []int32 { return a.sizes } + +func (a *ListView) Retain() { + a.array.Retain() + a.values.Retain() +} + +func (a *ListView) Release() { + a.array.Release() + a.values.Release() +} + +func (a *ListView) ValueOffsets(i int) (start, end int64) { + debug.Assert(i >= 0 && i < a.array.data.length, "index out of range") + j := i + a.array.data.offset + size := int64(a.sizes[j]) + // If size is 0, skip accessing offsets. + if size == 0 { + start, end = 0, 0 + return + } + start = int64(a.offsets[j]) + end = start + size + return +} + +// LargeListView represents an immutable sequence of array values defined by an +// offset into a child array and a length. +type LargeListView struct { + array + values arrow.Array + offsets []int64 + sizes []int64 +} + +var _ ListLike = (*LargeListView)(nil) + +// NewLargeListViewData returns a new LargeListView array value, from data. 
+func NewLargeListViewData(data arrow.ArrayData) *LargeListView { + a := new(LargeListView) + a.refCount = 1 + a.setData(data.(*Data)) + return a +} + +func (a *LargeListView) ListValues() arrow.Array { return a.values } + +func (a *LargeListView) ValueStr(i int) string { + if !a.IsValid(i) { + return NullValueStr + } + return string(a.GetOneForMarshal(i).(json.RawMessage)) +} + +func (a *LargeListView) String() string { + o := new(strings.Builder) + o.WriteString("[") + for i := 0; i < a.Len(); i++ { + if i > 0 { + o.WriteString(" ") + } + if !a.IsValid(i) { + o.WriteString(NullValueStr) + continue + } + sub := a.newListValue(i) + fmt.Fprintf(o, "%v", sub) + sub.Release() + } + o.WriteString("]") + return o.String() +} + +func (a *LargeListView) newListValue(i int) arrow.Array { + beg, end := a.ValueOffsets(i) + return NewSlice(a.values, beg, end) +} + +func (a *LargeListView) setData(data *Data) { + a.array.setData(data) + offsets := data.buffers[1] + if offsets != nil { + a.offsets = arrow.Int64Traits.CastFromBytes(offsets.Bytes()) + } + sizes := data.buffers[2] + if sizes != nil { + a.sizes = arrow.Int64Traits.CastFromBytes(sizes.Bytes()) + } + a.values = MakeFromData(data.childData[0]) +} + +func (a *LargeListView) GetOneForMarshal(i int) interface{} { + if a.IsNull(i) { + return nil + } + + slice := a.newListValue(i) + defer slice.Release() + v, err := json.Marshal(slice) + if err != nil { + panic(err) + } + return json.RawMessage(v) +} + +func (a *LargeListView) MarshalJSON() ([]byte, error) { + var buf bytes.Buffer + enc := json.NewEncoder(&buf) + + buf.WriteByte('[') + for i := 0; i < a.Len(); i++ { + if i != 0 { + buf.WriteByte(',') + } + if err := enc.Encode(a.GetOneForMarshal(i)); err != nil { + return nil, err + } + } + buf.WriteByte(']') + return buf.Bytes(), nil +} + +func arrayEqualLargeListView(left, right *LargeListView) bool { + for i := 0; i < left.Len(); i++ { + if left.IsNull(i) { + continue + } + o := func() bool { + l := left.newListValue(i) 
+ defer l.Release() + r := right.newListValue(i) + defer r.Release() + return Equal(l, r) + }() + if !o { + return false + } + } + return true +} + +// Len returns the number of elements in the array. +func (a *LargeListView) Len() int { return a.array.Len() } + +func (a *LargeListView) Offsets() []int64 { return a.offsets } + +func (a *LargeListView) Sizes() []int64 { return a.sizes } + +func (a *LargeListView) ValueOffsets(i int) (start, end int64) { + debug.Assert(i >= 0 && i < a.array.data.length, "index out of range") + j := i + a.array.data.offset + size := a.sizes[j] + // If size is 0, skip accessing offsets. + if size == 0 { + return 0, 0 + } + start = a.offsets[j] + end = start + size + return +} + +func (a *LargeListView) Retain() { + a.array.Retain() + a.values.Retain() +} + +func (a *LargeListView) Release() { + a.array.Release() + a.values.Release() +} + var ( _ arrow.Array = (*List)(nil) _ arrow.Array = (*LargeList)(nil) - _ Builder = (*ListBuilder)(nil) - _ Builder = (*LargeListBuilder)(nil) + _ arrow.Array = (*ListView)(nil) + _ arrow.Array = (*LargeListView)(nil) + + _ Builder = (*ListBuilder)(nil) + _ Builder = (*LargeListBuilder)(nil) _ ListLike = (*List)(nil) _ ListLike = (*LargeList)(nil) _ ListLike = (*FixedSizeList)(nil) _ ListLike = (*Map)(nil) + _ ListLike = (*ListView)(nil) + _ ListLike = (*LargeListView)(nil) _ ListLikeBuilder = (*ListBuilder)(nil) _ ListLikeBuilder = (*LargeListBuilder)(nil) From a86f3e6d070f0961d7db68fb7f3e75212274046b Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Fri, 25 Aug 2023 13:51:58 +0200 Subject: [PATCH 03/38] datatype_nested.go: Add ListViewType --- go/arrow/datatype_nested.go | 68 +++++++++++++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) diff --git a/go/arrow/datatype_nested.go b/go/arrow/datatype_nested.go index 50777929c00a6..ea93858d320ed 100644 --- a/go/arrow/datatype_nested.go +++ b/go/arrow/datatype_nested.go @@ -242,6 +242,74 @@ func (*FixedSizeListType) Layout() 
DataTypeLayout { return DataTypeLayout{Buffers: []BufferSpec{SpecBitmap()}} } +type ListViewType struct { + elem Field +} + +func ListViewOfField(f Field) *ListViewType { + if f.Type == nil { + panic("arrow: nil DataType") + } + return &ListViewType{elem: f} +} + +// ListViewOf returns the list-view type with element type t. +// For example, if t represents int32, ListViewOf(t) represents []int32. +// +// ListViewOf panics if t is nil or invalid. NullableElem defaults to true +func ListViewOf(t DataType) *ListViewType { + if t == nil { + panic("arrow: nil DataType") + } + return &ListViewType{elem: Field{Name: "item", Type: t, Nullable: true}} +} + +// ListViewOfNonNullable is like ListViewOf but NullableElem defaults to false, indicating +// that the child type should be marked as non-nullable. +func ListViewOfNonNullable(t DataType) *ListViewType { + if t == nil { + panic("arrow: nil DataType") + } + return &ListViewType{elem: Field{Name: "item", Type: t, Nullable: false}} +} + +func (*ListViewType) ID() Type { return LIST_VIEW } +func (*ListViewType) Name() string { return "list_view" } + +func (t *ListViewType) String() string { + if t.elem.Nullable { + return fmt.Sprintf("list_view<%s: %s, nullable>", t.elem.Name, t.elem.Type) + } + return fmt.Sprintf("list_view<%s: %s>", t.elem.Name, t.elem.Type) +} + +func (t *ListViewType) Fingerprint() string { + child := t.elem.Type.Fingerprint() + if len(child) > 0 { + return typeFingerprint(t) + "{" + child + "}" + } + return "" +} + +func (t *ListViewType) SetElemMetadata(md Metadata) { t.elem.Metadata = md } + +func (t *ListViewType) SetElemNullable(n bool) { t.elem.Nullable = n } + +// Elem returns the ListViewType's element type. 
+func (t *ListViewType) Elem() DataType { return t.elem.Type } + +func (t *ListViewType) ElemField() Field { + return t.elem +} + +func (t *ListViewType) Fields() []Field { return []Field{t.ElemField()} } + +func (*ListViewType) Layout() DataTypeLayout { + return DataTypeLayout{Buffers: []BufferSpec{SpecBitmap(), SpecFixedWidth(Int32SizeBytes), SpecFixedWidth(Int32SizeBytes)}} +} + +func (*ListViewType) OffsetTypeTraits() OffsetTraits { return Int32Traits } + // StructType describes a nested type parameterized by an ordered sequence // of relative types, called its fields. type StructType struct { From b3591a31be1a9222f9eaba3cd97bdb3737142e18 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Fri, 25 Aug 2023 13:58:07 +0200 Subject: [PATCH 04/38] datatype_nested.go: Add LargeListViewType --- go/arrow/datatype_nested.go | 68 +++++++++++++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) diff --git a/go/arrow/datatype_nested.go b/go/arrow/datatype_nested.go index ea93858d320ed..333857fa7938c 100644 --- a/go/arrow/datatype_nested.go +++ b/go/arrow/datatype_nested.go @@ -310,6 +310,74 @@ func (*ListViewType) Layout() DataTypeLayout { func (*ListViewType) OffsetTypeTraits() OffsetTraits { return Int32Traits } +type LargeListViewType struct { + elem Field +} + +func LargeListViewOfField(f Field) *LargeListViewType { + if f.Type == nil { + panic("arrow: nil DataType") + } + return &LargeListViewType{elem: f} +} + +// LargeListViewOf returns the list-view type with element type t. +// For example, if t represents int32, LargeListViewOf(t) represents []int32. +// +// LargeListViewOf panics if t is nil or invalid. 
NullableElem defaults to true +func LargeListViewOf(t DataType) *LargeListViewType { + if t == nil { + panic("arrow: nil DataType") + } + return &LargeListViewType{elem: Field{Name: "item", Type: t, Nullable: true}} +} + +// LargeListViewOfNonNullable is like LargeListViewOf but NullableElem defaults +// to false, indicating that the child type should be marked as non-nullable. +func LargeListViewOfNonNullable(t DataType) *LargeListViewType { + if t == nil { + panic("arrow: nil DataType") + } + return &LargeListViewType{elem: Field{Name: "item", Type: t, Nullable: false}} +} + +func (*LargeListViewType) ID() Type { return LARGE_LIST_VIEW } +func (*LargeListViewType) Name() string { return "large_list_view" } + +func (t *LargeListViewType) String() string { + if t.elem.Nullable { + return fmt.Sprintf("large_list_view<%s: %s, nullable>", t.elem.Name, t.elem.Type) + } + return fmt.Sprintf("large_list_view<%s: %s>", t.elem.Name, t.elem.Type) +} + +func (t *LargeListViewType) Fingerprint() string { + child := t.elem.Type.Fingerprint() + if len(child) > 0 { + return typeFingerprint(t) + "{" + child + "}" + } + return "" +} + +func (t *LargeListViewType) SetElemMetadata(md Metadata) { t.elem.Metadata = md } + +func (t *LargeListViewType) SetElemNullable(n bool) { t.elem.Nullable = n } + +// Elem returns the LargeListViewType's element type. +func (t *LargeListViewType) Elem() DataType { return t.elem.Type } + +func (t *LargeListViewType) ElemField() Field { + return t.elem +} + +func (t *LargeListViewType) Fields() []Field { return []Field{t.ElemField()} } + +func (*LargeListViewType) Layout() DataTypeLayout { + return DataTypeLayout{Buffers: []BufferSpec{SpecBitmap(), SpecFixedWidth(Int64SizeBytes), SpecFixedWidth(Int64SizeBytes)}} +} + +func (*LargeListViewType) OffsetTypeTraits() OffsetTraits { return Int64Traits } + // StructType describes a nested type parameterized by an ordered sequence // of relative types, called its fields. 
type StructType struct { From 45386b4ae972d14c0a80b377a7591d91ab2a4a11 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Tue, 12 Sep 2023 22:01:49 -0300 Subject: [PATCH 05/38] datatype_nested.go: Introduce VarLenListLikeType To match the type hierarchy of builders and the backwards-compatible hierarchy introduced in the C++ implementation. --- go/arrow/datatype_nested.go | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/go/arrow/datatype_nested.go b/go/arrow/datatype_nested.go index 333857fa7938c..4ae4880334620 100644 --- a/go/arrow/datatype_nested.go +++ b/go/arrow/datatype_nested.go @@ -39,6 +39,10 @@ type ( Elem() DataType ElemField() Field } + + VarLenListLikeType interface { + ListLikeType + } ) // ListType describes a nested type in which each array slot contains @@ -963,4 +967,11 @@ var ( _ ListLikeType = (*LargeListType)(nil) _ ListLikeType = (*FixedSizeListType)(nil) _ ListLikeType = (*MapType)(nil) + + _ VarLenListLikeType = (*ListType)(nil) + _ VarLenListLikeType = (*LargeListType)(nil) + _ VarLenListLikeType = (*ListViewType)(nil) + _ VarLenListLikeType = (*LargeListViewType)(nil) + _ VarLenListLikeType = (*FixedSizeListType)(nil) + _ VarLenListLikeType = (*MapType)(nil) ) From 178067d1039c8b738fdd9ec38a263f4e3a807185 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Mon, 28 Aug 2023 14:37:52 +0200 Subject: [PATCH 06/38] builder.go: Add ListViewBuilder and LargeListViewBuilder --- go/arrow/array/builder.go | 6 + go/arrow/array/list.go | 346 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 352 insertions(+) diff --git a/go/arrow/array/builder.go b/go/arrow/array/builder.go index 58d4a0f4b8895..2f15ac965e07c 100644 --- a/go/arrow/array/builder.go +++ b/go/arrow/array/builder.go @@ -342,6 +342,12 @@ func NewBuilder(mem memory.Allocator, dtype arrow.DataType) Builder { case arrow.MAP: typ := dtype.(*arrow.MapType) return NewMapBuilderWithType(mem, typ) + case arrow.LIST_VIEW: + typ := dtype.(*arrow.ListViewType) + 
return NewListViewBuilderWithField(mem, typ.ElemField()) + case arrow.LARGE_LIST_VIEW: + typ := dtype.(*arrow.LargeListViewType) + return NewLargeListViewBuilderWithField(mem, typ.ElemField()) case arrow.EXTENSION: typ := dtype.(arrow.ExtensionType) bldr := NewExtensionBuilder(mem, typ) diff --git a/go/arrow/array/list.go b/go/arrow/array/list.go index 9dee53782cca5..b41487d436ab6 100644 --- a/go/arrow/array/list.go +++ b/go/arrow/array/list.go @@ -908,6 +908,350 @@ func (a *LargeListView) Release() { a.values.Release() } +type baseListViewBuilder struct { + builder + + values Builder // value builder for the list-view's elements. + offsets Builder + sizes Builder + + // actual list-view type + dt arrow.DataType + appendOffsetVal func(int) + appendSizeVal func(int) +} + +type ListViewBuilder struct { + baseListViewBuilder +} + +type LargeListViewBuilder struct { + baseListViewBuilder +} + +// NewListViewBuilder returns a builder, using the provided memory allocator. +// The created list-view builder will create a list whose elements will be +// of type etype. +func NewListViewBuilder(mem memory.Allocator, etype arrow.DataType) *ListViewBuilder { + offsetBldr := NewInt32Builder(mem) + sizeBldr := NewInt32Builder(mem) + return &ListViewBuilder{ + baseListViewBuilder{ + builder: builder{refCount: 1, mem: mem}, + values: NewBuilder(mem, etype), + offsets: offsetBldr, + sizes: sizeBldr, + dt: arrow.ListViewOf(etype), + appendOffsetVal: func(o int) { offsetBldr.Append(int32(o)) }, + appendSizeVal: func(s int) { sizeBldr.Append(int32(s)) }, + }, + } +} + +// NewListViewBuilderWithField takes a field to use for the child rather than just +// a datatype to allow for more customization. 
+func NewListViewBuilderWithField(mem memory.Allocator, field arrow.Field) *ListViewBuilder { + offsetBldr := NewInt32Builder(mem) + sizeBldr := NewInt32Builder(mem) + return &ListViewBuilder{ + baseListViewBuilder{ + builder: builder{refCount: 1, mem: mem}, + values: NewBuilder(mem, field.Type), + offsets: offsetBldr, + sizes: sizeBldr, + dt: arrow.ListViewOfField(field), + appendOffsetVal: func(o int) { offsetBldr.Append(int32(o)) }, + appendSizeVal: func(s int) { sizeBldr.Append(int32(s)) }, + }, + } +} + +func (b *baseListViewBuilder) Type() arrow.DataType { + switch dt := b.dt.(type) { + case *arrow.ListViewType: + f := dt.ElemField() + f.Type = b.values.Type() + return arrow.ListViewOfField(f) + case *arrow.LargeListViewType: + f := dt.ElemField() + f.Type = b.values.Type() + return arrow.LargeListViewOfField(f) + } + return nil +} + +// NewLargeListViewBuilder returns a builder, using the provided memory allocator. +// The created list-view builder will create a list whose elements will be of type etype. 
+func NewLargeListViewBuilder(mem memory.Allocator, etype arrow.DataType) *LargeListViewBuilder { + offsetBldr := NewInt64Builder(mem) + sizeBldr := NewInt64Builder(mem) + return &LargeListViewBuilder{ + baseListViewBuilder{ + builder: builder{refCount: 1, mem: mem}, + values: NewBuilder(mem, etype), + offsets: offsetBldr, + sizes: sizeBldr, + dt: arrow.LargeListViewOf(etype), + appendOffsetVal: func(o int) { offsetBldr.Append(int64(o)) }, + appendSizeVal: func(s int) { sizeBldr.Append(int64(s)) }, + }, + } +} + +// NewLargeListViewBuilderWithField takes a field rather than just an element type +// to allow for more customization of the final type of the LargeListView Array +func NewLargeListViewBuilderWithField(mem memory.Allocator, field arrow.Field) *LargeListViewBuilder { + offsetBldr := NewInt64Builder(mem) + sizeBldr := NewInt64Builder(mem) + return &LargeListViewBuilder{ + baseListViewBuilder{ + builder: builder{refCount: 1, mem: mem}, + values: NewBuilder(mem, field.Type), + offsets: offsetBldr, + sizes: sizeBldr, + dt: arrow.LargeListViewOfField(field), + appendOffsetVal: func(o int) { offsetBldr.Append(int64(o)) }, + appendSizeVal: func(o int) { sizeBldr.Append(int64(o)) }, + }, + } +} + +// Release decreases the reference count by 1. +// When the reference count goes to zero, the memory is freed. 
+func (b *baseListViewBuilder) Release() { + debug.Assert(atomic.LoadInt64(&b.refCount) > 0, "too many releases") + + if atomic.AddInt64(&b.refCount, -1) == 0 { + if b.nullBitmap != nil { + b.nullBitmap.Release() + b.nullBitmap = nil + } + b.values.Release() + b.offsets.Release() + b.sizes.Release() + } +} + +// XXX: review the need for this and calls to this +func (b *baseListViewBuilder) appendNextOffset() { + b.appendOffsetVal(b.values.Len()) +} + +func (b *baseListViewBuilder) Append(v bool) { + b.Reserve(1) + b.unsafeAppendBoolToBitmap(v) + b.appendNextOffset() // XXX +} + +func (b *baseListViewBuilder) AppendNull() { + b.Reserve(1) + b.unsafeAppendBoolToBitmap(false) + b.appendNextOffset() +} + +func (b *baseListViewBuilder) AppendNulls(n int) { + for i := 0; i < n; i++ { + b.AppendNull() + } +} + +func (b *baseListViewBuilder) AppendEmptyValue() { + b.Append(true) +} + +func (b *baseListViewBuilder) AppendEmptyValues(n int) { + for i := 0; i < n; i++ { + b.AppendEmptyValue() + } +} + +func (b *ListViewBuilder) AppendValues(offsets []int32, sizes []int32, valid []bool) { + b.Reserve(len(valid)) + b.offsets.(*Int32Builder).AppendValues(offsets, nil) + b.sizes.(*Int32Builder).AppendValues(sizes, nil) + b.builder.unsafeAppendBoolsToBitmap(valid, len(valid)) +} + +func (b *LargeListViewBuilder) AppendValues(offsets []int64, sizes []int64, valid []bool) { + b.Reserve(len(valid)) + b.offsets.(*Int64Builder).AppendValues(offsets, nil) + b.sizes.(*Int64Builder).AppendValues(sizes, nil) + b.builder.unsafeAppendBoolsToBitmap(valid, len(valid)) +} + +func (b *baseListViewBuilder) unsafeAppendBoolToBitmap(isValid bool) { + if isValid { + bitutil.SetBit(b.nullBitmap.Bytes(), b.length) + } else { + b.nulls++ + } + b.length++ +} + +func (b *baseListViewBuilder) init(capacity int) { + b.builder.init(capacity) + b.offsets.init(capacity + 1) + b.sizes.init(capacity + 1) +} + +// Reserve ensures there is enough space for appending n elements +// by checking the capacity and 
calling Resize if necessary. +func (b *baseListViewBuilder) Reserve(n int) { + b.builder.reserve(n, b.resizeHelper) + b.offsets.Reserve(n) + b.sizes.Reserve(n) +} + +// Resize adjusts the space allocated by b to n elements. If n is greater than b.Cap(), +// additional memory will be allocated. If n is smaller, the allocated memory may be reduced. +func (b *baseListViewBuilder) Resize(n int) { + b.resizeHelper(n) + b.offsets.Resize(n) + b.sizes.Resize(n) +} + +func (b *baseListViewBuilder) resizeHelper(n int) { + if n < minBuilderCapacity { + n = minBuilderCapacity + } + + if b.capacity == 0 { + b.init(n) + } else { + b.builder.resize(n, b.builder.init) + } +} + +func (b *baseListViewBuilder) ValueBuilder() Builder { + return b.values +} + +// NewArray creates a ListView array from the memory buffers used by the builder and +// resets the ListViewBuilder so it can be used to build a new array. +func (b *ListViewBuilder) NewArray() arrow.Array { + return b.NewListViewArray() +} + +// NewArray creates a LargeListView array from the memory buffers used by the builder +// and resets the LargeListViewBuilder so it can be used to build a new array. +func (b *LargeListViewBuilder) NewArray() arrow.Array { + return b.NewLargeListViewArray() +} + +// NewListViewArray creates a ListView array from the memory buffers used by the builder +// and resets the ListViewBuilder so it can be used to build a new array. +func (b *ListViewBuilder) NewListViewArray() (a *ListView) { + data := b.newData() + a = NewListViewData(data) + data.Release() + return +} + +// NewLargeListViewArray creates a LargeListView array from the memory buffers used by the +// builder and resets the LargeListViewBuilder so it can be used to build a new array. 
+func (b *LargeListViewBuilder) NewLargeListViewArray() (a *LargeListView) { + data := b.newData() + a = NewLargeListViewData(data) + data.Release() + return +} + +func (b *baseListViewBuilder) newData() (data *Data) { + if b.offsets.Len() != b.length+1 { + b.appendNextOffset() + } + values := b.values.NewArray() + defer values.Release() + + var offsets *memory.Buffer + if b.offsets != nil { + arr := b.offsets.NewArray() + defer arr.Release() + offsets = arr.Data().Buffers()[1] + } + + var sizes *memory.Buffer + if b.sizes != nil { + arr := b.sizes.NewArray() + defer arr.Release() + sizes = arr.Data().Buffers()[1] + } + + data = NewData( + b.Type(), b.length, + []*memory.Buffer{ + b.nullBitmap, + offsets, + sizes, + }, + []arrow.ArrayData{values.Data()}, + b.nulls, + 0, + ) + b.reset() + + return +} + +func (b *baseListViewBuilder) AppendValueFromString(s string) error { + if s == NullValueStr { + b.AppendNull() + return nil + } + + return b.UnmarshalOne(json.NewDecoder(strings.NewReader(s))) +} + +func (b *baseListViewBuilder) UnmarshalOne(dec *json.Decoder) error { + t, err := dec.Token() + if err != nil { + return err + } + + switch t { + case json.Delim('['): + b.Append(true) + if err := b.values.Unmarshal(dec); err != nil { + return err + } + // consume ']' + _, err := dec.Token() + return err + case nil: + b.AppendNull() + default: + return &json.UnmarshalTypeError{ + Value: fmt.Sprint(t), + Struct: b.dt.String(), + } + } + + return nil +} + +func (b *baseListViewBuilder) Unmarshal(dec *json.Decoder) error { + for dec.More() { + if err := b.UnmarshalOne(dec); err != nil { + return err + } + } + return nil +} + +func (b *baseListViewBuilder) UnmarshalJSON(data []byte) error { + dec := json.NewDecoder(bytes.NewReader(data)) + t, err := dec.Token() + if err != nil { + return err + } + + if delim, ok := t.(json.Delim); !ok || delim != '[' { + return fmt.Errorf("list-view builder must unpack from json array, found %s", delim) + } + + return b.Unmarshal(dec) +} + 
var ( _ arrow.Array = (*List)(nil) _ arrow.Array = (*LargeList)(nil) @@ -916,6 +1260,8 @@ var ( _ Builder = (*ListBuilder)(nil) _ Builder = (*LargeListBuilder)(nil) + _ Builder = (*ListViewBuilder)(nil) + _ Builder = (*LargeListViewBuilder)(nil) _ ListLike = (*List)(nil) _ ListLike = (*LargeList)(nil) From 71a4a9420a4aab06be3f2dbf7a56397a2ca4591c Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Tue, 29 Aug 2023 13:36:32 +0200 Subject: [PATCH 07/38] list.go: Reconcile differences between list and list-view builders --- go/arrow/array/list.go | 83 +++++++++++++++++++++++++++--------------- go/arrow/array/map.go | 4 ++ 2 files changed, 58 insertions(+), 29 deletions(-) diff --git a/go/arrow/array/list.go b/go/arrow/array/list.go index b41487d436ab6..0c109af0f58a4 100644 --- a/go/arrow/array/list.go +++ b/go/arrow/array/list.go @@ -35,6 +35,10 @@ type ListLike interface { ValueOffsets(i int) (start, end int64) } +type VarLenListLike interface { + ListLike +} + // List represents an immutable sequence of array values. type List struct { array @@ -314,6 +318,11 @@ type ListLikeBuilder interface { Append(bool) } +type VarLenListLikeBuilder interface { + ListLikeBuilder + AppendWithSize(bool, int) +} + type ListBuilder struct { baseListBuilder } @@ -422,6 +431,10 @@ func (b *baseListBuilder) Append(v bool) { b.appendNextOffset() } +func (b *baseListBuilder) AppendWithSize(v bool, list_size int) { + b.Append(v) +} + func (b *baseListBuilder) AppendNull() { b.Reserve(1) b.unsafeAppendBoolToBitmap(false) @@ -627,7 +640,7 @@ type ListView struct { sizes []int32 } -var _ ListLike = (*ListView)(nil) +var _ VarLenListLike = (*ListView)(nil) func NewListViewData(data arrow.ArrayData) *ListView { a := &ListView{} @@ -772,7 +785,7 @@ type LargeListView struct { sizes []int64 } -var _ ListLike = (*LargeListView)(nil) +var _ VarLenListLike = (*LargeListView)(nil) // NewLargeListViewData returns a new LargeListView array value, from data. 
func NewLargeListViewData(data arrow.ArrayData) *LargeListView { @@ -1032,21 +1045,24 @@ func (b *baseListViewBuilder) Release() { } } -// XXX: review the need for this and calls to this -func (b *baseListViewBuilder) appendNextOffset() { - b.appendOffsetVal(b.values.Len()) +func (b *baseListViewBuilder) appendDimensions(offset int, list_size int) { + b.appendOffsetVal(offset) + b.appendSizeVal(list_size) } func (b *baseListViewBuilder) Append(v bool) { + debug.Assert(false, "baseListViewBuilder.Append should never be called -- use AppendWithSize instead") +} + +func (b *baseListViewBuilder) AppendWithSize(v bool, list_size int) { + debug.Assert(v || list_size == 0, "invalid list-view should have size 0") b.Reserve(1) b.unsafeAppendBoolToBitmap(v) - b.appendNextOffset() // XXX + b.appendDimensions(b.values.Len(), list_size) } func (b *baseListViewBuilder) AppendNull() { - b.Reserve(1) - b.unsafeAppendBoolToBitmap(false) - b.appendNextOffset() + b.AppendWithSize(false, 0) } func (b *baseListViewBuilder) AppendNulls(n int) { @@ -1056,7 +1072,7 @@ func (b *baseListViewBuilder) AppendNulls(n int) { } func (b *baseListViewBuilder) AppendEmptyValue() { - b.Append(true) + b.AppendWithSize(true, 0) } func (b *baseListViewBuilder) AppendEmptyValues(n int) { @@ -1065,14 +1081,14 @@ func (b *baseListViewBuilder) AppendEmptyValues(n int) { } } -func (b *ListViewBuilder) AppendValues(offsets []int32, sizes []int32, valid []bool) { +func (b *ListViewBuilder) AppendValuesWithSizes(offsets []int32, sizes []int32, valid []bool) { b.Reserve(len(valid)) b.offsets.(*Int32Builder).AppendValues(offsets, nil) b.sizes.(*Int32Builder).AppendValues(sizes, nil) b.builder.unsafeAppendBoolsToBitmap(valid, len(valid)) } -func (b *LargeListViewBuilder) AppendValues(offsets []int64, sizes []int64, valid []bool) { +func (b *LargeListViewBuilder) AppendValuesWithSizes(offsets []int64, sizes []int64, valid []bool) { b.Reserve(len(valid)) b.offsets.(*Int64Builder).AppendValues(offsets, nil) 
b.sizes.(*Int64Builder).AppendValues(sizes, nil) @@ -1090,8 +1106,8 @@ func (b *baseListViewBuilder) unsafeAppendBoolToBitmap(isValid bool) { func (b *baseListViewBuilder) init(capacity int) { b.builder.init(capacity) - b.offsets.init(capacity + 1) - b.sizes.init(capacity + 1) + b.offsets.init(capacity) + b.sizes.init(capacity) } // Reserve ensures there is enough space for appending n elements @@ -1157,9 +1173,7 @@ func (b *LargeListViewBuilder) NewLargeListViewArray() (a *LargeListView) { } func (b *baseListViewBuilder) newData() (data *Data) { - if b.offsets.Len() != b.length+1 { - b.appendNextOffset() - } + debug.Assert(b.offsets.Len() == b.sizes.Len(), "offsets and sizes should have the same length") values := b.values.NewArray() defer values.Release() @@ -1210,12 +1224,21 @@ func (b *baseListViewBuilder) UnmarshalOne(dec *json.Decoder) error { switch t { case json.Delim('['): - b.Append(true) + offset := b.values.Len() + // 0 is a placeholder size as we don't know the actual size yet + b.AppendWithSize(true, 0) if err := b.values.Unmarshal(dec); err != nil { return err } // consume ']' _, err := dec.Token() + // replace the last size with the actual size + switch b.sizes.(type) { + case *Int32Builder: + b.sizes.(*Int32Builder).rawData[b.sizes.Len()-1] = int32(b.values.Len() - offset) + case *Int64Builder: + b.sizes.(*Int64Builder).rawData[b.sizes.Len()-1] = int64(b.values.Len() - offset) + } return err case nil: b.AppendNull() @@ -1263,15 +1286,17 @@ var ( _ Builder = (*ListViewBuilder)(nil) _ Builder = (*LargeListViewBuilder)(nil) - _ ListLike = (*List)(nil) - _ ListLike = (*LargeList)(nil) - _ ListLike = (*FixedSizeList)(nil) - _ ListLike = (*Map)(nil) - _ ListLike = (*ListView)(nil) - _ ListLike = (*LargeListView)(nil) - - _ ListLikeBuilder = (*ListBuilder)(nil) - _ ListLikeBuilder = (*LargeListBuilder)(nil) - _ ListLikeBuilder = (*FixedSizeListBuilder)(nil) - _ ListLikeBuilder = (*MapBuilder)(nil) + _ VarLenListLike = (*List)(nil) + _ VarLenListLike = 
(*LargeList)(nil) + _ VarLenListLike = (*Map)(nil) + _ VarLenListLike = (*ListView)(nil) + _ VarLenListLike = (*LargeListView)(nil) + _ ListLike = (*FixedSizeList)(nil) + + _ VarLenListLikeBuilder = (*ListBuilder)(nil) + _ VarLenListLikeBuilder = (*LargeListBuilder)(nil) + _ VarLenListLikeBuilder = (*ListBuilder)(nil) + _ VarLenListLikeBuilder = (*LargeListBuilder)(nil) + _ VarLenListLikeBuilder = (*MapBuilder)(nil) + _ ListLikeBuilder = (*FixedSizeListBuilder)(nil) ) diff --git a/go/arrow/array/map.go b/go/arrow/array/map.go index 4fe860f26ef61..d7d847ec44ff0 100644 --- a/go/arrow/array/map.go +++ b/go/arrow/array/map.go @@ -222,6 +222,10 @@ func (b *MapBuilder) Append(v bool) { b.listBuilder.Append(v) } +func (b *MapBuilder) AppendWithSize(v bool, list_size int) { + b.Append(v) +} + // AppendNull adds a null map entry to the array. func (b *MapBuilder) AppendNull() { b.Append(false) From 7c58a72457b5ed9a0ee1b6e3c33b37305a606a98 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Tue, 29 Aug 2023 14:00:16 +0200 Subject: [PATCH 08/38] compare.go: Add list-view and large-list-view cases --- go/arrow/array/compare.go | 50 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/go/arrow/array/compare.go b/go/arrow/array/compare.go index 7dca60688d490..e70716bee91a7 100644 --- a/go/arrow/array/compare.go +++ b/go/arrow/array/compare.go @@ -292,6 +292,12 @@ func Equal(left, right arrow.Array) bool { case *LargeList: r := right.(*LargeList) return arrayEqualLargeList(l, r) + case *ListView: + r := right.(*ListView) + return arrayEqualListView(l, r) + case *LargeListView: + r := right.(*LargeListView) + return arrayEqualLargeListView(l, r) case *FixedSizeList: r := right.(*FixedSizeList) return arrayEqualFixedSizeList(l, r) @@ -536,6 +542,12 @@ func arrayApproxEqual(left, right arrow.Array, opt equalOption) bool { case *LargeList: r := right.(*LargeList) return arrayApproxEqualLargeList(l, r, opt) + case *ListView: + r := 
right.(*ListView) + return arrayApproxEqualListView(l, r, opt) + case *LargeListView: + r := right.(*LargeListView) + return arrayApproxEqualLargeListView(l, r, opt) case *FixedSizeList: r := right.(*FixedSizeList) return arrayApproxEqualFixedSizeList(l, r, opt) @@ -682,6 +694,44 @@ func arrayApproxEqualLargeList(left, right *LargeList, opt equalOption) bool { return true } +func arrayApproxEqualListView(left, right *ListView, opt equalOption) bool { + for i := 0; i < left.Len(); i++ { + if left.IsNull(i) { + continue + } + o := func() bool { + l := left.newListValue(i) + defer l.Release() + r := right.newListValue(i) + defer r.Release() + return arrayApproxEqual(l, r, opt) + }() + if !o { + return false + } + } + return true +} + +func arrayApproxEqualLargeListView(left, right *LargeListView, opt equalOption) bool { + for i := 0; i < left.Len(); i++ { + if left.IsNull(i) { + continue + } + o := func() bool { + l := left.newListValue(i) + defer l.Release() + r := right.newListValue(i) + defer r.Release() + return arrayApproxEqual(l, r, opt) + }() + if !o { + return false + } + } + return true +} + func arrayApproxEqualFixedSizeList(left, right *FixedSizeList, opt equalOption) bool { for i := 0; i < left.Len(); i++ { if left.IsNull(i) { From 69d657e6ca618789f0c5c7f434de3b5cc3a3b71b Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Fri, 25 Aug 2023 11:55:26 +0200 Subject: [PATCH 09/38] list_test.go: Expand test to include list-views as well --- go/arrow/array/list_test.go | 118 +++++++++++++++++++++++++++--------- 1 file changed, 89 insertions(+), 29 deletions(-) diff --git a/go/arrow/array/list_test.go b/go/arrow/array/list_test.go index 9f193fe19aabd..f37a7c5f393df 100644 --- a/go/arrow/array/list_test.go +++ b/go/arrow/array/list_test.go @@ -30,12 +30,15 @@ func TestListArray(t *testing.T) { tests := []struct { typeID arrow.Type offsets interface{} + sizes interface{} dt arrow.DataType }{ - {arrow.LIST, []int32{0, 3, 3, 3, 7}, 
arrow.ListOf(arrow.PrimitiveTypes.Int32)}, - {arrow.LARGE_LIST, []int64{0, 3, 3, 3, 7}, arrow.LargeListOf(arrow.PrimitiveTypes.Int32)}, - {arrow.LIST, []int32{0, 3, 3, 3, 7}, arrow.ListOfField(arrow.Field{Name: "item", Type: arrow.PrimitiveTypes.Int32, Nullable: true})}, - {arrow.LARGE_LIST, []int64{0, 3, 3, 3, 7}, arrow.LargeListOfField(arrow.Field{Name: "item", Type: arrow.PrimitiveTypes.Int32, Nullable: true})}, + {arrow.LIST, []int32{0, 3, 3, 3, 7}, nil, arrow.ListOf(arrow.PrimitiveTypes.Int32)}, + {arrow.LARGE_LIST, []int64{0, 3, 3, 3, 7}, nil, arrow.LargeListOf(arrow.PrimitiveTypes.Int32)}, + {arrow.LIST, []int32{0, 3, 3, 3, 7}, nil, arrow.ListOfField(arrow.Field{Name: "item", Type: arrow.PrimitiveTypes.Int32, Nullable: true})}, + {arrow.LARGE_LIST, []int64{0, 3, 3, 3, 7}, nil, arrow.LargeListOfField(arrow.Field{Name: "item", Type: arrow.PrimitiveTypes.Int32, Nullable: true})}, + {arrow.LIST_VIEW, []int32{0, 3, 3, 3}, []int32{3, 0, 0, 4}, arrow.ListViewOf(arrow.PrimitiveTypes.Int32)}, + {arrow.LARGE_LIST_VIEW, []int64{0, 3, 3, 3}, []int64{3, 0, 0, 4}, arrow.LargeListViewOf(arrow.PrimitiveTypes.Int32)}, } for _, tt := range tests { @@ -49,7 +52,7 @@ func TestListArray(t *testing.T) { isValid = []bool{true, false, true, true} ) - lb := array.NewBuilder(pool, tt.dt).(array.ListLikeBuilder) + lb := array.NewBuilder(pool, tt.dt).(array.VarLenListLikeBuilder) defer lb.Release() for i := 0; i < 10; i++ { @@ -58,7 +61,7 @@ func TestListArray(t *testing.T) { pos := 0 for i, length := range lengths { - lb.Append(isValid[i]) + lb.AppendWithSize(isValid[i], length) for j := 0; j < length; j++ { vb.Append(vs[pos]) pos++ @@ -88,18 +91,32 @@ func TestListArray(t *testing.T) { } } - var got interface{} + var gotOffsets, gotSizes interface{} switch tt.typeID { case arrow.LIST: arr := arr.(*array.List) - got = arr.Offsets() + gotOffsets = arr.Offsets() case arrow.LARGE_LIST: arr := arr.(*array.LargeList) - got = arr.Offsets() + gotOffsets = arr.Offsets() + case 
arrow.LIST_VIEW: + arr := arr.(*array.ListView) + gotOffsets = arr.Offsets() + gotSizes = arr.Sizes() + case arrow.LARGE_LIST_VIEW: + arr := arr.(*array.LargeListView) + gotOffsets = arr.Offsets() + gotSizes = arr.Sizes() } - if !reflect.DeepEqual(got, tt.offsets) { - t.Fatalf("got=%v, want=%v", got, tt.offsets) + if !reflect.DeepEqual(gotOffsets, tt.offsets) { + t.Fatalf("got=%v, want=%v", gotOffsets, tt.offsets) + } + + if tt.typeID == arrow.LIST_VIEW || tt.typeID == arrow.LARGE_LIST_VIEW { + if !reflect.DeepEqual(gotSizes, tt.sizes) { + t.Fatalf("got=%v, want=%v", gotSizes, tt.sizes) + } } varr := arr.ListValues().(*array.Int32) @@ -116,6 +133,8 @@ func TestListArrayEmpty(t *testing.T) { typ := []arrow.DataType{ arrow.ListOf(arrow.PrimitiveTypes.Int32), arrow.LargeListOf(arrow.PrimitiveTypes.Int32), + arrow.ListViewOf(arrow.PrimitiveTypes.Int32), + arrow.LargeListViewOf(arrow.PrimitiveTypes.Int32), } for _, dt := range typ { @@ -138,10 +157,13 @@ func TestListArrayBulkAppend(t *testing.T) { tests := []struct { typeID arrow.Type offsets interface{} + sizes interface{} dt arrow.DataType }{ - {arrow.LIST, []int32{0, 3, 3, 3, 7}, arrow.ListOf(arrow.PrimitiveTypes.Int32)}, - {arrow.LARGE_LIST, []int64{0, 3, 3, 3, 7}, arrow.LargeListOf(arrow.PrimitiveTypes.Int32)}, + {arrow.LIST, []int32{0, 3, 3, 3, 7}, nil, arrow.ListOf(arrow.PrimitiveTypes.Int32)}, + {arrow.LARGE_LIST, []int64{0, 3, 3, 3, 7}, nil, arrow.LargeListOf(arrow.PrimitiveTypes.Int32)}, + {arrow.LIST_VIEW, []int32{0, 3, 3, 3}, []int32{3, 0, 0, 4}, arrow.ListViewOf(arrow.PrimitiveTypes.Int32)}, + {arrow.LARGE_LIST_VIEW, []int64{0, 3, 3, 3}, []int64{3, 0, 0, 4}, arrow.LargeListViewOf(arrow.PrimitiveTypes.Int32)}, } for _, tt := range tests { @@ -155,7 +177,7 @@ func TestListArrayBulkAppend(t *testing.T) { isValid = []bool{true, false, true, true} ) - lb := array.NewBuilder(pool, tt.dt).(array.ListLikeBuilder) + lb := array.NewBuilder(pool, tt.dt).(array.VarLenListLikeBuilder) defer lb.Release() vb := 
lb.ValueBuilder().(*array.Int32Builder) vb.Reserve(len(vs)) @@ -165,12 +187,16 @@ func TestListArrayBulkAppend(t *testing.T) { lb.(*array.ListBuilder).AppendValues(tt.offsets.([]int32), isValid) case arrow.LARGE_LIST: lb.(*array.LargeListBuilder).AppendValues(tt.offsets.([]int64), isValid) + case arrow.LIST_VIEW: + lb.(*array.ListViewBuilder).AppendValuesWithSizes(tt.offsets.([]int32), tt.sizes.([]int32), isValid) + case arrow.LARGE_LIST_VIEW: + lb.(*array.LargeListViewBuilder).AppendValuesWithSizes(tt.offsets.([]int64), tt.sizes.([]int64), isValid) } for _, v := range vs { vb.Append(v) } - arr := lb.NewArray().(array.ListLike) + arr := lb.NewArray().(array.VarLenListLike) defer arr.Release() if got, want := arr.DataType().ID(), tt.typeID; got != want { @@ -190,18 +216,31 @@ func TestListArrayBulkAppend(t *testing.T) { } } - var got interface{} + var gotOffsets, gotSizes interface{} switch tt.typeID { case arrow.LIST: arr := arr.(*array.List) - got = arr.Offsets() + gotOffsets = arr.Offsets() case arrow.LARGE_LIST: arr := arr.(*array.LargeList) - got = arr.Offsets() + gotOffsets = arr.Offsets() + case arrow.LIST_VIEW: + arr := arr.(*array.ListView) + gotOffsets = arr.Offsets() + gotSizes = arr.Sizes() + case arrow.LARGE_LIST_VIEW: + arr := arr.(*array.LargeListView) + gotOffsets = arr.Offsets() + gotSizes = arr.Sizes() } - if !reflect.DeepEqual(got, tt.offsets) { - t.Fatalf("got=%v, want=%v", got, tt.offsets) + if !reflect.DeepEqual(gotOffsets, tt.offsets) { + t.Fatalf("got=%v, want=%v", gotOffsets, tt.offsets) + } + if tt.typeID == arrow.LIST_VIEW || tt.typeID == arrow.LARGE_LIST_VIEW { + if !reflect.DeepEqual(gotSizes, tt.sizes) { + t.Fatalf("got=%v, want=%v", gotSizes, tt.sizes) + } } varr := arr.ListValues().(*array.Int32) @@ -216,10 +255,13 @@ func TestListArraySlice(t *testing.T) { tests := []struct { typeID arrow.Type offsets interface{} + sizes interface{} dt arrow.DataType }{ - {arrow.LIST, []int32{0, 3, 3, 3, 7}, arrow.ListOf(arrow.PrimitiveTypes.Int32)}, 
- {arrow.LARGE_LIST, []int64{0, 3, 3, 3, 7}, arrow.LargeListOf(arrow.PrimitiveTypes.Int32)}, + {arrow.LIST, []int32{0, 3, 3, 3, 7}, nil, arrow.ListOf(arrow.PrimitiveTypes.Int32)}, + {arrow.LARGE_LIST, []int64{0, 3, 3, 3, 7}, nil, arrow.LargeListOf(arrow.PrimitiveTypes.Int32)}, + {arrow.LIST_VIEW, []int32{0, 3, 3, 3, 7}, []int32{3, 0, 0, 4}, arrow.ListViewOf(arrow.PrimitiveTypes.Int32)}, + {arrow.LARGE_LIST_VIEW, []int64{0, 3, 3, 3, 7}, []int64{3, 0, 0, 4}, arrow.LargeListViewOf(arrow.PrimitiveTypes.Int32)}, } for _, tt := range tests { @@ -233,7 +275,7 @@ func TestListArraySlice(t *testing.T) { isValid = []bool{true, false, true, true} ) - lb := array.NewBuilder(pool, tt.dt).(array.ListLikeBuilder) + lb := array.NewBuilder(pool, tt.dt).(array.VarLenListLikeBuilder) defer lb.Release() vb := lb.ValueBuilder().(*array.Int32Builder) vb.Reserve(len(vs)) @@ -243,12 +285,16 @@ func TestListArraySlice(t *testing.T) { lb.(*array.ListBuilder).AppendValues(tt.offsets.([]int32), isValid) case arrow.LARGE_LIST: lb.(*array.LargeListBuilder).AppendValues(tt.offsets.([]int64), isValid) + case arrow.LIST_VIEW: + lb.(*array.ListViewBuilder).AppendValuesWithSizes(tt.offsets.([]int32), tt.sizes.([]int32), isValid) + case arrow.LARGE_LIST_VIEW: + lb.(*array.LargeListViewBuilder).AppendValuesWithSizes(tt.offsets.([]int64), tt.sizes.([]int64), isValid) } for _, v := range vs { vb.Append(v) } - arr := lb.NewArray().(array.ListLike) + arr := lb.NewArray().(array.VarLenListLike) defer arr.Release() if got, want := arr.DataType().ID(), tt.typeID; got != want { @@ -268,18 +314,32 @@ func TestListArraySlice(t *testing.T) { } } - var got interface{} + var gotOffsets, gotSizes interface{} switch tt.typeID { case arrow.LIST: arr := arr.(*array.List) - got = arr.Offsets() + gotOffsets = arr.Offsets() case arrow.LARGE_LIST: arr := arr.(*array.LargeList) - got = arr.Offsets() + gotOffsets = arr.Offsets() + case arrow.LIST_VIEW: + arr := arr.(*array.ListView) + gotOffsets = arr.Offsets() + gotSizes = 
arr.Sizes() + case arrow.LARGE_LIST_VIEW: + arr := arr.(*array.LargeListView) + gotOffsets = arr.Offsets() + gotSizes = arr.Sizes() + } + + if !reflect.DeepEqual(gotOffsets, tt.offsets) { + t.Fatalf("got=%v, want=%v", gotOffsets, tt.offsets) } - if !reflect.DeepEqual(got, tt.offsets) { - t.Fatalf("got=%v, want=%v", got, tt.offsets) + if tt.typeID == arrow.LIST_VIEW || tt.typeID == arrow.LARGE_LIST_VIEW { + if !reflect.DeepEqual(gotSizes, tt.sizes) { + t.Fatalf("got=%v, want=%v", gotSizes, tt.sizes) + } } varr := arr.ListValues().(*array.Int32) From b2f25c50b714c5d56e3c5ad5a3dc30e66a4bfc26 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Tue, 29 Aug 2023 14:01:36 +0200 Subject: [PATCH 10/38] arrjson.go: Implement string conversion for list-views --- go/arrow/array/list_test.go | 84 ++++++++ go/arrow/internal/arrjson/arrjson.go | 129 +++++++++++- go/arrow/internal/arrjson/arrjson_test.go | 242 ++++++++++++++++++++++ 3 files changed, 454 insertions(+), 1 deletion(-) diff --git a/go/arrow/array/list_test.go b/go/arrow/array/list_test.go index f37a7c5f393df..e387cfed8dd70 100644 --- a/go/arrow/array/list_test.go +++ b/go/arrow/array/list_test.go @@ -445,3 +445,87 @@ func TestLargeListStringRoundTrip(t *testing.T) { assert.True(t, array.Equal(arr, arr1)) } + +func TestListViewStringRoundTrip(t *testing.T) { + // 1. create array + mem := memory.NewCheckedAllocator(memory.NewGoAllocator()) + defer mem.AssertSize(t, 0) + + b := array.NewListViewBuilder(mem, arrow.PrimitiveTypes.Int32) + defer b.Release() + vb := b.ValueBuilder().(*array.Int32Builder) + + var values = [][]int32{ + {0, 1, 2, 3, 4, 5, 6}, + {1, 2, 3, 4, 5, 6, 7}, + {2, 3, 4, 5, 6, 7, 8}, + {3, 4, 5, 6, 7, 8, 9}, + } + for _, value := range values { + b.AppendNull() + b.AppendWithSize(true, 2*len(value)) + for _, el := range value { + vb.Append(el) + vb.AppendNull() + } + b.Append(false) + } + + arr := b.NewArray().(*array.ListView) + defer arr.Release() + + // 2. 
create array via AppendValueFromString + b1 := array.NewListViewBuilder(mem, arrow.PrimitiveTypes.Int32) + defer b1.Release() + + for i := 0; i < arr.Len(); i++ { + assert.NoError(t, b1.AppendValueFromString(arr.ValueStr(i))) + } + + arr1 := b1.NewArray().(*array.ListView) + defer arr1.Release() + + assert.True(t, array.Equal(arr, arr1)) +} + +func TestLargeListViewStringRoundTrip(t *testing.T) { + // 1. create array + mem := memory.NewCheckedAllocator(memory.NewGoAllocator()) + defer mem.AssertSize(t, 0) + + b := array.NewLargeListViewBuilder(mem, arrow.PrimitiveTypes.Int32) + defer b.Release() + vb := b.ValueBuilder().(*array.Int32Builder) + + var values = [][]int32{ + {0, 1, 2, 3, 4, 5, 6}, + {1, 2, 3, 4, 5, 6, 7}, + {2, 3, 4, 5, 6, 7, 8}, + {3, 4, 5, 6, 7, 8, 9}, + } + for _, value := range values { + b.AppendNull() + b.AppendWithSize(true, 2*len(value)) + for _, el := range value { + vb.Append(el) + vb.AppendNull() + } + b.Append(false) + } + + arr := b.NewArray().(*array.LargeListView) + defer arr.Release() + + // 2. 
create array via AppendValueFromString + b1 := array.NewLargeListViewBuilder(mem, arrow.PrimitiveTypes.Int32) + defer b1.Release() + + for i := 0; i < arr.Len(); i++ { + assert.NoError(t, b1.AppendValueFromString(arr.ValueStr(i))) + } + + arr1 := b1.NewArray().(*array.LargeListView) + defer arr1.Release() + + assert.True(t, array.Equal(arr, arr1)) +} diff --git a/go/arrow/internal/arrjson/arrjson.go b/go/arrow/internal/arrjson/arrjson.go index fa4438276f186..580436189f99b 100644 --- a/go/arrow/internal/arrjson/arrjson.go +++ b/go/arrow/internal/arrjson/arrjson.go @@ -208,6 +208,10 @@ func typeToJSON(arrowType arrow.DataType) (json.RawMessage, error) { typ = nameJSON{"list"} case *arrow.LargeListType: typ = nameJSON{"largelist"} + case *arrow.ListViewType: + typ = nameJSON{"listview"} + case *arrow.LargeListViewType: + typ = nameJSON{"largelistview"} case *arrow.MapType: typ = mapJSON{Name: "map", KeysSorted: dt.KeysSorted} case *arrow.StructType: @@ -400,6 +404,20 @@ func typeFromJSON(typ json.RawMessage, children []FieldWrapper) (arrowType arrow Metadata: children[0].arrowMeta, Nullable: children[0].Nullable, }) + case "listview": + arrowType = arrow.ListViewOfField(arrow.Field{ + Name: children[0].Name, + Type: children[0].arrowType, + Metadata: children[0].arrowMeta, + Nullable: children[0].Nullable, + }) + case "largelistview": + arrowType = arrow.LargeListViewOfField(arrow.Field{ + Name: children[0].Name, + Type: children[0].arrowType, + Metadata: children[0].arrowMeta, + Nullable: children[0].Nullable, + }) case "map": t := mapJSON{} if err = json.Unmarshal(typ, &t); err != nil { @@ -798,6 +816,7 @@ type Array struct { Data []interface{} `json:"DATA,omitempty"` TypeID []arrow.UnionTypeCode `json:"TYPE_ID,omitempty"` Offset interface{} `json:"OFFSET,omitempty"` + Size interface{} `json:"SIZE,omitempty"` Children []Array `json:"children,omitempty"` } @@ -806,7 +825,8 @@ func (a *Array) MarshalJSON() ([]byte, error) { aux := struct { *Alias OutOffset interface{} 
`json:"OFFSET,omitempty"` - }{Alias: (*Alias)(a), OutOffset: a.Offset} + OutSize interface{} `json:"SIZE,omitempty"` + }{Alias: (*Alias)(a), OutOffset: a.Offset, OutSize: a.Size} return json.Marshal(aux) } @@ -815,6 +835,7 @@ func (a *Array) UnmarshalJSON(b []byte) (err error) { aux := &struct { *Alias RawOffset json.RawMessage `json:"OFFSET,omitempty"` + RawSize json.RawMessage `json:"SIZE,omitempty"` }{Alias: (*Alias)(a)} dec := json.NewDecoder(bytes.NewReader(b)) @@ -824,6 +845,7 @@ func (a *Array) UnmarshalJSON(b []byte) (err error) { return } + // Offsets if len(aux.RawOffset) == 0 { return } @@ -855,6 +877,38 @@ func (a *Array) UnmarshalJSON(b []byte) (err error) { a.Offset = out } + if len(aux.RawSize) == 0 { + return + } + + // Sizes + var rawSizes []interface{} + if err = json.Unmarshal(aux.RawSize, &rawSizes); err != nil { + return + } + + if len(rawSizes) == 0 { + return + } + + switch rawSizes[0].(type) { + case string: + out := make([]int64, len(rawSizes)) + for i, o := range rawSizes { + out[i], err = strconv.ParseInt(o.(string), 10, 64) + if err != nil { + return + } + } + a.Size = out + case float64: + out := make([]int32, len(rawSizes)) + for i, o := range rawSizes { + out[i] = int32(o.(float64)) + } + a.Size = out + } + return nil } @@ -1050,6 +1104,44 @@ func arrayFromJSON(mem memory.Allocator, dt arrow.DataType, arr Array) arrow.Arr memory.NewBufferBytes(arrow.Int64Traits.CastToBytes(arr.Offset.([]int64)))}, []arrow.ArrayData{elems}, nulls, 0) + case *arrow.ListViewType: + valids := validsFromJSON(arr.Valids) + elems := arrayFromJSON(mem, dt.Elem(), arr.Children[0]) + defer elems.Release() + + bitmap := validsToBitmap(valids, mem) + defer bitmap.Release() + + nulls := arr.Count - bitutil.CountSetBits(bitmap.Bytes(), 0, arr.Count) + var offsets, sizes *memory.Buffer + if arr.Count == 0 { + emptyBuffer := memory.NewBufferBytes(nil) + offsets, sizes = emptyBuffer, emptyBuffer + } else { + offsets = 
memory.NewBufferBytes(arrow.Int32Traits.CastToBytes(arr.Offset.([]int32))) + sizes = memory.NewBufferBytes(arrow.Int32Traits.CastToBytes(arr.Size.([]int32))) + } + return array.NewData(dt, arr.Count, []*memory.Buffer{bitmap, offsets, sizes}, []arrow.ArrayData{elems}, nulls, 0) + + case *arrow.LargeListViewType: + valids := validsFromJSON(arr.Valids) + elems := arrayFromJSON(mem, dt.Elem(), arr.Children[0]) + defer elems.Release() + + bitmap := validsToBitmap(valids, mem) + defer bitmap.Release() + + nulls := arr.Count - bitutil.CountSetBits(bitmap.Bytes(), 0, arr.Count) + var offsets, sizes *memory.Buffer + if arr.Count == 0 { + emptyBuffer := memory.NewBufferBytes(nil) + offsets, sizes = emptyBuffer, emptyBuffer + } else { + offsets = memory.NewBufferBytes(arrow.Int64Traits.CastToBytes(arr.Offset.([]int64))) + sizes = memory.NewBufferBytes(arrow.Int64Traits.CastToBytes(arr.Size.([]int64))) + } + return array.NewData(dt, arr.Count, []*memory.Buffer{bitmap, offsets, sizes}, []arrow.ArrayData{elems}, nulls, 0) + case *arrow.FixedSizeListType: valids := validsFromJSON(arr.Valids) elems := arrayFromJSON(mem, dt.Elem(), arr.Children[0]) @@ -1422,6 +1514,41 @@ func arrayToJSON(field arrow.Field, arr arrow.Array) Array { }, } + case *array.ListView: + o := Array{ + Name: field.Name, + Count: arr.Len(), + Valids: validsToJSON(arr), + Offset: arr.Offsets(), + Size: arr.Sizes(), + Children: []Array{ + arrayToJSON(arrow.Field{Name: "item", Type: arr.DataType().(*arrow.ListViewType).Elem()}, arr.ListValues()), + }, + } + return o + + case *array.LargeListView: + offsets := arr.Offsets() + strOffsets := make([]string, len(offsets)) + for i, o := range offsets { + strOffsets[i] = strconv.FormatInt(o, 10) + } + sizes := arr.Sizes() + strSizes := make([]string, len(sizes)) + for i, s := range sizes { + strSizes[i] = strconv.FormatInt(s, 10) + } + return Array{ + Name: field.Name, + Count: arr.Len(), + Valids: validsToJSON(arr), + Offset: strOffsets, + Size: strSizes, + Children: 
[]Array{ + arrayToJSON(arrow.Field{Name: "item", Type: arr.DataType().(*arrow.LargeListViewType).Elem()}, arr.ListValues()), + }, + } + case *array.Map: o := Array{ Name: field.Name, diff --git a/go/arrow/internal/arrjson/arrjson_test.go b/go/arrow/internal/arrjson/arrjson_test.go index 882dc9a0d860a..a469195a36ede 100644 --- a/go/arrow/internal/arrjson/arrjson_test.go +++ b/go/arrow/internal/arrjson/arrjson_test.go @@ -34,6 +34,7 @@ func TestReadWrite(t *testing.T) { wantJSONs["primitives"] = makePrimitiveWantJSONs() wantJSONs["structs"] = makeStructsWantJSONs() wantJSONs["lists"] = makeListsWantJSONs() + wantJSONs["listviews"] = makeListViewsWantJSONs() wantJSONs["strings"] = makeStringsWantJSONs() wantJSONs["fixed_size_lists"] = makeFixedSizeListsWantJSONs() wantJSONs["fixed_width_types"] = makeFixedWidthTypesWantJSONs() @@ -1558,6 +1559,247 @@ func makeListsWantJSONs() string { }` } +func makeListViewsWantJSONs() string { + return `{ + "schema": { + "fields": [ + { + "name": "listview_nullable", + "type": { + "name": "listview" + }, + "nullable": true, + "children": [ + { + "name": "item", + "type": { + "name": "int", + "isSigned": true, + "bitWidth": 32 + }, + "nullable": true, + "children": [] + } + ] + } + ] + }, + "batches": [ + { + "count": 3, + "columns": [ + { + "name": "listview_nullable", + "count": 3, + "VALIDITY": [ + 1, + 1, + 1 + ], + "children": [ + { + "name": "item", + "count": 15, + "VALIDITY": [ + 1, + 0, + 0, + 1, + 1, + 1, + 0, + 0, + 1, + 1, + 1, + 0, + 0, + 1, + 1 + ], + "DATA": [ + 1, + 0, + 0, + 4, + 5, + 11, + 0, + 0, + 14, + 15, + 21, + 0, + 0, + 24, + 25 + ] + } + ], + "OFFSET": [ + 0, + 5, + 10 + ], + "SIZE": [ + 5, + 5, + 5 + ] + } + ] + }, + { + "count": 3, + "columns": [ + { + "name": "listview_nullable", + "count": 3, + "VALIDITY": [ + 1, + 1, + 1 + ], + "children": [ + { + "name": "item", + "count": 15, + "VALIDITY": [ + 1, + 0, + 0, + 1, + 1, + 1, + 0, + 0, + 1, + 1, + 1, + 0, + 0, + 1, + 1 + ], + "DATA": [ + -1, + 0, + 0, + 
-4, + -5, + -11, + 0, + 0, + -14, + -15, + -21, + 0, + 0, + -24, + -25 + ] + } + ], + "OFFSET": [ + 0, + 5, + 10 + ], + "SIZE": [ + 5, + 5, + 5 + ] + } + ] + }, + { + "count": 3, + "columns": [ + { + "name": "listview_nullable", + "count": 3, + "VALIDITY": [ + 1, + 0, + 1 + ], + "children": [ + { + "name": "item", + "count": 15, + "VALIDITY": [ + 1, + 0, + 0, + 1, + 1, + 1, + 0, + 0, + 1, + 1, + 1, + 0, + 0, + 1, + 1 + ], + "DATA": [ + -1, + 0, + 0, + -4, + -5, + -11, + 0, + 0, + -14, + -15, + -21, + 0, + 0, + -24, + -25 + ] + } + ], + "OFFSET": [ + 0, + 5, + 10 + ], + "SIZE": [ + 5, + 5, + 5 + ] + } + ] + }, + { + "count": 0, + "columns": [ + { + "name": "listview_nullable", + "count": 0, + "children": [ + { + "name": "item", + "count": 0 + } + ], + "OFFSET": [ + ], + "SIZE": [ + ] + } + ] + } + ] +}` +} + func makeFixedSizeListsWantJSONs() string { return `{ "schema": { From 9b5372011a42d7972716ea0703db20aaa28ad356 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Thu, 31 Aug 2023 12:07:55 +0200 Subject: [PATCH 11/38] list.go: Fix style changes --- go/arrow/array/list.go | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/go/arrow/array/list.go b/go/arrow/array/list.go index 0c109af0f58a4..40ac12f2f6ae2 100644 --- a/go/arrow/array/list.go +++ b/go/arrow/array/list.go @@ -431,7 +431,7 @@ func (b *baseListBuilder) Append(v bool) { b.appendNextOffset() } -func (b *baseListBuilder) AppendWithSize(v bool, list_size int) { +func (b *baseListBuilder) AppendWithSize(v bool, _ int) { b.Append(v) } @@ -1045,20 +1045,20 @@ func (b *baseListViewBuilder) Release() { } } -func (b *baseListViewBuilder) appendDimensions(offset int, list_size int) { +func (b *baseListViewBuilder) appendDimensions(offset int, listSize int) { b.appendOffsetVal(offset) - b.appendSizeVal(list_size) + b.appendSizeVal(listSize) } func (b *baseListViewBuilder) Append(v bool) { debug.Assert(false, "baseListViewBuilder.Append should never be called -- use 
AppendWithSize instead") } -func (b *baseListViewBuilder) AppendWithSize(v bool, list_size int) { - debug.Assert(v || list_size == 0, "invalid list-view should have size 0") +func (b *baseListViewBuilder) AppendWithSize(v bool, listSize int) { + debug.Assert(v || listSize == 0, "invalid list-view should have size 0") b.Reserve(1) b.unsafeAppendBoolToBitmap(v) - b.appendDimensions(b.values.Len(), list_size) + b.appendDimensions(b.values.Len(), listSize) } func (b *baseListViewBuilder) AppendNull() { From 69fe50a2a93d6d24bece196be1a1fc37d27cf3b8 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Thu, 31 Aug 2023 12:57:40 +0200 Subject: [PATCH 12/38] map.go: Fix style issues --- go/arrow/array/map.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/go/arrow/array/map.go b/go/arrow/array/map.go index d7d847ec44ff0..9945a90ce495e 100644 --- a/go/arrow/array/map.go +++ b/go/arrow/array/map.go @@ -222,7 +222,7 @@ func (b *MapBuilder) Append(v bool) { b.listBuilder.Append(v) } -func (b *MapBuilder) AppendWithSize(v bool, list_size int) { +func (b *MapBuilder) AppendWithSize(v bool, _ int) { b.Append(v) } From 22ce90d7f5fcc5a26a324d7c9d99e1aa5cfb4c85 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Tue, 5 Sep 2023 16:20:30 -0300 Subject: [PATCH 13/38] list.go: Add debug.Assert checks to list-like setData() --- go/arrow/array/list.go | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/go/arrow/array/list.go b/go/arrow/array/list.go index 40ac12f2f6ae2..ae4829cb0c2e4 100644 --- a/go/arrow/array/list.go +++ b/go/arrow/array/list.go @@ -90,6 +90,7 @@ func (a *List) newListValue(i int) arrow.Array { } func (a *List) setData(data *Data) { + debug.Assert(len(data.buffers) >= 2, "list data should have 2 buffers") a.array.setData(data) vals := data.buffers[1] if vals != nil { @@ -221,6 +222,7 @@ func (a *LargeList) newListValue(i int) arrow.Array { } func (a *LargeList) setData(data *Data) { + debug.Assert(len(data.buffers) >= 2, "list data 
should have 2 buffers") a.array.setData(data) vals := data.buffers[1] if vals != nil { @@ -683,6 +685,7 @@ func (a *ListView) newListValue(i int) arrow.Array { } func (a *ListView) setData(data *Data) { + debug.Assert(len(data.buffers) >= 3, "list-view data should have 3 buffers") a.array.setData(data) offsets := data.buffers[1] if offsets != nil { @@ -829,6 +832,7 @@ func (a *LargeListView) newListValue(i int) arrow.Array { } func (a *LargeListView) setData(data *Data) { + debug.Assert(len(data.buffers) >= 3, "list-view data should have 3 buffers") a.array.setData(data) offsets := data.buffers[1] if offsets != nil { From a651320cc173dafaa3a9f76713c3cbe43b910533 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Fri, 15 Sep 2023 12:32:21 -0300 Subject: [PATCH 14/38] list.go: Use IsNull() instead of !isValid() in String() --- go/arrow/array/list.go | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/go/arrow/array/list.go b/go/arrow/array/list.go index ae4829cb0c2e4..5573265f78939 100644 --- a/go/arrow/array/list.go +++ b/go/arrow/array/list.go @@ -72,7 +72,7 @@ func (a *List) String() string { if i > 0 { o.WriteString(" ") } - if !a.IsValid(i) { + if a.IsNull(i) { o.WriteString(NullValueStr) continue } @@ -204,7 +204,7 @@ func (a *LargeList) String() string { if i > 0 { o.WriteString(" ") } - if !a.IsValid(i) { + if a.IsNull(i) { o.WriteString(NullValueStr) continue } @@ -667,7 +667,7 @@ func (a *ListView) String() string { if i > 0 { o.WriteString(" ") } - if !a.IsValid(i) { + if a.IsNull(i) { o.WriteString(NullValueStr) continue } @@ -814,7 +814,7 @@ func (a *LargeListView) String() string { if i > 0 { o.WriteString(" ") } - if !a.IsValid(i) { + if a.IsNull(i) { o.WriteString(NullValueStr) continue } From 2058b5c71f210a6098e32e1e871b0d241b32f4b6 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Tue, 19 Sep 2023 01:06:56 -0300 Subject: [PATCH 15/38] list.go: Remove bogus assert from baseListViewBuilder::newData() --- 
go/arrow/array/list.go | 1 - 1 file changed, 1 deletion(-) diff --git a/go/arrow/array/list.go b/go/arrow/array/list.go index 5573265f78939..0ddb673a3206c 100644 --- a/go/arrow/array/list.go +++ b/go/arrow/array/list.go @@ -1177,7 +1177,6 @@ func (b *LargeListViewBuilder) NewLargeListViewArray() (a *LargeListView) { } func (b *baseListViewBuilder) newData() (data *Data) { - debug.Assert(b.offsets.Len() == b.sizes.Len(), "offsets and sizes should have the same length") values := b.values.NewArray() defer values.Release() From d337d61c13fd6b8e41287d542acbf6d58405542c Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Tue, 5 Sep 2023 17:09:01 -0300 Subject: [PATCH 16/38] list_test.go: Add tests of list-views with out-of-order offsets --- go/arrow/array/list.go | 7 +- go/arrow/array/list_test.go | 276 ++++++++++++++++++++++++++++++++++++ 2 files changed, 281 insertions(+), 2 deletions(-) diff --git a/go/arrow/array/list.go b/go/arrow/array/list.go index 0ddb673a3206c..4d63ed43ab17e 100644 --- a/go/arrow/array/list.go +++ b/go/arrow/array/list.go @@ -1049,7 +1049,9 @@ func (b *baseListViewBuilder) Release() { } } -func (b *baseListViewBuilder) appendDimensions(offset int, listSize int) { +func (b *baseListViewBuilder) AppendDimensions(offset int, listSize int) { + b.Reserve(1) + b.unsafeAppendBoolToBitmap(true) b.appendOffsetVal(offset) b.appendSizeVal(listSize) } @@ -1062,7 +1064,8 @@ func (b *baseListViewBuilder) AppendWithSize(v bool, listSize int) { debug.Assert(v || listSize == 0, "invalid list-view should have size 0") b.Reserve(1) b.unsafeAppendBoolToBitmap(v) - b.appendDimensions(b.values.Len(), listSize) + b.appendOffsetVal(b.values.Len()) + b.appendSizeVal(listSize) } func (b *baseListViewBuilder) AppendNull() { diff --git a/go/arrow/array/list_test.go b/go/arrow/array/list_test.go index e387cfed8dd70..3a7f9ad0222c4 100644 --- a/go/arrow/array/list_test.go +++ b/go/arrow/array/list_test.go @@ -126,7 +126,102 @@ func TestListArray(t *testing.T) { } 
}) } +} + +// Like the list-view tests in TestListArray, but with out-of-order offsets. +func TestListViewArray(t *testing.T) { + tests := []struct { + typeID arrow.Type + offsets interface{} + sizes interface{} + dt arrow.DataType + }{ + {arrow.LIST_VIEW, []int32{5, 0, 0, 1}, []int32{3, 0, 0, 4}, arrow.ListViewOf(arrow.PrimitiveTypes.Int32)}, + {arrow.LARGE_LIST_VIEW, []int64{5, 0, 0, 1}, []int64{3, 0, 0, 4}, arrow.LargeListViewOf(arrow.PrimitiveTypes.Int32)}, + } + + for _, tt := range tests { + t.Run(tt.typeID.String(), func(t *testing.T) { + pool := memory.NewCheckedAllocator(memory.NewGoAllocator()) + defer pool.AssertSize(t, 0) + + var ( + vs = []int32{-1, 3, 4, 5, 6, 0, 1, 2} + lengths = []int{3, 0, 0, 4} + isValid = []bool{true, false, true, true} + ) + + lb := array.NewBuilder(pool, tt.dt).(array.VarLenListLikeBuilder) + defer lb.Release() + + for i := 0; i < 10; i++ { + switch lvb := lb.(type) { + case *array.ListViewBuilder: + lvb.AppendDimensions(5, 3) + lb.AppendNull() + lvb.AppendDimensions(0, 0) + lvb.AppendDimensions(1, 4) + case *array.LargeListViewBuilder: + lvb.AppendDimensions(5, 3) + lb.AppendNull() + lvb.AppendDimensions(0, 0) + lvb.AppendDimensions(1, 4) + } + + vb := lb.ValueBuilder().(*array.Int32Builder) + vb.Reserve(len(vs)) + vb.AppendValues(vs, []bool{false, true, true, true, true, true, true, true}) + + arr := lb.NewArray().(array.ListLike) + defer arr.Release() + + arr.Retain() + arr.Release() + + if got, want := arr.DataType().ID(), tt.typeID; got != want { + t.Fatalf("got=%v, want=%v", got, want) + } + + if got, want := arr.Len(), len(isValid); got != want { + t.Fatalf("got=%d, want=%d", got, want) + } + + for i := range lengths { + if got, want := arr.IsValid(i), isValid[i]; got != want { + t.Fatalf("got[%d]=%v, want[%d]=%v", i, got, i, want) + } + if got, want := arr.IsNull(i), !isValid[i]; got != want { + t.Fatalf("got[%d]=%v, want[%d]=%v", i, got, i, want) + } + } + + var gotOffsets, gotSizes interface{} + switch tt.typeID { + 
case arrow.LIST_VIEW: + arr := arr.(*array.ListView) + gotOffsets = arr.Offsets() + gotSizes = arr.Sizes() + case arrow.LARGE_LIST_VIEW: + arr := arr.(*array.LargeListView) + gotOffsets = arr.Offsets() + gotSizes = arr.Sizes() + } + + if !reflect.DeepEqual(gotOffsets, tt.offsets) { + t.Fatalf("got=%v, want=%v", gotOffsets, tt.offsets) + } + + if !reflect.DeepEqual(gotSizes, tt.sizes) { + t.Fatalf("got=%v, want=%v", gotSizes, tt.sizes) + } + varr := arr.ListValues().(*array.Int32) + if got, want := varr.Int32Values(), vs; !reflect.DeepEqual(got, want) { + t.Fatalf("got=%v, want=%v", got, want) + } + } + }) + } } func TestListArrayEmpty(t *testing.T) { @@ -251,6 +346,90 @@ func TestListArrayBulkAppend(t *testing.T) { } } +func TestListViewArrayBulkAppend(t *testing.T) { + tests := []struct { + typeID arrow.Type + offsets interface{} + sizes interface{} + dt arrow.DataType + }{ + {arrow.LIST_VIEW, []int32{5, 0, 0, 1}, []int32{3, 0, 0, 4}, arrow.ListViewOf(arrow.PrimitiveTypes.Int32)}, + {arrow.LARGE_LIST_VIEW, []int64{5, 0, 0, 1}, []int64{3, 0, 0, 4}, arrow.LargeListViewOf(arrow.PrimitiveTypes.Int32)}, + } + + for _, tt := range tests { + t.Run(tt.typeID.String(), func(t *testing.T) { + pool := memory.NewCheckedAllocator(memory.NewGoAllocator()) + defer pool.AssertSize(t, 0) + + var ( + vs = []int32{-1, 3, 4, 5, 6, 0, 1, 2} + lengths = []int{3, 0, 0, 4} + isValid = []bool{true, false, true, true} + ) + + lb := array.NewBuilder(pool, tt.dt).(array.VarLenListLikeBuilder) + defer lb.Release() + vb := lb.ValueBuilder().(*array.Int32Builder) + vb.Reserve(len(vs)) + + switch tt.typeID { + case arrow.LIST_VIEW: + lb.(*array.ListViewBuilder).AppendValuesWithSizes(tt.offsets.([]int32), tt.sizes.([]int32), isValid) + case arrow.LARGE_LIST_VIEW: + lb.(*array.LargeListViewBuilder).AppendValuesWithSizes(tt.offsets.([]int64), tt.sizes.([]int64), isValid) + } + for _, v := range vs { + vb.Append(v) + } + + arr := lb.NewArray().(array.VarLenListLike) + defer arr.Release() + + if got, 
want := arr.DataType().ID(), tt.typeID; got != want { + t.Fatalf("got=%v, want=%v", got, want) + } + + if got, want := arr.Len(), len(isValid); got != want { + t.Fatalf("got=%d, want=%d", got, want) + } + + for i := range lengths { + if got, want := arr.IsValid(i), isValid[i]; got != want { + t.Fatalf("got[%d]=%v, want[%d]=%v", i, got, i, want) + } + if got, want := arr.IsNull(i), !isValid[i]; got != want { + t.Fatalf("got[%d]=%v, want[%d]=%v", i, got, i, want) + } + } + + var gotOffsets, gotSizes interface{} + switch tt.typeID { + case arrow.LIST_VIEW: + arr := arr.(*array.ListView) + gotOffsets = arr.Offsets() + gotSizes = arr.Sizes() + case arrow.LARGE_LIST_VIEW: + arr := arr.(*array.LargeListView) + gotOffsets = arr.Offsets() + gotSizes = arr.Sizes() + } + + if !reflect.DeepEqual(gotOffsets, tt.offsets) { + t.Fatalf("got=%v, want=%v", gotOffsets, tt.offsets) + } + if !reflect.DeepEqual(gotSizes, tt.sizes) { + t.Fatalf("got=%v, want=%v", gotSizes, tt.sizes) + } + + varr := arr.ListValues().(*array.Int32) + if got, want := varr.Int32Values(), vs; !reflect.DeepEqual(got, want) { + t.Fatalf("got=%v, want=%v", got, want) + } + }) + } +} + func TestListArraySlice(t *testing.T) { tests := []struct { typeID arrow.Type @@ -362,6 +541,103 @@ func TestListArraySlice(t *testing.T) { } } +func TestLisViewtArraySlice(t *testing.T) { + tests := []struct { + typeID arrow.Type + offsets interface{} + sizes interface{} + dt arrow.DataType + }{ + {arrow.LIST_VIEW, []int32{5, 0, 0, 1}, []int32{3, 0, 0, 4}, arrow.ListViewOf(arrow.PrimitiveTypes.Int32)}, + {arrow.LARGE_LIST_VIEW, []int64{5, 0, 0, 1}, []int64{3, 0, 0, 4}, arrow.LargeListViewOf(arrow.PrimitiveTypes.Int32)}, + } + + for _, tt := range tests { + t.Run(tt.typeID.String(), func(t *testing.T) { + pool := memory.NewCheckedAllocator(memory.NewGoAllocator()) + defer pool.AssertSize(t, 0) + + var ( + vs = []int32{-1, 3, 4, 5, 6, 0, 1, 2} + lengths = []int{3, 0, 0, 4} + isValid = []bool{true, false, true, true} + ) + + lb := 
array.NewBuilder(pool, tt.dt).(array.VarLenListLikeBuilder) + defer lb.Release() + vb := lb.ValueBuilder().(*array.Int32Builder) + vb.Reserve(len(vs)) + + switch tt.typeID { + case arrow.LIST_VIEW: + lb.(*array.ListViewBuilder).AppendValuesWithSizes(tt.offsets.([]int32), tt.sizes.([]int32), isValid) + case arrow.LARGE_LIST_VIEW: + lb.(*array.LargeListViewBuilder).AppendValuesWithSizes(tt.offsets.([]int64), tt.sizes.([]int64), isValid) + } + for _, v := range vs { + vb.Append(v) + } + + arr := lb.NewArray().(array.VarLenListLike) + defer arr.Release() + + if got, want := arr.DataType().ID(), tt.typeID; got != want { + t.Fatalf("got=%v, want=%v", got, want) + } + + if got, want := arr.Len(), len(isValid); got != want { + t.Fatalf("got=%d, want=%d", got, want) + } + + for i := range lengths { + if got, want := arr.IsValid(i), isValid[i]; got != want { + t.Fatalf("got[%d]=%v, want[%d]=%v", i, got, i, want) + } + if got, want := arr.IsNull(i), !isValid[i]; got != want { + t.Fatalf("got[%d]=%v, want[%d]=%v", i, got, i, want) + } + } + + var gotOffsets, gotSizes interface{} + switch tt.typeID { + case arrow.LIST_VIEW: + arr := arr.(*array.ListView) + gotOffsets = arr.Offsets() + gotSizes = arr.Sizes() + case arrow.LARGE_LIST_VIEW: + arr := arr.(*array.LargeListView) + gotOffsets = arr.Offsets() + gotSizes = arr.Sizes() + } + + if !reflect.DeepEqual(gotOffsets, tt.offsets) { + t.Fatalf("got=%v, want=%v", gotOffsets, tt.offsets) + } + + if !reflect.DeepEqual(gotSizes, tt.sizes) { + t.Fatalf("got=%v, want=%v", gotSizes, tt.sizes) + } + + varr := arr.ListValues().(*array.Int32) + if got, want := varr.Int32Values(), vs; !reflect.DeepEqual(got, want) { + t.Fatalf("got=%v, want=%v", got, want) + } + + if got, want := arr.String(), `[[0 1 2] (null) [] [3 4 5 6]]`; got != want { + t.Fatalf("got=%q, want=%q", got, want) + } + assert.Equal(t, "[0,1,2]", arr.ValueStr(0)) + + sub := array.NewSlice(arr, 1, 4).(array.ListLike) + defer sub.Release() + + if got, want := sub.String(), 
`[(null) [] [3 4 5 6]]`; got != want { + t.Fatalf("got=%q, want=%q", got, want) + } + }) + } +} + func TestListStringRoundTrip(t *testing.T) { // 1. create array mem := memory.NewCheckedAllocator(memory.NewGoAllocator()) From 7f254735990c2dff755848aaf925d545ee94f041 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Tue, 5 Sep 2023 22:51:17 -0300 Subject: [PATCH 17/38] list_test.go: Unify all string roundtrip tests --- go/arrow/array/list_test.go | 184 ++++++++---------------------------- 1 file changed, 37 insertions(+), 147 deletions(-) diff --git a/go/arrow/array/list_test.go b/go/arrow/array/list_test.go index 3a7f9ad0222c4..129736ffea36c 100644 --- a/go/arrow/array/list_test.go +++ b/go/arrow/array/list_test.go @@ -638,170 +638,60 @@ func TestLisViewtArraySlice(t *testing.T) { } } -func TestListStringRoundTrip(t *testing.T) { +func TestVarLenListLikeStringRoundTrip(t *testing.T) { // 1. create array mem := memory.NewCheckedAllocator(memory.NewGoAllocator()) defer mem.AssertSize(t, 0) - b := array.NewListBuilder(mem, arrow.PrimitiveTypes.Int32) - defer b.Release() - vb := b.ValueBuilder().(*array.Int32Builder) - - var values = [][]int32{ - {0, 1, 2, 3, 4, 5, 6}, - {1, 2, 3, 4, 5, 6, 7}, - {2, 3, 4, 5, 6, 7, 8}, - {3, 4, 5, 6, 7, 8, 9}, - } - for _, value := range values { - b.AppendNull() - b.Append(true) - for _, el := range value { - vb.Append(el) - vb.AppendNull() - } - b.Append(false) + builders := []array.VarLenListLikeBuilder{ + array.NewListBuilder(mem, arrow.PrimitiveTypes.Int32), + array.NewListViewBuilder(mem, arrow.PrimitiveTypes.Int32), + array.NewLargeListBuilder(mem, arrow.PrimitiveTypes.Int32), + array.NewLargeListViewBuilder(mem, arrow.PrimitiveTypes.Int32), } - arr := b.NewArray().(*array.List) - defer arr.Release() - - // 2. 
create array via AppendValueFromString - b1 := array.NewListBuilder(mem, arrow.PrimitiveTypes.Int32) - defer b1.Release() - - for i := 0; i < arr.Len(); i++ { - assert.NoError(t, b1.AppendValueFromString(arr.ValueStr(i))) + builders1 := []array.VarLenListLikeBuilder{ + array.NewListBuilder(mem, arrow.PrimitiveTypes.Int32), + array.NewListViewBuilder(mem, arrow.PrimitiveTypes.Int32), + array.NewLargeListBuilder(mem, arrow.PrimitiveTypes.Int32), + array.NewLargeListViewBuilder(mem, arrow.PrimitiveTypes.Int32), } - arr1 := b1.NewArray().(*array.List) - defer arr1.Release() - - assert.True(t, array.Equal(arr, arr1)) -} - -func TestLargeListStringRoundTrip(t *testing.T) { - // 1. create array - mem := memory.NewCheckedAllocator(memory.NewGoAllocator()) - defer mem.AssertSize(t, 0) + for i, b := range builders { + defer b.Release() - b := array.NewLargeListBuilder(mem, arrow.PrimitiveTypes.Int32) - defer b.Release() - vb := b.ValueBuilder().(*array.Int32Builder) + vb := b.ValueBuilder().(*array.Int32Builder) - var values = [][]int32{ - {0, 1, 2, 3, 4, 5, 6}, - {1, 2, 3, 4, 5, 6, 7}, - {2, 3, 4, 5, 6, 7, 8}, - {3, 4, 5, 6, 7, 8, 9}, - } - for _, value := range values { - b.AppendNull() - b.Append(true) - for _, el := range value { - vb.Append(el) - vb.AppendNull() + var values = [][]int32{ + {0, 1, 2, 3, 4, 5, 6}, + {1, 2, 3, 4, 5, 6, 7}, + {2, 3, 4, 5, 6, 7, 8}, + {3, 4, 5, 6, 7, 8, 9}, } - b.Append(false) - } - - arr := b.NewArray().(*array.LargeList) - defer arr.Release() - - // 2. create array via AppendValueFromString - b1 := array.NewLargeListBuilder(mem, arrow.PrimitiveTypes.Int32) - defer b1.Release() - - for i := 0; i < arr.Len(); i++ { - assert.NoError(t, b1.AppendValueFromString(arr.ValueStr(i))) - } - - arr1 := b1.NewArray().(*array.LargeList) - defer arr1.Release() - - assert.True(t, array.Equal(arr, arr1)) -} - -func TestListViewStringRoundTrip(t *testing.T) { - // 1. 
create array - mem := memory.NewCheckedAllocator(memory.NewGoAllocator()) - defer mem.AssertSize(t, 0) - - b := array.NewListViewBuilder(mem, arrow.PrimitiveTypes.Int32) - defer b.Release() - vb := b.ValueBuilder().(*array.Int32Builder) - - var values = [][]int32{ - {0, 1, 2, 3, 4, 5, 6}, - {1, 2, 3, 4, 5, 6, 7}, - {2, 3, 4, 5, 6, 7, 8}, - {3, 4, 5, 6, 7, 8, 9}, - } - for _, value := range values { - b.AppendNull() - b.AppendWithSize(true, 2*len(value)) - for _, el := range value { - vb.Append(el) - vb.AppendNull() + for _, value := range values { + b.AppendNull() + b.AppendWithSize(true, 2*len(value)) + for _, el := range value { + vb.Append(el) + vb.AppendNull() + } + b.AppendWithSize(false, 0) } - b.Append(false) - } - arr := b.NewArray().(*array.ListView) - defer arr.Release() + arr := b.NewArray() + defer arr.Release() - // 2. create array via AppendValueFromString - b1 := array.NewListViewBuilder(mem, arrow.PrimitiveTypes.Int32) - defer b1.Release() - - for i := 0; i < arr.Len(); i++ { - assert.NoError(t, b1.AppendValueFromString(arr.ValueStr(i))) - } - - arr1 := b1.NewArray().(*array.ListView) - defer arr1.Release() - - assert.True(t, array.Equal(arr, arr1)) -} + // 2. create array via AppendValueFromString + b1 := builders1[i] + defer b1.Release() -func TestLargeListViewStringRoundTrip(t *testing.T) { - // 1. 
create array - mem := memory.NewCheckedAllocator(memory.NewGoAllocator()) - defer mem.AssertSize(t, 0) - - b := array.NewLargeListViewBuilder(mem, arrow.PrimitiveTypes.Int32) - defer b.Release() - vb := b.ValueBuilder().(*array.Int32Builder) - - var values = [][]int32{ - {0, 1, 2, 3, 4, 5, 6}, - {1, 2, 3, 4, 5, 6, 7}, - {2, 3, 4, 5, 6, 7, 8}, - {3, 4, 5, 6, 7, 8, 9}, - } - for _, value := range values { - b.AppendNull() - b.AppendWithSize(true, 2*len(value)) - for _, el := range value { - vb.Append(el) - vb.AppendNull() + for i := 0; i < arr.Len(); i++ { + assert.NoError(t, b1.AppendValueFromString(arr.ValueStr(i))) } - b.Append(false) - } - - arr := b.NewArray().(*array.LargeListView) - defer arr.Release() - // 2. create array via AppendValueFromString - b1 := array.NewLargeListViewBuilder(mem, arrow.PrimitiveTypes.Int32) - defer b1.Release() + arr1 := b1.NewArray() + defer arr1.Release() - for i := 0; i < arr.Len(); i++ { - assert.NoError(t, b1.AppendValueFromString(arr.ValueStr(i))) + assert.True(t, array.Equal(arr, arr1)) } - - arr1 := b1.NewArray().(*array.LargeListView) - defer arr1.Release() - - assert.True(t, array.Equal(arr, arr1)) } From 2387981a5701b835972a21d3a69005db6e57082b Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Wed, 6 Sep 2023 11:31:40 -0300 Subject: [PATCH 18/38] list_test.go: Add a string roundtrip test for list-views with out-of-order offsets --- go/arrow/array/list_test.go | 57 +++++++++++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) diff --git a/go/arrow/array/list_test.go b/go/arrow/array/list_test.go index 129736ffea36c..89b98d009369e 100644 --- a/go/arrow/array/list_test.go +++ b/go/arrow/array/list_test.go @@ -695,3 +695,60 @@ func TestVarLenListLikeStringRoundTrip(t *testing.T) { assert.True(t, array.Equal(arr, arr1)) } } + +// Test the string round-trip for a list-view containing out-of-order offsets. +func TestListViewStringRoundTrip(t *testing.T) { + // 1.
create array + mem := memory.NewCheckedAllocator(memory.NewGoAllocator()) + defer mem.AssertSize(t, 0) + + builders := []array.VarLenListLikeBuilder{ + array.NewListViewBuilder(mem, arrow.PrimitiveTypes.Int32), + array.NewLargeListViewBuilder(mem, arrow.PrimitiveTypes.Int32), + } + + builders1 := []array.VarLenListLikeBuilder{ + array.NewListViewBuilder(mem, arrow.PrimitiveTypes.Int32), + array.NewLargeListViewBuilder(mem, arrow.PrimitiveTypes.Int32), + } + + for i, b := range builders { + defer b.Release() + + switch lvb := b.(type) { + case *array.ListViewBuilder: + lvb.AppendDimensions(5, 3) + b.AppendNull() + lvb.AppendDimensions(0, 0) + lvb.AppendDimensions(1, 4) + case *array.LargeListViewBuilder: + lvb.AppendDimensions(5, 3) + b.AppendNull() + lvb.AppendDimensions(0, 0) + lvb.AppendDimensions(1, 4) + } + + vb := b.ValueBuilder().(*array.Int32Builder) + + vs := []int32{-1, 3, 4, 5, 6, 0, 1, 2} + isValid := []bool{false, true, true, true, true, true, true, true} + vb.Reserve(len(vs)) + vb.AppendValues(vs, isValid) + + arr := b.NewArray() + defer arr.Release() + + // 2. 
create array via AppendValueFromString + b1 := builders1[i] + defer b1.Release() + + for i := 0; i < arr.Len(); i++ { + assert.NoError(t, b1.AppendValueFromString(arr.ValueStr(i))) + } + + arr1 := b1.NewArray() + defer arr1.Release() + + assert.True(t, array.Equal(arr, arr1)) + } +} From 300f79297245c112f513f13a323a621e17e7f7d4 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Wed, 13 Sep 2023 01:03:33 -0300 Subject: [PATCH 19/38] list.go: Add validation for ListView and LargeListView --- go/arrow/array/list.go | 169 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 169 insertions(+) diff --git a/go/arrow/array/list.go b/go/arrow/array/list.go index 4d63ed43ab17e..547893d0537bc 100644 --- a/go/arrow/array/list.go +++ b/go/arrow/array/list.go @@ -26,6 +26,7 @@ import ( "github.com/apache/arrow/go/v14/arrow/bitutil" "github.com/apache/arrow/go/v14/arrow/internal/debug" "github.com/apache/arrow/go/v14/arrow/memory" + "github.com/apache/arrow/go/v14/internal/bitutils" "github.com/apache/arrow/go/v14/internal/json" ) @@ -925,6 +926,174 @@ func (a *LargeListView) Release() { a.values.Release() } +// Accessors for offsets and sizes to make ListView and LargeListView validation generic.
+type offsetsAndSizes interface { + offsetAt(slot int64) int64 + sizeAt(slot int64) int64 +} + +var _ offsetsAndSizes = (*ListView)(nil) +var _ offsetsAndSizes = (*LargeListView)(nil) + +func (a *ListView) offsetAt(slot int64) int64 { return int64(a.offsets[int64(a.data.offset)+slot]) } + +func (a *ListView) sizeAt(slot int64) int64 { return int64(a.sizes[int64(a.data.offset)+slot]) } + +func (a *LargeListView) offsetAt(slot int64) int64 { return a.offsets[int64(a.data.offset)+slot] } + +func (a *LargeListView) sizeAt(slot int64) int64 { return a.sizes[int64(a.data.offset)+slot] } + +func outOfBoundsListViewOffset(l offsetsAndSizes, slot int64, offsetLimit int64) error { + offset := l.offsetAt(slot) + return fmt.Errorf("%w: Offset invariant failure: offset for slot %d out of bounds. Expected %d to be at least 0 and less than %d", arrow.ErrInvalid, slot, offset, offsetLimit) +} + +func outOfBoundsListViewSize(l offsetsAndSizes, slot int64, offsetLimit int64) error { + size := l.sizeAt(slot) + if size < 0 { + return fmt.Errorf("%w: Offset invariant failure: size for slot %d out of bounds: %d < 0", arrow.ErrInvalid, slot, size) + } else { + offset := l.offsetAt(slot) + return fmt.Errorf("%w: Offset invariant failure: size for slot %d out of bounds: %d + %d > %d", arrow.ErrInvalid, slot, offset, size, offsetLimit) + } +} + +// Pre-condition: Basic validation has already been performed +func (a *array) fullyValidateOffsetsAndSizes(l offsetsAndSizes, offsetLimit int64) error { + validity := a.NullBitmapBytes() + + slot := int64(0) + if validity != nil { + counter := bitutils.NewBitBlockCounter(validity, int64(a.Offset()), int64(a.Len())) + var block bitutils.BitBlockCount + for i := 0; i < a.Len(); i += int(block.Len) { + block = counter.NextWord() + if block.NoneSet() { + continue + } + allSet := block.AllSet() + for j := 0; j < int(block.Len); j += 1 { + slot = int64(i + j) + valid := allSet || bitutil.BitIsSet(validity, a.Offset()+int(slot)) + if valid { + size := 
l.sizeAt(slot) + if size > 0 { + offset := l.offsetAt(slot) + if offset < 0 || offset > offsetLimit { + return outOfBoundsListViewOffset(l, slot, offsetLimit) + } + if size > offsetLimit-offset { + return outOfBoundsListViewSize(l, slot, offsetLimit) + } + } else if size < 0 { + return outOfBoundsListViewSize(l, slot, offsetLimit) + } + } + } + } + } else { + for ; slot < int64(a.Len()); slot += 1 { + size := l.sizeAt(slot) + if size > 0 { + offset := l.offsetAt(slot) + if offset < 0 || offset > offsetLimit { + return outOfBoundsListViewOffset(l, slot, offsetLimit) + } + if size > offsetLimit-int64(offset) { + return outOfBoundsListViewSize(l, slot, offsetLimit) + } + } else if size < 0 { + return outOfBoundsListViewSize(l, slot, offsetLimit) + } + } + } + + return nil +} + +func (a *array) validateOffsetsAndMaybeSizes(l offsetsAndSizes, offsetByteWidth int, isListView bool, offsetLimit int64, fullValidation bool) error { + nonEmpty := a.Len() > 0 + if a.data.buffers[1] == nil { + // For length 0, an empty offsets buffer is accepted (ARROW-544). 
+ if nonEmpty { + return fmt.Errorf("non-empty array but offsets are null") + } else { + return nil + } + } + if isListView { + if a.data.buffers[2] == nil { + if nonEmpty { + return fmt.Errorf("non-empty array but sizes are null") + } else { + return nil + } + } + } + + var requiredOffsets int + if nonEmpty { + requiredOffsets = a.Len() + a.Offset() + if !isListView { + requiredOffsets += 1 + } + } else { + requiredOffsets = 0 + } + offsetsByteSize := a.data.buffers[1].Len() + if offsetsByteSize/offsetByteWidth < requiredOffsets { + return fmt.Errorf("offsets buffer size (bytes): %d isn't large enough for length: %d and offset: %d", + offsetsByteSize, a.Len(), a.Offset()) + } + if isListView { + requiredSizes := a.Len() + a.Offset() + sizesBytesSize := a.data.buffers[2].Len() + if sizesBytesSize/offsetByteWidth < requiredSizes { + return fmt.Errorf("sizes buffer size (bytes): %d isn't large enough for length: %d and offset: %d", + sizesBytesSize, a.Len(), a.Offset()) + } + } + + if fullValidation && requiredOffsets > 0 { + if isListView { + return a.fullyValidateOffsetsAndSizes(l, offsetLimit) + } else { + // TODO: implement validation of List and LargeList + // return fullyValidateOffsets(offset_limit) + return nil + } + } + return nil +} + +func (a *ListView) validate(fullValidation bool) error { + values := a.array.data.childData[0] + offsetLimit := values.Len() + return a.array.validateOffsetsAndMaybeSizes(a, 4, true, int64(offsetLimit), fullValidation) +} + +func (a *ListView) Validate() error { + return a.validate(false) +} + +func (a *ListView) ValidateFull() error { + return a.validate(true) +} + +func (a *LargeListView) validate(fullValidation bool) error { + values := a.array.data.childData[0] + offsetLimit := values.Len() + return a.array.validateOffsetsAndMaybeSizes(a, 8, true, int64(offsetLimit), fullValidation) +} + +func (a *LargeListView) Validate() error { + return a.validate(false) +} + +func (a *LargeListView) ValidateFull() error { + return 
a.validate(true) +} + type baseListViewBuilder struct { builder From eac4b32edd4912d769e5a862a3540fdc94bd5a70 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Tue, 12 Sep 2023 20:33:42 -0300 Subject: [PATCH 20/38] random_array_gen.go: Add ability to generate random ListView arrays --- .../internal/testing/gen/random_array_gen.go | 98 +++++++++++++++++++ 1 file changed, 98 insertions(+) diff --git a/go/arrow/internal/testing/gen/random_array_gen.go b/go/arrow/internal/testing/gen/random_array_gen.go index ab9e015163730..4b62128d5e2aa 100644 --- a/go/arrow/internal/testing/gen/random_array_gen.go +++ b/go/arrow/internal/testing/gen/random_array_gen.go @@ -22,6 +22,7 @@ import ( "github.com/apache/arrow/go/v14/arrow" "github.com/apache/arrow/go/v14/arrow/array" "github.com/apache/arrow/go/v14/arrow/bitutil" + "github.com/apache/arrow/go/v14/arrow/internal/debug" "github.com/apache/arrow/go/v14/arrow/memory" "golang.org/x/exp/rand" "gonum.org/v1/gonum/stat/distuv" @@ -376,6 +377,103 @@ func (r *RandomArrayGenerator) Numeric(dt arrow.Type, size int64, min, max int64 panic("invalid type for random numeric array") } +// Generate an array of random offsets based on a given sizes array for +// list-view arrays. +// +// Pre-condition: every non-null sizes[i] <= valuesLength. +func viewOffsetsFromLengthsArray( + seed uint64, avgLength int32, valuesLength int32, + sizesArray *array.Int32, forceEmptyNulls bool, + zeroUndefinedOffsets bool) *memory.Buffer { + sizes := sizesArray.Int32Values() + offsets := make([]int32, sizesArray.Len()) + + offsetDeltaRand := rand.New(rand.NewSource(seed)) + sampleOffsetDelta := func() int32 { + return int32(offsetDeltaRand.Int63n(2*int64(avgLength)) - int64(avgLength)) + } + offsetBase := int32(0) + for i := 0; i < sizesArray.Len(); i += 1 { + // We want to always sample the offsetDeltaRand to make sure different + // options regarding nulls and empty views don't affect the other offsets. 
+ offset := offsetBase + sampleOffsetDelta() + if sizesArray.IsNull(i) { + if forceEmptyNulls { + sizes[i] = 0 + } + if zeroUndefinedOffsets { + offsets[i] = 0 + } else { + offsets[i] = offset + } + continue + } + + size := sizes[i] + if size == 0 { + if zeroUndefinedOffsets { + offsets[i] = 0 + } else { + offsets[i] = offset + } + } else { + // Ensure that the size is not too large. + if size > valuesLength { + size = valuesLength + sizes[i] = size // Fix the size. + } + // Ensure the offset is not negative or too large. + if offset < 0 { + offset = 0 + } else if offset > valuesLength-size { + offset = valuesLength - size + } + offsets[i] = offset + } + offsetBase += avgLength + } + + return memory.NewBufferBytes(arrow.Int32Traits.CastToBytes(offsets)) +} + +func (r *RandomArrayGenerator) listView(dt arrow.VarLenListLikeType, length int64, + minLength, maxLength int32, nullprob float64, + forceEmptyNulls bool, zeroUndefinedOffsets bool) *array.ListView { + lengths := r.Int32(length, minLength, maxLength, nullprob).(*array.Int32) + defer lengths.Release() + + // List-views don't have to be disjoint, so let's make the valuesLength a + // multiple of the average list-view size. To make sure every list view + // into the values array can fit, it should be at least maxLength. 
+ avgLength := minLength + (maxLength-minLength)/2 + valuesLength := int64(avgLength) * (length - int64(lengths.NullN())) + if valuesLength < int64(maxLength) { + valuesLength = int64(maxLength) + } + debug.Assert(valuesLength < math.MaxInt32, "valuesLength must be less than math.MaxInt32") + + values := r.ArrayOf(dt.Elem().ID(), int64(valuesLength), 0.0) + defer values.Release() + offsets := viewOffsetsFromLengthsArray(r.seed, avgLength, int32(valuesLength), lengths, + forceEmptyNulls, zeroUndefinedOffsets) + + buffers := []*memory.Buffer{ + memory.NewBufferBytes(lengths.NullBitmapBytes()), + offsets, + memory.NewBufferBytes(arrow.Int32Traits.CastToBytes(lengths.Int32Values())), + } + childData := []arrow.ArrayData{values.Data()} + data := array.NewData(dt, int(length), buffers, childData, int(lengths.NullN()), 0) + defer data.Release() + return array.NewListViewData(data) +} + +func (r *RandomArrayGenerator) ListView(dt arrow.VarLenListLikeType, length int64, minLength, maxLength int32, nullprob float64) *array.ListView { + forceEmptyNulls := false + zeroUndefineOffsets := false + return r.listView(dt, length, minLength, maxLength, nullprob, forceEmptyNulls, zeroUndefineOffsets) +} + func (r *RandomArrayGenerator) ArrayOf(dt arrow.Type, size int64, nullprob float64) arrow.Array { switch dt { case arrow.BOOL: From 4c62c2094b5d95bd84fddadc9a4f63ad41641e51 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Wed, 6 Sep 2023 18:00:26 -0300 Subject: [PATCH 21/38] list.go: Add rangeOfValuesUsed() to support concatenation of list-views --- go/arrow/array/list.go | 251 ++++++++++++++++++++++++++++++++++++ go/arrow/array/list_test.go | 110 ++++++++++++++++ 2 files changed, 361 insertions(+) diff --git a/go/arrow/array/list.go b/go/arrow/array/list.go index 547893d0537bc..879ef012f1347 100644 --- a/go/arrow/array/list.go +++ b/go/arrow/array/list.go @@ -19,6 +19,7 @@ package array import ( "bytes" "fmt" + "math" "strings" "sync/atomic" @@ -1450,6 +1451,256 @@ func 
(b *baseListViewBuilder) UnmarshalJSON(data []byte) error { return b.Unmarshal(dec) } +// Pre-conditions: +// +// input.DataType() is ListViewType +// input.Len() > 0 && input.NullN() != input.Len() +func minListViewOffset32(input arrow.ArrayData) int32 { + var bitmap []byte + if input.Buffers()[0] != nil { + bitmap = input.Buffers()[0].Bytes() + } + offsets := arrow.Int32Traits.CastFromBytes(input.Buffers()[1].Bytes())[input.Offset():] + sizes := arrow.Int32Traits.CastFromBytes(input.Buffers()[2].Bytes())[input.Offset():] + + isNull := func(i int) bool { + return bitmap != nil && bitutil.BitIsNotSet(bitmap, input.Offset()+i) + } + + // It's very likely that the first non-null non-empty list-view starts at + // offset 0 of the child array. + i := 0 + for i < input.Len() && (isNull(i) || sizes[i] == 0) { + i += 1 + } + if i >= input.Len() { + return 0 + } + minOffset := offsets[i] + if minOffset == 0 { + // early exit: offset 0 found already + return 0 + } + + // Slow path: scan the buffers entirely. + i += 1 + for ; i < input.Len(); i += 1 { + if isNull(i) { + continue + } + offset := offsets[i] + if offset < minOffset && sizes[i] > 0 { + minOffset = offset + } + } + return minOffset +} + +// Find the maximum offset+size in a LIST_VIEW array. 
+// +// Pre-conditions: +// +// input.DataType() is ListViewType +// input.Len() > 0 && input.NullN() != input.Len() +func maxListViewOffset32(input arrow.ArrayData) int { + inputOffset := input.Offset() + var bitmap []byte + if input.Buffers()[0] != nil { + bitmap = input.Buffers()[0].Bytes() + } + offsets := arrow.Int32Traits.CastFromBytes(input.Buffers()[1].Bytes())[inputOffset:] + sizes := arrow.Int32Traits.CastFromBytes(input.Buffers()[2].Bytes())[inputOffset:] + + isNull := func(i int) bool { + return bitmap != nil && bitutil.BitIsNotSet(bitmap, inputOffset+i) + } + + i := input.Len() - 1 // safe because input.Len() > 0 + for i != 0 && (isNull(i) || sizes[i] == 0) { + i -= 1 + } + offset := offsets[i] + size := sizes[i] + if i == 0 { + if isNull(i) || sizes[i] == 0 { + return 0 + } else { + return int(offset + size) + } + } + + values := input.Children()[0] + maxEnd := int(offsets[i] + sizes[i]) + if maxEnd == values.Len() { + // Early-exit: maximum possible view-end found already. + return maxEnd + } + + // Slow path: scan the buffers entirely. + for ; i >= 0; i -= 1 { + offset := offsets[i] + size := sizes[i] + if size > 0 && !isNull(i) { + if int(offset+size) > maxEnd { + maxEnd = int(offset + size) + if maxEnd == values.Len() { + return maxEnd + } + } + } + } + return maxEnd +} + +// Pre-conditions: +// +// input.DataType() is LargeListViewType +// input.Len() > 0 && input.NullN() != input.Len() +func minLargeListViewOffset64(input arrow.ArrayData) int64 { + var bitmap []byte + if input.Buffers()[0] != nil { + bitmap = input.Buffers()[0].Bytes() + } + offsets := arrow.Int64Traits.CastFromBytes(input.Buffers()[1].Bytes())[input.Offset():] + sizes := arrow.Int64Traits.CastFromBytes(input.Buffers()[2].Bytes())[input.Offset():] + + isNull := func(i int) bool { + return bitmap != nil && bitutil.BitIsNotSet(bitmap, input.Offset()+i) + } + + // It's very likely that the first non-null non-empty list-view starts at + // offset 0 of the child array. 
+ i := 0 + for i < input.Len() && (isNull(i) || sizes[i] == 0) { + i += 1 + } + if i >= input.Len() { + return 0 + } + minOffset := offsets[i] + if minOffset == 0 { + // early exit: offset 0 found already + return 0 + } + + // Slow path: scan the buffers entirely. + i += 1 + for ; i < input.Len(); i += 1 { + if isNull(i) { + continue + } + offset := offsets[i] + if offset < minOffset && sizes[i] > 0 { + minOffset = offset + } + } + return minOffset +} + +// Find the maximum offset+size in a LARGE_LIST_VIEW array. +// +// Pre-conditions: +// +// input.DataType() is LargeListViewType +// input.Len() > 0 && input.NullN() != input.Len() +func maxLargeListViewOffset64(input arrow.ArrayData) int64 { + inputOffset := input.Offset() + var bitmap []byte + if input.Buffers()[0] != nil { + bitmap = input.Buffers()[0].Bytes() + } + offsets := arrow.Int64Traits.CastFromBytes(input.Buffers()[1].Bytes())[inputOffset:] + sizes := arrow.Int64Traits.CastFromBytes(input.Buffers()[2].Bytes())[inputOffset:] + + isNull := func(i int) bool { + return bitmap != nil && bitutil.BitIsNotSet(bitmap, inputOffset+i) + } + + // It's very likely that the last non-null non-empty list-view ends at the + // end of the child array, so we check that first and potentially early-return. + i := input.Len() - 1 // safe because input.Len() > 0 + for i != 0 && (isNull(i) || sizes[i] == 0) { + i -= 1 + } + offset := offsets[i] + size := sizes[i] + if i == 0 { + if isNull(i) || sizes[i] == 0 { + return 0 + } else { + return offset + size + } + } + + if offset > math.MaxInt64-size { + // Early-exit: 64-bit overflow detected. This is not possible on a + // valid list-view, but we return the maximum possible value to + // avoid undefined behavior. + return math.MaxInt64 + } + values := input.Children()[0] + maxEnd := offsets[i] + sizes[i] + if maxEnd == int64(values.Len()) { + // Early-exit: maximum possible view-end found already. + return maxEnd + } + + // Slow path: scan the buffers entirely. 
+ for ; i >= 0; i -= 1 { + offset := offsets[i] + size := sizes[i] + if size > 0 && !isNull(i) { + if offset+size > maxEnd { + if offset > math.MaxInt64-size { + // 64-bit overflow detected. This is not possible on a valid list-view, + // but we saturate maxEnd to the maximum possible value to avoid + // undefined behavior. + return math.MaxInt64 + } + maxEnd = offset + size + if maxEnd == int64(values.Len()) { + return maxEnd + } + } + } + } + return maxEnd +} + +func rangeOfValuesUsed(input arrow.ArrayData) (int, int) { + if input.Len() == 0 || input.NullN() == input.Len() { + return 0, 0 + } + var minOffset, maxEnd int + switch input.DataType().(type) { + case *arrow.ListViewType: + minOffset = int(minListViewOffset32(input)) + maxEnd = maxListViewOffset32(input) + case *arrow.LargeListViewType: + minOffset = int(minLargeListViewOffset64(input)) + maxEnd = int(maxLargeListViewOffset64(input)) + case *arrow.ListType: + offsets := arrow.Int32Traits.CastFromBytes(input.Buffers()[1].Bytes())[input.Offset():] + minOffset = int(offsets[0]) + maxEnd = int(offsets[len(offsets)-1]) + case *arrow.LargeListType: + offsets := arrow.Int64Traits.CastFromBytes(input.Buffers()[1].Bytes())[input.Offset():] + minOffset = int(offsets[0]) + maxEnd = int(offsets[len(offsets)-1]) + case *arrow.MapType: + offsets := arrow.Int32Traits.CastFromBytes(input.Buffers()[1].Bytes())[input.Offset():] + minOffset = int(offsets[0]) + maxEnd = int(offsets[len(offsets)-1]) + } + return minOffset, maxEnd - minOffset +} + +// Returns the smallest contiguous range of values of the child array that are +// referenced by all the list values in the input array. 
+func RangeOfValuesUsed(input VarLenListLike) (int, int) { + return rangeOfValuesUsed(input.Data()) +} + var ( _ arrow.Array = (*List)(nil) _ arrow.Array = (*LargeList)(nil) diff --git a/go/arrow/array/list_test.go b/go/arrow/array/list_test.go index 89b98d009369e..bf3555b3f6603 100644 --- a/go/arrow/array/list_test.go +++ b/go/arrow/array/list_test.go @@ -752,3 +752,113 @@ func TestListViewStringRoundTrip(t *testing.T) { assert.True(t, array.Equal(arr, arr1)) } } + +func TestRangeOfValuesUsed(t *testing.T) { + tests := []struct { + typeID arrow.Type + dt arrow.DataType + }{ + {arrow.LIST, arrow.ListOf(arrow.PrimitiveTypes.Int16)}, + {arrow.LARGE_LIST, arrow.LargeListOf(arrow.PrimitiveTypes.Int16)}, + {arrow.LIST_VIEW, arrow.ListViewOf(arrow.PrimitiveTypes.Int16)}, + {arrow.LARGE_LIST_VIEW, arrow.LargeListViewOf(arrow.PrimitiveTypes.Int16)}, + } + for _, tt := range tests { + t.Run(tt.typeID.String(), func(t *testing.T) { + pool := memory.NewCheckedAllocator(memory.NewGoAllocator()) + defer pool.AssertSize(t, 0) + + isListView := tt.typeID == arrow.LIST_VIEW || tt.typeID == arrow.LARGE_LIST_VIEW + + bldr := array.NewBuilder(pool, tt.dt).(array.VarLenListLikeBuilder) + defer bldr.Release() + + var arr array.VarLenListLike + + // Empty array + arr = bldr.NewArray().(array.VarLenListLike) + defer arr.Release() + offset, len := array.RangeOfValuesUsed(arr) + assert.Equal(t, 0, offset) + assert.Equal(t, 0, len) + + // List-like array with only nulls + bldr.AppendNulls(3) + arr = bldr.NewArray().(array.VarLenListLike) + defer arr.Release() + offset, len = array.RangeOfValuesUsed(arr) + assert.Equal(t, 0, offset) + assert.Equal(t, 0, len) + + // Array with nulls and non-nulls (starting at a non-zero offset) + vb := bldr.ValueBuilder().(*array.Int16Builder) + vb.Append(-2) + vb.Append(-1) + bldr.AppendWithSize(false, 0) + bldr.AppendWithSize(true, 2) + vb.Append(0) + vb.Append(1) + bldr.AppendWithSize(true, 3) + vb.Append(2) + vb.Append(3) + vb.Append(4) + if isListView { 
+ vb.Append(10) + vb.Append(11) + } + arr = bldr.NewArray().(array.VarLenListLike) + defer arr.Release() + offset, len = array.RangeOfValuesUsed(arr) + assert.Equal(t, 2, offset) + assert.Equal(t, 5, len) + + // Overlapping list-views + // [null, [0, 1, 2, 3, 4, 5], [1, 2], null, [4], null, null] + vb = bldr.ValueBuilder().(*array.Int16Builder) + vb.Append(-2) + vb.Append(-1) + bldr.AppendWithSize(false, 0) + if isListView { + bldr.AppendWithSize(true, 6) + vb.Append(0) + bldr.AppendWithSize(true, 2) + vb.Append(1) + vb.Append(2) + vb.Append(3) + bldr.AppendWithSize(false, 0) + bldr.AppendWithSize(true, 1) + vb.Append(4) + vb.Append(5) + // -- used range ends here -- + vb.Append(10) + vb.Append(11) + } else { + bldr.AppendWithSize(true, 6) + vb.Append(0) + vb.Append(1) + vb.Append(2) + vb.Append(3) + vb.Append(4) + vb.Append(5) + bldr.AppendWithSize(true, 2) + vb.Append(1) + vb.Append(2) + bldr.AppendWithSize(false, 0) + bldr.AppendWithSize(true, 1) + vb.Append(4) + } + bldr.AppendNulls(2) + arr = bldr.NewArray().(array.VarLenListLike) + defer arr.Release() + + // Check the range + offset, len = array.RangeOfValuesUsed(arr) + assert.Equal(t, 2, offset) + if isListView { + assert.Equal(t, 6, len) + } else { + assert.Equal(t, 9, len) + } + }) + } +} From eac653e009ba57345b93eb133d24348141bd8be1 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Mon, 11 Sep 2023 21:08:35 -0300 Subject: [PATCH 22/38] concat.go: Implement concatenation of list-views --- go/arrow/array/concat.go | 130 ++++++++++++++++++++++++++++++++++ go/arrow/array/concat_test.go | 19 +++++ 2 files changed, 149 insertions(+) diff --git a/go/arrow/array/concat.go b/go/arrow/array/concat.go index f00a36fec1171..b748ec2872386 100644 --- a/go/arrow/array/concat.go +++ b/go/arrow/array/concat.go @@ -355,6 +355,124 @@ func concatOffsets(buffers []*memory.Buffer, byteWidth int, mem memory.Allocator } } +func sumArraySizes(data []arrow.ArrayData) int { + outSize := 0 + for _, arr := range data { + 
outSize += arr.Len() + } + return outSize +} + +func putListViewOffsets32(in arrow.ArrayData, displacement int32, out *memory.Buffer, outOff int) { + debug.Assert(in.DataType().ID() == arrow.LIST_VIEW, "putListViewOffsets32: expected LIST_VIEW data") + inOff, inLen := in.Offset(), in.Len() + if inLen == 0 { + return + } + bitmap := in.Buffers()[0] + srcOffsets := arrow.Int32Traits.CastFromBytes(in.Buffers()[1].Bytes())[inOff : inOff+inLen] + srcSizes := arrow.Int32Traits.CastFromBytes(in.Buffers()[2].Bytes())[inOff : inOff+inLen] + isValidAndNonEmpty := func(i int) bool { + return (bitmap == nil || bitutil.BitIsSet(bitmap.Bytes(), inOff+i)) && srcSizes[i] > 0 + } + + dstOffsets := arrow.Int32Traits.CastFromBytes(out.Bytes()) + for i, offset := range srcOffsets { + debug.Assert(!isValidAndNonEmpty(i) || offset+displacement >= 0, "putListViewOffsets32: offset underflow while concatenating arrays") + dstOffsets[outOff+i] = offset + displacement + } +} + +func putListViewOffsets64(in arrow.ArrayData, displacement int64, out *memory.Buffer, outOff int) { + debug.Assert(in.DataType().ID() == arrow.LARGE_LIST_VIEW, "putListViewOffsets64: expected LARGE_LIST_VIEW data") + inOff, inLen := in.Offset(), in.Len() + if in.Len() == 0 { + return + } + bitmap := in.Buffers()[0] + srcOffsets := arrow.Int64Traits.CastFromBytes(in.Buffers()[1].Bytes())[in.Offset():(in.Offset() + in.Len())] + srcSizes := arrow.Int64Traits.CastFromBytes(in.Buffers()[2].Bytes())[inOff : inOff+inLen] + isValidAndNonEmpty := func(i int) bool { + return (bitmap == nil || bitutil.BitIsSet(bitmap.Bytes(), inOff+i)) && srcSizes[i] > 0 + } + + dstOffsets := arrow.Int64Traits.CastFromBytes(out.Bytes()) + for i, offset := range srcOffsets { + debug.Assert(!isValidAndNonEmpty(i) || offset+displacement >= 0, + "putListViewOffsets64: offset underflow while concatenating arrays") + dstOffsets[outOff+i] = offset + displacement + } +} + +// Concatenate buffers holding list-view offsets into a single buffer of offsets 
+// +// valueRanges contains the relevant ranges of values in the child array actually +// referenced by the views. Most commonly, these ranges will start from 0, +// but when that is not the case, we need to adjust the displacement of offsets. +// The concatenated child array does not contain values from the beginning +// if they are not referenced by any view. +func concatListViewOffsets(data []arrow.ArrayData, byteWidth int, valueRanges []rng, mem memory.Allocator) (*memory.Buffer, error) { + outSize := sumArraySizes(data) + if byteWidth == 4 && outSize > math.MaxInt32 { + return nil, fmt.Errorf("%w: offset overflow while concatenating arrays", arrow.ErrInvalid) + } + out := memory.NewResizableBuffer(mem) + out.Resize(byteWidth * outSize) + + numChildValues, elementsLength := 0, 0 + for i, arr := range data { + displacement := numChildValues - valueRanges[i].offset + if byteWidth == 4 { + putListViewOffsets32(arr, int32(displacement), out, elementsLength) + } else { + putListViewOffsets64(arr, int64(displacement), out, elementsLength) + } + elementsLength += arr.Len() + numChildValues += valueRanges[i].len + } + debug.Assert(elementsLength == outSize, "implementation error") + + return out, nil +} + +func concatListView(data []arrow.ArrayData, offsetType arrow.FixedWidthDataType, out *Data, mem memory.Allocator) (err error) { + // Calculate the ranges of values that each list-view array uses + valueRanges := make([]rng, len(data)) + for i, input := range data { + offset, len := rangeOfValuesUsed(input) + valueRanges[i].offset = offset + valueRanges[i].len = len + } + + // Gather the children ranges of each input array + childData := gatherChildrenRanges(data, 0, valueRanges) + for _, c := range childData { + defer c.Release() + } + + // Concatenate the values + values, err := concat(childData, mem) + if err != nil { + return err + } + + // Concatenate the offsets + offsetBuffer, err := concatListViewOffsets(data, offsetType.Bytes(), valueRanges, mem) + if 
err != nil { + return err + } + + // Concatenate the sizes + sizeBuffers := gatherBuffersFixedWidthType(data, 2, offsetType) + sizeBuffer := concatBuffers(sizeBuffers, mem) + + out.childData = []arrow.ArrayData{values} + out.buffers[1] = offsetBuffer + out.buffers[2] = sizeBuffer + + return nil +} + // concat is the implementation for actually performing the concatenation of the arrow.ArrayData // objects that we can call internally for nested types. func concat(data []arrow.ArrayData, mem memory.Allocator) (arr arrow.ArrayData, err error) { @@ -483,6 +601,18 @@ func concat(data []arrow.ArrayData, mem memory.Allocator) (arr arrow.ArrayData, if err != nil { return nil, err } + case *arrow.ListViewType: + offsetType := arrow.PrimitiveTypes.Int32.(arrow.FixedWidthDataType) + err := concatListView(data, offsetType, out, mem) + if err != nil { + return nil, err + } + case *arrow.LargeListViewType: + offsetType := arrow.PrimitiveTypes.Int64.(arrow.FixedWidthDataType) + err := concatListView(data, offsetType, out, mem) + if err != nil { + return nil, err + } case *arrow.FixedSizeListType: childData := gatherChildrenMultiplier(data, 0, int(dt.Len())) for _, c := range childData { diff --git a/go/arrow/array/concat_test.go b/go/arrow/array/concat_test.go index cc4d29cf42460..22331250e1e8e 100644 --- a/go/arrow/array/concat_test.go +++ b/go/arrow/array/concat_test.go @@ -78,6 +78,8 @@ func TestConcatenate(t *testing.T) { {arrow.BinaryTypes.LargeString}, {arrow.ListOf(arrow.PrimitiveTypes.Int8)}, {arrow.LargeListOf(arrow.PrimitiveTypes.Int8)}, + {arrow.ListViewOf(arrow.PrimitiveTypes.Int8)}, + // {arrow.LargeListViewOf(arrow.PrimitiveTypes.Int8)}, {arrow.FixedSizeListOf(3, arrow.PrimitiveTypes.Int8)}, {arrow.StructOf()}, {arrow.MapOf(arrow.PrimitiveTypes.Uint16, arrow.PrimitiveTypes.Int8)}, @@ -200,6 +202,14 @@ func (cts *ConcatTestSuite) generateArr(size int64, nullprob float64) arrow.Arra } } return bldr.NewArray() + case arrow.LIST_VIEW: + arr := 
cts.rng.ListView(cts.dt.(arrow.VarLenListLikeType), size, 0, 20, nullprob) + err := arr.ValidateFull() + cts.NoError(err) + return arr + case arrow.LARGE_LIST_VIEW: + // XXX + return nil case arrow.FIXED_SIZE_LIST: const listsize = 3 valuesSize := size * listsize @@ -317,11 +327,20 @@ func (cts *ConcatTestSuite) TestCheckConcat() { slices := cts.slices(arr, offsets) for _, s := range slices { + if s.DataType().ID() == arrow.LIST_VIEW { + err := s.(*array.ListView).ValidateFull() + cts.NoError(err) + } defer s.Release() } actual, err := array.Concatenate(slices, cts.mem) cts.NoError(err) + if arr.DataType().ID() == arrow.LIST_VIEW { + lv := actual.(*array.ListView) + err := lv.ValidateFull() + cts.NoError(err) + } defer actual.Release() cts.Truef(array.Equal(expected, actual), "expected: %s\ngot: %s\n", expected, actual) From 5b92d589b3fb2822a44f26457f0891ec583e47fb Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Wed, 20 Sep 2023 00:28:46 -0300 Subject: [PATCH 23/38] random_array_gen.go: Extend the code to generate large list-views --- .../internal/testing/gen/random_array_gen.go | 113 ++++++++++++++++-- 1 file changed, 101 insertions(+), 12 deletions(-) diff --git a/go/arrow/internal/testing/gen/random_array_gen.go b/go/arrow/internal/testing/gen/random_array_gen.go index 4b62128d5e2aa..f80fc494dce19 100644 --- a/go/arrow/internal/testing/gen/random_array_gen.go +++ b/go/arrow/internal/testing/gen/random_array_gen.go @@ -381,7 +381,7 @@ func (r *RandomArrayGenerator) Numeric(dt arrow.Type, size int64, min, max int64 // list-view arrays. // // Pre-condition: every non-null sizes[i] <= valuesLength. 
-func viewOffsetsFromLengthsArray( +func viewOffsetsFromLengthsArray32( seed uint64, avgLength int32, valuesLength int32, sizesArray *array.Int32, forceEmptyNulls bool, zeroUndefinedOffsets bool) *memory.Buffer { @@ -436,10 +436,76 @@ func viewOffsetsFromLengthsArray( return memory.NewBufferBytes(arrow.Int32Traits.CastToBytes(offsets)) } -func (r *RandomArrayGenerator) listView(dt arrow.VarLenListLikeType, length int64, - minLength, maxLength int32, nullprob float64, - forceEmptyNulls bool, zeroUndefinedOffsets bool) *array.ListView { - lengths := r.Int32(length, minLength, maxLength, nullprob).(*array.Int32) +// Generate an array of random offsets based on a given sizes array for +// large list-view arrays. +// +// Pre-condition: every non-null sizes[i] <= valuesLength. +func viewOffsetsFromLengthsArray64( + seed uint64, avgLength int64, valuesLength int64, + sizesArray *array.Int64, forceEmptyNulls bool, + zeroUndefinedOffsets bool) *memory.Buffer { + sizes := sizesArray.Int64Values() + offsets := make([]int64, sizesArray.Len()) + + offsetDeltaRand := rand.New(rand.NewSource(seed)) + sampleOffsetDelta := func() int64 { + return int64(offsetDeltaRand.Int63n(2*avgLength) - avgLength) + } + offsetBase := int64(0) + for i := 0; i < sizesArray.Len(); i += 1 { + // We want to always sample the offsetDeltaRand to make sure different + // options regarding nulls and empty views don't affect the other offsets. + offset := offsetBase + sampleOffsetDelta() + if sizesArray.IsNull(i) { + if forceEmptyNulls { + sizes[i] = 0 + } + if zeroUndefinedOffsets { + offsets[i] = 0 + } else { + offsets[i] = offset + } + continue + } + + size := sizes[i] + if size == 0 { + if zeroUndefinedOffsets { + offsets[i] = 0 + } else { + offsets[i] = offset + } + } else { + // Ensure that the size is not too large. + if size > valuesLength { + size = valuesLength + sizes[i] = size // Fix the size. + } + // Ensure the offset is not negative or too large. 
+ if offset < 0 { + offset = 0 + } else if offset > valuesLength-size { + offset = valuesLength - size + } + offsets[i] = offset + } + offsetBase += avgLength + } + + return memory.NewBufferBytes(arrow.Int64Traits.CastToBytes(offsets)) +} + +// Generate a random data for ListView or LargeListView arrays. +func (r *RandomArrayGenerator) genListViewData(dt arrow.VarLenListLikeType, length int64, + minLength, maxLength int, nullprob float64, + forceEmptyNulls bool, zeroUndefinedOffsets bool) arrow.ArrayData { + offsetByteWidth := dt.Layout().Buffers[1].ByteWidth + var lengths arrow.Array + if offsetByteWidth == 4 { + lengths = r.Int32(length, int32(minLength), int32(maxLength), nullprob) + } else { + lengths = r.Int64(length, int64(minLength), int64(maxLength), nullprob) + } defer lengths.Release() // List-views don't have to be disjoint, so let's make the valuesLength a @@ -450,28 +516,51 @@ func (r *RandomArrayGenerator) listView(dt arrow.VarLenListLikeType, length int6 if valuesLength < int64(maxLength) { valuesLength = int64(maxLength) } - debug.Assert(valuesLength < math.MaxInt32, "valuesLength must be less than math.MaxInt32") + debug.Assert(offsetByteWidth == 8 || valuesLength < math.MaxInt32, + "valuesLength must be less than math.MaxInt32") values := r.ArrayOf(dt.Elem().ID(), int64(valuesLength), 0.0) defer values.Release() - offsets := viewOffsetsFromLengthsArray(r.seed, avgLength, int32(valuesLength), lengths, - forceEmptyNulls, zeroUndefinedOffsets) + + var offsets *memory.Buffer + if offsetByteWidth == 4 { + lengths32 := lengths.(*array.Int32) + offsets = viewOffsetsFromLengthsArray32(r.seed, int32(avgLength), int32(valuesLength), lengths32, + forceEmptyNulls, zeroUndefinedOffsets) + } else { + lengths64 := lengths.(*array.Int64) + offsets = viewOffsetsFromLengthsArray64(r.seed, int64(avgLength), int64(valuesLength), lengths64, + forceEmptyNulls, zeroUndefinedOffsets) + } + defer offsets.Release() buffers := []*memory.Buffer{ 
memory.NewBufferBytes(lengths.NullBitmapBytes()), offsets, - memory.NewBufferBytes(arrow.Int32Traits.CastToBytes(lengths.Int32Values())), + memory.NewBufferBytes(lengths.Data().Buffers()[1].Bytes()), } childData := []arrow.ArrayData{values.Data()} - data := array.NewData(dt, int(length), buffers, childData, int(lengths.NullN()), 0) + return array.NewData(dt, int(length), buffers, childData, int(lengths.NullN()), 0) +} + +func (r *RandomArrayGenerator) ListView(dt arrow.VarLenListLikeType, length int64, + minLength, maxLength int32, nullprob float64) *array.ListView { + forceEmptyNulls := false + zeroUndefineOffsets := false + data := r.genListViewData(dt, length, int(minLength), int(maxLength), nullprob, + forceEmptyNulls, zeroUndefineOffsets) defer data.Release() return array.NewListViewData(data) } -func (r *RandomArrayGenerator) ListView(dt arrow.VarLenListLikeType, length int64, minLength, maxLength int32, nullprob float64) *array.ListView { +func (r *RandomArrayGenerator) LargeListView(dt arrow.VarLenListLikeType, length int64, + minLength, maxLength int64, nullprob float64) *array.LargeListView { forceEmptyNulls := false zeroUndefineOffsets := false - return r.listView(dt, length, minLength, maxLength, nullprob, forceEmptyNulls, zeroUndefineOffsets) + data := r.genListViewData(dt, length, int(minLength), int(maxLength), nullprob, + forceEmptyNulls, zeroUndefineOffsets) + defer data.Release() + return array.NewLargeListViewData(data) } func (r *RandomArrayGenerator) ArrayOf(dt arrow.Type, size int64, nullprob float64) arrow.Array { From 1e3ae75113be972e34c57aa4607ccf95ef53ff66 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Wed, 20 Sep 2023 00:28:09 -0300 Subject: [PATCH 24/38] concat.go: Enable concatenation tests of large list-views --- go/arrow/array/concat_test.go | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/go/arrow/array/concat_test.go b/go/arrow/array/concat_test.go index 22331250e1e8e..c80844f05bacd 100644 --- 
a/go/arrow/array/concat_test.go +++ b/go/arrow/array/concat_test.go @@ -79,7 +79,7 @@ func TestConcatenate(t *testing.T) { {arrow.ListOf(arrow.PrimitiveTypes.Int8)}, {arrow.LargeListOf(arrow.PrimitiveTypes.Int8)}, {arrow.ListViewOf(arrow.PrimitiveTypes.Int8)}, - // {arrow.LargeListViewOf(arrow.PrimitiveTypes.Int8)}, + {arrow.LargeListViewOf(arrow.PrimitiveTypes.Int8)}, {arrow.FixedSizeListOf(3, arrow.PrimitiveTypes.Int8)}, {arrow.StructOf()}, {arrow.MapOf(arrow.PrimitiveTypes.Uint16, arrow.PrimitiveTypes.Int8)}, @@ -208,8 +208,10 @@ func (cts *ConcatTestSuite) generateArr(size int64, nullprob float64) arrow.Arra cts.NoError(err) return arr case arrow.LARGE_LIST_VIEW: - // XXX - return nil + arr := cts.rng.LargeListView(cts.dt.(arrow.VarLenListLikeType), size, 0, 20, nullprob) + err := arr.ValidateFull() + cts.NoError(err) + return arr case arrow.FIXED_SIZE_LIST: const listsize = 3 valuesSize := size * listsize From 9daeb81acf2437fbaea23737f45dec2fe5d253b3 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Wed, 20 Sep 2023 15:04:03 -0300 Subject: [PATCH 25/38] arrdata.go: Add list-views to arrdata.json --- go/arrow/internal/arrdata/arrdata.go | 82 ++++++++++++++++++++++++++++ 1 file changed, 82 insertions(+) diff --git a/go/arrow/internal/arrdata/arrdata.go b/go/arrow/internal/arrdata/arrdata.go index 3b592cf3992aa..0851bff0fe0da 100644 --- a/go/arrow/internal/arrdata/arrdata.go +++ b/go/arrow/internal/arrdata/arrdata.go @@ -41,6 +41,7 @@ func init() { Records["primitives"] = makePrimitiveRecords() Records["structs"] = makeStructsRecords() Records["lists"] = makeListsRecords() + Records["list_views"] = makeListViewsRecords() Records["strings"] = makeStringsRecords() Records["fixed_size_lists"] = makeFixedSizeListsRecords() Records["fixed_width_types"] = makeFixedWidthTypesRecords() @@ -321,6 +322,63 @@ func makeListsRecords() []arrow.Record { return recs } +func makeListViewsRecords() []arrow.Record { + mem := memory.NewGoAllocator() + dtype := 
arrow.ListViewOf(arrow.PrimitiveTypes.Int32) + schema := arrow.NewSchema([]arrow.Field{ + {Name: "list_view_nullable", Type: dtype, Nullable: true}, + }, nil) + + mask := []bool{true, false, false, true, true} + + chunks := [][]arrow.Array{ + { + listViewOf(mem, []arrow.Array{ + arrayOf(mem, []int32{1, 2, 3, 4, 5}, mask), + arrayOf(mem, []int32{11, 12, 13, 14, 15}, mask), + arrayOf(mem, []int32{21, 22, 23, 24, 25}, mask), + }, nil), + }, + { + listViewOf(mem, []arrow.Array{ + arrayOf(mem, []int32{-1, -2, -3, -4, -5}, mask), + arrayOf(mem, []int32{-11, -12, -13, -14, -15}, mask), + arrayOf(mem, []int32{-21, -22, -23, -24, -25}, mask), + }, nil), + }, + { + listViewOf(mem, []arrow.Array{ + arrayOf(mem, []int32{-1, -2, -3, -4, -5}, mask), + arrayOf(mem, []int32{}, []bool{}), + arrayOf(mem, []int32{-21, -22, -23, -24, -25}, mask), + }, []bool{true, false, true}), + }, + { + func() arrow.Array { + bldr := array.NewListViewBuilder(mem, arrow.PrimitiveTypes.Int32) + defer bldr.Release() + + return bldr.NewListViewArray() + }(), + }, + } + + defer func() { + for _, chunk := range chunks { + for _, col := range chunk { + col.Release() + } + } + }() + + recs := make([]arrow.Record, len(chunks)) + for i, chunk := range chunks { + recs[i] = array.NewRecord(schema, chunk, -1) + } + + return recs +} + func makeFixedSizeListsRecords() []arrow.Record { mem := memory.NewGoAllocator() const N = 3 @@ -1439,6 +1497,30 @@ func listOf(mem memory.Allocator, values []arrow.Array, valids []bool) *array.Li return bldr.NewListArray() } +func listViewOf(mem memory.Allocator, values []arrow.Array, valids []bool) *array.ListView { + if mem == nil { + mem = memory.NewGoAllocator() + } + + bldr := array.NewListViewBuilder(mem, values[0].DataType()) + defer bldr.Release() + + valid := func(i int) bool { + return valids[i] + } + + if valids == nil { + valid = func(i int) bool { return true } + } + + for i, value := range values { + bldr.AppendWithSize(valid(i), value.Len()) + 
buildArray(bldr.ValueBuilder(), value) + } + + return bldr.NewListViewArray() +} + func fixedSizeListOf(mem memory.Allocator, n int32, values []arrow.Array, valids []bool) *array.FixedSizeList { if mem == nil { mem = memory.NewGoAllocator() From 98246695364c916f11903a20eac1a7692fd8f68f Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Thu, 21 Sep 2023 00:04:14 -0300 Subject: [PATCH 26/38] arrow/ipc/write.go: Remove unnecessary error from getZeroBasedValueOffsets() --- go/arrow/ipc/writer.go | 18 ++++++------------ go/arrow/ipc/writer_test.go | 6 ++---- 2 files changed, 8 insertions(+), 16 deletions(-) diff --git a/go/arrow/ipc/writer.go b/go/arrow/ipc/writer.go index 7866ec2b41011..5876bab3334dd 100644 --- a/go/arrow/ipc/writer.go +++ b/go/arrow/ipc/writer.go @@ -577,10 +577,7 @@ func (w *recordEncoder) visit(p *Payload, arr arrow.Array) error { case *arrow.BinaryType, *arrow.LargeBinaryType, *arrow.StringType, *arrow.LargeStringType: arr := arr.(array.BinaryLike) - voffsets, err := w.getZeroBasedValueOffsets(arr) - if err != nil { - return fmt.Errorf("could not retrieve zero-based value offsets from %T: %w", arr, err) - } + voffsets := w.getZeroBasedValueOffsets(arr) data := arr.Data() values := data.Buffers()[2] @@ -687,10 +684,7 @@ func (w *recordEncoder) visit(p *Payload, arr arrow.Array) error { w.depth++ case *arrow.MapType, *arrow.ListType, *arrow.LargeListType: arr := arr.(array.ListLike) - voffsets, err := w.getZeroBasedValueOffsets(arr) - if err != nil { - return fmt.Errorf("could not retrieve zero-based value offsets for array %T: %w", arr, err) - } + voffsets := w.getZeroBasedValueOffsets(arr) p.body = append(p.body, voffsets) w.depth-- @@ -716,7 +710,7 @@ func (w *recordEncoder) visit(p *Payload, arr arrow.Array) error { values = array.NewSlice(values, values_offset, values_end) mustRelease = true } - err = w.visit(p, values) + err := w.visit(p, values) if err != nil { return fmt.Errorf("could not visit list element for array %T: %w", arr, 
err) @@ -764,7 +758,7 @@ func (w *recordEncoder) visit(p *Payload, arr arrow.Array) error { return nil } -func (w *recordEncoder) getZeroBasedValueOffsets(arr arrow.Array) (*memory.Buffer, error) { +func (w *recordEncoder) getZeroBasedValueOffsets(arr arrow.Array) *memory.Buffer { data := arr.Data() voffsets := data.Buffers()[1] offsetTraits := arr.DataType().(arrow.OffsetsDataType).OffsetTypeTraits() @@ -806,10 +800,10 @@ func (w *recordEncoder) getZeroBasedValueOffsets(arr arrow.Array) (*memory.Buffe voffsets.Retain() } if voffsets == nil || voffsets.Len() == 0 { - return nil, nil + return nil } - return voffsets, nil + return voffsets } func (w *recordEncoder) rebaseDenseUnionValueOffsets(arr *array.DenseUnion, offsets, lengths []int32) *memory.Buffer { diff --git a/go/arrow/ipc/writer_test.go b/go/arrow/ipc/writer_test.go index 47aa29db91082..da461c3d52272 100644 --- a/go/arrow/ipc/writer_test.go +++ b/go/arrow/ipc/writer_test.go @@ -112,16 +112,14 @@ func TestGetZeroBasedValueOffsets(t *testing.T) { env := &recordEncoder{mem: alloc} - offsets, err := env.getZeroBasedValueOffsets(arr) - require.NoError(t, err) + offsets := env.getZeroBasedValueOffsets(arr) defer offsets.Release() assert.Equal(t, 44, offsets.Len(), "include all offsets if array is not sliced") sl := array.NewSlice(arr, 0, 4) defer sl.Release() - offsets, err = env.getZeroBasedValueOffsets(sl) - require.NoError(t, err) + offsets = env.getZeroBasedValueOffsets(sl) defer offsets.Release() assert.Equal(t, 20, offsets.Len(), "trim trailing offsets after slice") } From 6852a2e81c2615471250bdaf3fb969a69e82a2aa Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Thu, 21 Sep 2023 01:25:24 -0300 Subject: [PATCH 27/38] arrow/ipc: Add IPC support for list-views --- go/arrow/ipc/file_reader.go | 17 ++++++ go/arrow/ipc/metadata.go | 26 ++++++++ go/arrow/ipc/writer.go | 119 +++++++++++++++++++++++++++++++++--- 3 files changed, 155 insertions(+), 7 deletions(-) diff --git a/go/arrow/ipc/file_reader.go 
b/go/arrow/ipc/file_reader.go index 7d799149c2a69..10cb2cae764e6 100644 --- a/go/arrow/ipc/file_reader.go +++ b/go/arrow/ipc/file_reader.go @@ -485,6 +485,12 @@ func (ctx *arrayLoaderContext) loadArray(dt arrow.DataType) arrow.ArrayData { case *arrow.LargeListType: return ctx.loadList(dt) + case *arrow.ListViewType: + return ctx.loadListView(dt) + + case *arrow.LargeListViewType: + return ctx.loadListView(dt) + case *arrow.FixedSizeListType: return ctx.loadFixedSizeList(dt) @@ -606,6 +612,17 @@ func (ctx *arrayLoaderContext) loadList(dt arrow.ListLikeType) arrow.ArrayData { return array.NewData(dt, int(field.Length()), buffers, []arrow.ArrayData{sub}, int(field.NullCount()), 0) } +func (ctx *arrayLoaderContext) loadListView(dt arrow.VarLenListLikeType) arrow.ArrayData { + field, buffers := ctx.loadCommon(dt.ID(), 3) + buffers = append(buffers, ctx.buffer(), ctx.buffer()) + defer releaseBuffers(buffers) + + sub := ctx.loadChild(dt.Elem()) + defer sub.Release() + + return array.NewData(dt, int(field.Length()), buffers, []arrow.ArrayData{sub}, int(field.NullCount()), 0) +} + func (ctx *arrayLoaderContext) loadFixedSizeList(dt *arrow.FixedSizeListType) arrow.ArrayData { field, buffers := ctx.loadCommon(dt.ID(), 1) defer releaseBuffers(buffers) diff --git a/go/arrow/ipc/metadata.go b/go/arrow/ipc/metadata.go index 5c5e41833aea1..9bab47d6fa0cd 100644 --- a/go/arrow/ipc/metadata.go +++ b/go/arrow/ipc/metadata.go @@ -386,6 +386,18 @@ func (fv *fieldVisitor) visit(field arrow.Field) { flatbuf.LargeListStart(fv.b) fv.offset = flatbuf.LargeListEnd(fv.b) + case *arrow.ListViewType: + fv.dtype = flatbuf.TypeListView + fv.kids = append(fv.kids, fieldToFB(fv.b, fv.pos.Child(0), dt.ElemField(), fv.memo)) + flatbuf.ListViewStart(fv.b) + fv.offset = flatbuf.ListViewEnd(fv.b) + + case *arrow.LargeListViewType: + fv.dtype = flatbuf.TypeLargeListView + fv.kids = append(fv.kids, fieldToFB(fv.b, fv.pos.Child(0), dt.ElemField(), fv.memo)) + flatbuf.LargeListViewStart(fv.b) + fv.offset = 
flatbuf.LargeListViewEnd(fv.b) + case *arrow.FixedSizeListType: fv.dtype = flatbuf.TypeFixedSizeList fv.kids = append(fv.kids, fieldToFB(fv.b, fv.pos.Child(0), dt.ElemField(), fv.memo)) @@ -718,6 +730,20 @@ func concreteTypeFromFB(typ flatbuf.Type, data flatbuffers.Table, children []arr dt := arrow.LargeListOfField(children[0]) return dt, nil + case flatbuf.TypeListView: + if len(children) != 1 { + return nil, fmt.Errorf("arrow/ipc: ListView must have exactly 1 child field (got=%d)", len(children)) + } + dt := arrow.ListViewOfField(children[0]) + return dt, nil + + case flatbuf.TypeLargeListView: + if len(children) != 1 { + return nil, fmt.Errorf("arrow/ipc: LargeListView must have exactly 1 child field (got=%d)", len(children)) + } + dt := arrow.LargeListViewOfField(children[0]) + return dt, nil + case flatbuf.TypeFixedSizeList: var dt flatbuf.FixedSizeList dt.Init(data.Bytes, data.Pos) diff --git a/go/arrow/ipc/writer.go b/go/arrow/ipc/writer.go index 5876bab3334dd..a97f47ef4aa43 100644 --- a/go/arrow/ipc/writer.go +++ b/go/arrow/ipc/writer.go @@ -717,6 +717,51 @@ func (w *recordEncoder) visit(p *Payload, arr arrow.Array) error { } w.depth++ + case *arrow.ListViewType, *arrow.LargeListViewType: + data := arr.Data() + arr := arr.(array.VarLenListLike) + offsetTraits := arr.DataType().(arrow.OffsetsDataType).OffsetTypeTraits() + rngOff, rngLen := array.RangeOfValuesUsed(arr) + voffsets := w.getValueOffsetsAtBaseValue(arr, rngOff) + p.body = append(p.body, voffsets) + + vsizes := data.Buffers()[2] + if vsizes != nil { + if data.Offset() != 0 || vsizes.Len() > offsetTraits.BytesRequired(arr.Len()) { + beg := offsetTraits.BytesRequired(data.Offset()) + end := beg + offsetTraits.BytesRequired(data.Len()) + vsizes = memory.NewBufferBytes(vsizes.Bytes()[beg:end]) + } else { + vsizes.Retain() + } + } + p.body = append(p.body, vsizes) + + w.depth-- + var ( + values = arr.ListValues() + mustRelease = false + values_offset = int64(rngOff) + values_end = int64(rngOff + 
rngLen) + ) + defer func() { + if mustRelease { + values.Release() + } + }() + + if arr.Len() > 0 && values_end < int64(values.Len()) { + // must also slice the values + values = array.NewSlice(values, values_offset, values_end) + mustRelease = true + } + err := w.visit(p, values) + + if err != nil { + return fmt.Errorf("could not visit list element for array %T: %w", arr, err) + } + w.depth++ + case *arrow.FixedSizeListType: arr := arr.(*array.FixedSizeList) @@ -764,13 +809,19 @@ func (w *recordEncoder) getZeroBasedValueOffsets(arr arrow.Array) *memory.Buffer offsetTraits := arr.DataType().(arrow.OffsetsDataType).OffsetTypeTraits() offsetBytesNeeded := offsetTraits.BytesRequired(data.Len() + 1) - if data.Offset() != 0 || offsetBytesNeeded < voffsets.Len() { - // if we have a non-zero offset, then the value offsets do not start at - // zero. we must a) create a new offsets array with shifted offsets and - // b) slice the values array accordingly - // - // or if there are more value offsets than values (the array has been sliced) - // we need to trim off the trailing offsets + if voffsets == nil || voffsets.Len() == 0 { + return nil + } + + // if we have a non-zero offset, then the value offsets do not start at + // zero. we must a) create a new offsets array with shifted offsets and + // b) slice the values array accordingly + // + // or if there are more value offsets than values (the array has been sliced) + // we need to trim off the trailing offsets + needsTruncateAndShift := data.Offset() != 0 || offsetBytesNeeded < voffsets.Len() + + if needsTruncateAndShift { shiftedOffsets := memory.NewResizableBuffer(w.mem) shiftedOffsets.Resize(offsetBytesNeeded) @@ -799,10 +850,64 @@ func (w *recordEncoder) getZeroBasedValueOffsets(arr arrow.Array) *memory.Buffer } else { voffsets.Retain() } + + return voffsets +} + +// Truncates the offsets if needed and shifts the values if minOffset > 0. 
+// The offsets returned are corrected assuming the child values are truncated +// and now start at minOffset. +// +// This function only works on offset buffers of ListViews and LargeListViews. +// TODO(felipecrv): Unify this with getZeroBasedValueOffsets. +func (w *recordEncoder) getValueOffsetsAtBaseValue(arr arrow.Array, minOffset int) *memory.Buffer { + data := arr.Data() + voffsets := data.Buffers()[1] + offsetTraits := arr.DataType().(arrow.OffsetsDataType).OffsetTypeTraits() + offsetBytesNeeded := offsetTraits.BytesRequired(data.Len()) + if voffsets == nil || voffsets.Len() == 0 { return nil } + needsTruncate := data.Offset() != 0 || offsetBytesNeeded < voffsets.Len() + needsShift := minOffset > 0 + + if needsTruncate || needsShift { + shiftedOffsets := memory.NewResizableBuffer(w.mem) + shiftedOffsets.Resize(offsetBytesNeeded) + + switch arr.DataType().Layout().Buffers[1].ByteWidth { + case 8: + dest := arrow.Int64Traits.CastFromBytes(shiftedOffsets.Bytes()) + offsets := arrow.Int64Traits.CastFromBytes(voffsets.Bytes())[data.Offset() : data.Offset()+data.Len()] + + if minOffset > 0 { + for i, o := range offsets { + dest[i] = o - int64(minOffset) + } + } else { + copy(dest, offsets) + } + default: + debug.Assert(arr.DataType().Layout().Buffers[1].ByteWidth == 4, "invalid offset bytewidth") + dest := arrow.Int32Traits.CastFromBytes(shiftedOffsets.Bytes()) + offsets := arrow.Int32Traits.CastFromBytes(voffsets.Bytes())[data.Offset() : data.Offset()+data.Len()] + + if minOffset > 0 { + for i, o := range offsets { + dest[i] = o - int32(minOffset) + } + } else { + copy(dest, offsets) + } + } + + voffsets = shiftedOffsets + } else { + voffsets.Retain() + } + return voffsets } From 650f35918225febe42d5a4dc13253e17bb99efa0 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Wed, 27 Sep 2023 17:25:49 -0300 Subject: [PATCH 28/38] arrjson.go: The fixes sent by Matt Topol --- go/arrow/internal/arrjson/arrjson.go | 7 +++-- 
go/arrow/internal/arrjson/arrjson_test.go | 32 ++++++++--------------- 2 files changed, 16 insertions(+), 23 deletions(-) diff --git a/go/arrow/internal/arrjson/arrjson.go b/go/arrow/internal/arrjson/arrjson.go index 580436189f99b..ad87b73fc4ddb 100644 --- a/go/arrow/internal/arrjson/arrjson.go +++ b/go/arrow/internal/arrjson/arrjson.go @@ -1116,7 +1116,7 @@ func arrayFromJSON(mem memory.Allocator, dt arrow.DataType, arr Array) arrow.Arr var offsets, sizes *memory.Buffer if arr.Count == 0 { emptyBuffer := memory.NewBufferBytes(nil) - offsets, sizes = emptyBuffer, emptyBuffer + offsets, sizes = emptyBuffer, emptyBuffer } else { offsets = memory.NewBufferBytes(arrow.Int32Traits.CastToBytes(arr.Offset.([]int32))) sizes = memory.NewBufferBytes(arrow.Int32Traits.CastToBytes(arr.Size.([]int32))) @@ -1135,7 +1135,7 @@ func arrayFromJSON(mem memory.Allocator, dt arrow.DataType, arr Array) arrow.Arr var offsets, sizes *memory.Buffer if arr.Count == 0 { emptyBuffer := memory.NewBufferBytes(nil) - offsets, sizes = emptyBuffer, emptyBuffer + offsets, sizes = emptyBuffer, emptyBuffer } else { offsets = memory.NewBufferBytes(arrow.Int64Traits.CastToBytes(arr.Offset.([]int64))) sizes = memory.NewBufferBytes(arrow.Int64Traits.CastToBytes(arr.Size.([]int64))) @@ -1525,6 +1525,9 @@ func arrayToJSON(field arrow.Field, arr arrow.Array) Array { arrayToJSON(arrow.Field{Name: "item", Type: arr.DataType().(*arrow.ListViewType).Elem()}, arr.ListValues()), }, } + if arr.Len() == 0 { + o.Offset, o.Size = []int32{}, []int32{} + } return o case *array.LargeListView: diff --git a/go/arrow/internal/arrjson/arrjson_test.go b/go/arrow/internal/arrjson/arrjson_test.go index a469195a36ede..ee85d431805ab 100644 --- a/go/arrow/internal/arrjson/arrjson_test.go +++ b/go/arrow/internal/arrjson/arrjson_test.go @@ -34,7 +34,7 @@ func TestReadWrite(t *testing.T) { wantJSONs["primitives"] = makePrimitiveWantJSONs() wantJSONs["structs"] = makeStructsWantJSONs() wantJSONs["lists"] = makeListsWantJSONs() - 
wantJSONs["listviews"] = makeListViewsWantJSONs() + wantJSONs["list_views"] = makeListViewsWantJSONs() wantJSONs["strings"] = makeStringsWantJSONs() wantJSONs["fixed_size_lists"] = makeFixedSizeListsWantJSONs() wantJSONs["fixed_width_types"] = makeFixedWidthTypesWantJSONs() @@ -1367,7 +1367,7 @@ func makeListsWantJSONs() string { 1, 1, 1 - ], + ], "children": [ { "name": "item", @@ -1564,7 +1564,7 @@ func makeListViewsWantJSONs() string { "schema": { "fields": [ { - "name": "listview_nullable", + "name": "list_view_nullable", "type": { "name": "listview" }, @@ -1589,7 +1589,7 @@ func makeListViewsWantJSONs() string { "count": 3, "columns": [ { - "name": "listview_nullable", + "name": "list_view_nullable", "count": 3, "VALIDITY": [ 1, @@ -1653,7 +1653,7 @@ func makeListViewsWantJSONs() string { "count": 3, "columns": [ { - "name": "listview_nullable", + "name": "list_view_nullable", "count": 3, "VALIDITY": [ 1, @@ -1717,7 +1717,7 @@ func makeListViewsWantJSONs() string { "count": 3, "columns": [ { - "name": "listview_nullable", + "name": "list_view_nullable", "count": 3, "VALIDITY": [ 1, @@ -1727,7 +1727,7 @@ func makeListViewsWantJSONs() string { "children": [ { "name": "item", - "count": 15, + "count": 10, "VALIDITY": [ 1, 0, @@ -1738,11 +1738,6 @@ func makeListViewsWantJSONs() string { 0, 0, 1, - 1, - 1, - 0, - 0, - 1, 1 ], "DATA": [ @@ -1751,11 +1746,6 @@ func makeListViewsWantJSONs() string { 0, -4, -5, - -11, - 0, - 0, - -14, - -15, -21, 0, 0, @@ -1767,11 +1757,11 @@ func makeListViewsWantJSONs() string { "OFFSET": [ 0, 5, - 10 + 5 ], "SIZE": [ 5, - 5, + 0, 5 ] } @@ -1781,7 +1771,7 @@ func makeListViewsWantJSONs() string { "count": 0, "columns": [ { - "name": "listview_nullable", + "name": "list_view_nullable", "count": 0, "children": [ { @@ -3817,7 +3807,7 @@ func makeMapsWantJSONs() string { "VALIDITY": [ 1, 0 - ], + ], "children": [ { "name": "entries", From af2813a8373b94378927c21073c6a6f87c0744ba Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho 
Date: Mon, 9 Oct 2023 12:11:20 -0300 Subject: [PATCH 29/38] random_array_gen.go: Simplify random generation and follow new spec requirements --- .../internal/testing/gen/random_array_gen.go | 96 ++++++------------- 1 file changed, 30 insertions(+), 66 deletions(-) diff --git a/go/arrow/internal/testing/gen/random_array_gen.go b/go/arrow/internal/testing/gen/random_array_gen.go index f80fc494dce19..41f2578209a7f 100644 --- a/go/arrow/internal/testing/gen/random_array_gen.go +++ b/go/arrow/internal/testing/gen/random_array_gen.go @@ -389,44 +389,26 @@ func viewOffsetsFromLengthsArray32( offsets := make([]int32, sizesArray.Len()) offsetDeltaRand := rand.New(rand.NewSource(seed)) - sampleOffsetDelta := func() int32 { - return int32(offsetDeltaRand.Int63n(2*int64(avgLength)) - int64(avgLength)) + sampleOffset := func(offsetBase int32) int32 { + delta := int32(offsetDeltaRand.Int63n(2*int64(avgLength)) - int64(avgLength)) + offset := offsetBase + delta + if offset < 0 { + return 0 + } + return offset } offsetBase := int32(0) for i := 0; i < sizesArray.Len(); i += 1 { - // We want to always sample the offsetDeltaRand to make sure different - // options regarding nulls and empty views don't affect the other offsets. - offset := offsetBase + sampleOffsetDelta() - if sizesArray.IsNull(i) { - if forceEmptyNulls { - sizes[i] = 0 - } - if zeroUndefinedOffsets { - offsets[i] = 0 - } else { - offsets[i] = offset - } - continue + isNull := sizesArray.IsNull(i) + if forceEmptyNulls && isNull { + sizes[i] = 0 } - - size := sizes[i] - if size == 0 { - if zeroUndefinedOffsets { - offsets[i] = 0 - } else { - offsets[i] = offset - } + if zeroUndefinedOffsets && (isNull || sizes[i] == 0) { + offsets[i] = 0 } else { - // Ensure that the size is not too large. - if size > valuesLength { - size = valuesLength - sizes[i] = size // Fix the size. - } - // Ensure the offset is not negative or too large. 
- if offset < 0 { - offset = 0 - } else if offset > valuesLength-size { - offset = valuesLength - size + offset := sampleOffset(offsetBase) + if offset > valuesLength-sizes[i] { + offset = valuesLength - sizes[i] } offsets[i] = offset } @@ -448,44 +430,26 @@ func viewOffsetsFromLengthsArray64( offsets := make([]int64, sizesArray.Len()) offsetDeltaRand := rand.New(rand.NewSource(seed)) - sampleOffsetDelta := func() int64 { - return int64(offsetDeltaRand.Int63n(2*avgLength) - avgLength) + sampleOffset := func(offsetBase int64) int64 { + delta := int64(offsetDeltaRand.Int63n(2*avgLength) - avgLength) + offset := offsetBase + delta + if offset < 0 { + return 0 + } + return offset } offsetBase := int64(0) for i := 0; i < sizesArray.Len(); i += 1 { - // We want to always sample the offsetDeltaRand to make sure different - // options regarding nulls and empty views don't affect the other offsets. - offset := offsetBase + sampleOffsetDelta() - if sizesArray.IsNull(i) { - if forceEmptyNulls { - sizes[i] = 0 - } - if zeroUndefinedOffsets { - offsets[i] = 0 - } else { - offsets[i] = offset - } - continue + isNull := sizesArray.IsNull(i) + if forceEmptyNulls && isNull { + sizes[i] = 0 } - - size := sizes[i] - if size == 0 { - if zeroUndefinedOffsets { - offsets[i] = 0 - } else { - offsets[i] = offset - } + if zeroUndefinedOffsets && (isNull || sizes[i] == 0) { + offsets[i] = 0 } else { - // Ensure that the size is not too large. - if size > valuesLength { - size = valuesLength - sizes[i] = size // Fix the size. - } - // Ensure the offset is not negative or too large. 
- if offset < 0 { - offset = 0 - } else if offset > valuesLength-size { - offset = valuesLength - size + offset := sampleOffset(offsetBase) + if offset > valuesLength-sizes[i] { + offset = valuesLength - sizes[i] } offsets[i] = offset } From 9897360e61fce6222f096241a5e317552f37700a Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Mon, 9 Oct 2023 12:14:42 -0300 Subject: [PATCH 30/38] list.go: Don't skip nulls when validating (according to spec changes) --- go/arrow/array/list.go | 52 ++++++++---------------------------------- 1 file changed, 9 insertions(+), 43 deletions(-) diff --git a/go/arrow/array/list.go b/go/arrow/array/list.go index 879ef012f1347..7ad3c22745325 100644 --- a/go/arrow/array/list.go +++ b/go/arrow/array/list.go @@ -27,7 +27,6 @@ import ( "github.com/apache/arrow/go/v14/arrow/bitutil" "github.com/apache/arrow/go/v14/arrow/internal/debug" "github.com/apache/arrow/go/v14/arrow/memory" - "github.com/apache/arrow/go/v14/internal/bitutils" "github.com/apache/arrow/go/v14/internal/json" ) @@ -961,51 +960,18 @@ func outOfBoundsListViewSize(l offsetsAndSizes, slot int64, offsetLimit int64) e // Pre-condition: Basic validation has already been performed func (a *array) fullyValidateOffsetsAndSizes(l offsetsAndSizes, offsetLimit int64) error { - validity := a.NullBitmapBytes() - - slot := int64(0) - if validity != nil { - counter := bitutils.NewBitBlockCounter(validity, int64(a.Offset()), int64(a.Len())) - var block bitutils.BitBlockCount - for i := 0; i < a.Len(); i += int(block.Len) { - block = counter.NextWord() - if block.NoneSet() { - continue + for slot := int64(0); slot < int64(a.Len()); slot += 1 { + size := l.sizeAt(slot) + if size > 0 { + offset := l.offsetAt(slot) + if offset < 0 || offset > offsetLimit { + return outOfBoundsListViewOffset(l, slot, offsetLimit) } - allSet := block.AllSet() - for j := 0; j < int(block.Len); j += 1 { - slot = int64(i + j) - valid := allSet || bitutil.BitIsSet(validity, a.Offset()+int(slot)) - if valid 
{ - size := l.sizeAt(slot) - if size > 0 { - offset := l.offsetAt(slot) - if offset < 0 || offset > offsetLimit { - return outOfBoundsListViewOffset(l, slot, offsetLimit) - } - if size > offsetLimit-offset { - return outOfBoundsListViewSize(l, slot, offsetLimit) - } - } else if size < 0 { - return outOfBoundsListViewSize(l, slot, offsetLimit) - } - } - } - } - } else { - for ; slot < int64(a.Len()); slot += 1 { - size := l.sizeAt(slot) - if size > 0 { - offset := l.offsetAt(slot) - if offset < 0 || offset > offsetLimit { - return outOfBoundsListViewOffset(l, slot, offsetLimit) - } - if size > offsetLimit-int64(offset) { - return outOfBoundsListViewSize(l, slot, offsetLimit) - } - } else if size < 0 { + if size > offsetLimit-int64(offset) { return outOfBoundsListViewSize(l, slot, offsetLimit) } + } else if size < 0 { + return outOfBoundsListViewSize(l, slot, offsetLimit) } } From 9dbc99aea29b7f8c165c2b2a805bd9ac32b7181c Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Mon, 9 Oct 2023 15:27:58 -0300 Subject: [PATCH 31/38] concat.go: Preserve invariants when concatenating --- go/arrow/array/concat.go | 41 +++++++++++++++++++++++++++++++++++----- 1 file changed, 36 insertions(+), 5 deletions(-) diff --git a/go/arrow/array/concat.go b/go/arrow/array/concat.go index b748ec2872386..ff921316eb278 100644 --- a/go/arrow/array/concat.go +++ b/go/arrow/array/concat.go @@ -378,8 +378,14 @@ func putListViewOffsets32(in arrow.ArrayData, displacement int32, out *memory.Bu dstOffsets := arrow.Int32Traits.CastFromBytes(out.Bytes()) for i, offset := range srcOffsets { - debug.Assert(!isValidAndNonEmpty(i) || offset+displacement >= 0, "putListViewOffsets32: offset underflow while concatenating arrays") - dstOffsets[outOff+i] = offset + displacement + if isValidAndNonEmpty(i) { + // This is guaranteed by RangeOfValuesUsed returning the smallest offset + // of valid and non-empty list-views. 
+ debug.Assert(offset+displacement >= 0, "putListViewOffsets32: offset underflow while concatenating arrays") + dstOffsets[outOff+i] = offset + displacement + } else { + dstOffsets[outOff+i] = 0 + } } } @@ -398,9 +404,14 @@ func putListViewOffsets64(in arrow.ArrayData, displacement int64, out *memory.Bu dstOffsets := arrow.Int64Traits.CastFromBytes(out.Bytes()) for i, offset := range srcOffsets { - debug.Assert(!isValidAndNonEmpty(i) || offset+displacement >= 0, - "putListViewOffsets64: offset underflow while concatenating arrays") - dstOffsets[outOff+i] = offset + displacement + if isValidAndNonEmpty(i) { + // This is guaranteed by RangeOfValuesUsed returning the smallest offset + // of valid and non-empty list-views. + debug.Assert(offset+displacement >= 0, "putListViewOffsets32: offset underflow while concatenating arrays") + dstOffsets[outOff+i] = offset + displacement + } else { + dstOffsets[outOff+i] = 0 + } } } @@ -465,6 +476,26 @@ func concatListView(data []arrow.ArrayData, offsetType arrow.FixedWidthDataType, // Concatenate the sizes sizeBuffers := gatherBuffersFixedWidthType(data, 2, offsetType) sizeBuffer := concatBuffers(sizeBuffers, mem) + if out.Buffers()[0] != nil { + // To make sure the sizes don't reference values that are not in the new + // concatenated values array, we zero the sizes of null list-view values. 
+ validity := out.Buffers()[0].Bytes() + if offsetType.ID() == arrow.INT32 { + sizes := arrow.Int32Traits.CastFromBytes(sizeBuffer.Bytes()) + for i := 0; i < out.Len(); i++ { + if !bitutil.BitIsSet(validity, out.offset+i) { + sizes[i] = 0 + } + } + } else { + sizes := arrow.Int64Traits.CastFromBytes(sizeBuffer.Bytes()) + for i := 0; i < out.Len(); i++ { + if !bitutil.BitIsSet(validity, out.offset+i) { + sizes[i] = 0 + } + } + } + } out.childData = []arrow.ArrayData{values} out.buffers[1] = offsetBuffer From bc47993ae6e7b2f3441add695414894568a10397 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Tue, 10 Oct 2023 11:02:02 -0300 Subject: [PATCH 32/38] fixup! concat.go: Preserve invariants when concatenating --- go/arrow/array/concat.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/go/arrow/array/concat.go b/go/arrow/array/concat.go index ff921316eb278..cae4ef00c3619 100644 --- a/go/arrow/array/concat.go +++ b/go/arrow/array/concat.go @@ -381,7 +381,7 @@ func putListViewOffsets32(in arrow.ArrayData, displacement int32, out *memory.Bu if isValidAndNonEmpty(i) { // This is guaranteed by RangeOfValuesUsed returning the smallest offset // of valid and non-empty list-views. 
- debug.Assert(offset+displacement >= 0, "putListViewOffsets32: offset underflow while concatenating arrays") + debug.Assert(offset+displacement >= 0, "putListViewOffsets64: offset underflow while concatenating arrays") dstOffsets[outOff+i] = offset + displacement } else { dstOffsets[outOff+i] = 0 From 72c563a9cd2b2b37fd03f5bc8850d2ca77756268 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Tue, 10 Oct 2023 12:10:34 -0300 Subject: [PATCH 33/38] concat.go: Extract a generic function to zero the sizes --- go/arrow/array/concat.go | 50 ++++++++++++++++++++++++---------------- 1 file changed, 30 insertions(+), 20 deletions(-) diff --git a/go/arrow/array/concat.go b/go/arrow/array/concat.go index cae4ef00c3619..26e30cabc21d7 100644 --- a/go/arrow/array/concat.go +++ b/go/arrow/array/concat.go @@ -21,6 +21,7 @@ import ( "fmt" "math" "math/bits" + "unsafe" "github.com/apache/arrow/go/v14/arrow" "github.com/apache/arrow/go/v14/arrow/bitutil" @@ -363,6 +364,13 @@ func sumArraySizes(data []arrow.ArrayData) int { return outSize } +func getListViewOffsets[T int32 | int64](data arrow.ArrayData, i int) []T { + bytes := data.Buffers()[i].Bytes() + base := (*T)(unsafe.Pointer(&bytes[0])) + ret := unsafe.Slice(base, data.Offset()+data.Len()) + return ret[data.Offset():] +} + func putListViewOffsets32(in arrow.ArrayData, displacement int32, out *memory.Buffer, outOff int) { debug.Assert(in.DataType().ID() == arrow.LIST_VIEW, "putListViewOffsets32: expected LIST_VIEW data") inOff, inLen := in.Offset(), in.Len() @@ -446,6 +454,20 @@ func concatListViewOffsets(data []arrow.ArrayData, byteWidth int, valueRanges [] return out, nil } +func zeroNullListViewSizes[T int32 | int64](data arrow.ArrayData) { + if data.Len() == 0 || data.Buffers()[0] == nil { + return + } + validity := data.Buffers()[0].Bytes() + sizes := getListViewOffsets[T](data, 2) + + for i := 0; i < data.Len(); i++ { + if !bitutil.BitIsSet(validity, data.Offset()+i) { + sizes[i] = 0 + } + } +} + func 
concatListView(data []arrow.ArrayData, offsetType arrow.FixedWidthDataType, out *Data, mem memory.Allocator) (err error) { // Calculate the ranges of values that each list-view array uses valueRanges := make([]rng, len(data)) @@ -476,31 +498,19 @@ func concatListView(data []arrow.ArrayData, offsetType arrow.FixedWidthDataType, // Concatenate the sizes sizeBuffers := gatherBuffersFixedWidthType(data, 2, offsetType) sizeBuffer := concatBuffers(sizeBuffers, mem) - if out.Buffers()[0] != nil { - // To make sure the sizes don't reference values that are not in the new - // concatenated values array, we zero the sizes of null list-view values. - validity := out.Buffers()[0].Bytes() - if offsetType.ID() == arrow.INT32 { - sizes := arrow.Int32Traits.CastFromBytes(sizeBuffer.Bytes()) - for i := 0; i < out.Len(); i++ { - if !bitutil.BitIsSet(validity, out.offset+i) { - sizes[i] = 0 - } - } - } else { - sizes := arrow.Int64Traits.CastFromBytes(sizeBuffer.Bytes()) - for i := 0; i < out.Len(); i++ { - if !bitutil.BitIsSet(validity, out.offset+i) { - sizes[i] = 0 - } - } - } - } out.childData = []arrow.ArrayData{values} out.buffers[1] = offsetBuffer out.buffers[2] = sizeBuffer + // To make sure the sizes don't reference values that are not in the new + // concatenated values array, we zero the sizes of null list-view values. 
+ if offsetType.ID() == arrow.INT32 { + zeroNullListViewSizes[int32](out) + } else { + zeroNullListViewSizes[int64](out) + } + return nil } From 4aaa8bf73f6eb767c19d175b4267e3a6e86001a7 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Tue, 10 Oct 2023 12:19:23 -0300 Subject: [PATCH 34/38] concat.go: Use getListViewOffsets in more places --- go/arrow/array/concat.go | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/go/arrow/array/concat.go b/go/arrow/array/concat.go index 26e30cabc21d7..364003630595a 100644 --- a/go/arrow/array/concat.go +++ b/go/arrow/array/concat.go @@ -378,8 +378,8 @@ func putListViewOffsets32(in arrow.ArrayData, displacement int32, out *memory.Bu return } bitmap := in.Buffers()[0] - srcOffsets := arrow.Int32Traits.CastFromBytes(in.Buffers()[1].Bytes())[inOff : inOff+inLen] - srcSizes := arrow.Int32Traits.CastFromBytes(in.Buffers()[2].Bytes())[inOff : inOff+inLen] + srcOffsets := getListViewOffsets[int32](in, 1) + srcSizes := getListViewOffsets[int32](in, 2) isValidAndNonEmpty := func(i int) bool { return (bitmap == nil || bitutil.BitIsSet(bitmap.Bytes(), inOff+i)) && srcSizes[i] > 0 } @@ -400,12 +400,12 @@ func putListViewOffsets32(in arrow.ArrayData, displacement int32, out *memory.Bu func putListViewOffsets64(in arrow.ArrayData, displacement int64, out *memory.Buffer, outOff int) { debug.Assert(in.DataType().ID() == arrow.LARGE_LIST_VIEW, "putListViewOffsets64: expected LARGE_LIST_VIEW data") inOff, inLen := in.Offset(), in.Len() - if in.Len() == 0 { + if inLen == 0 { return } bitmap := in.Buffers()[0] - srcOffsets := arrow.Int64Traits.CastFromBytes(in.Buffers()[1].Bytes())[in.Offset():(in.Offset() + in.Len())] - srcSizes := arrow.Int64Traits.CastFromBytes(in.Buffers()[2].Bytes())[inOff : inOff+inLen] + srcOffsets := getListViewOffsets[int64](in, 1) + srcSizes := getListViewOffsets[int64](in, 2) isValidAndNonEmpty := func(i int) bool { return (bitmap == nil || bitutil.BitIsSet(bitmap.Bytes(), inOff+i)) 
&& srcSizes[i] > 0 } From f0fa0d7471b88b44b4cbfbe82c25bf63f2fca2dc Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Tue, 10 Oct 2023 12:58:56 -0300 Subject: [PATCH 35/38] fixup! concat.go: Preserve invariants when concatenating --- go/arrow/array/concat.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/go/arrow/array/concat.go b/go/arrow/array/concat.go index 364003630595a..aaad32c31a65d 100644 --- a/go/arrow/array/concat.go +++ b/go/arrow/array/concat.go @@ -389,7 +389,7 @@ func putListViewOffsets32(in arrow.ArrayData, displacement int32, out *memory.Bu if isValidAndNonEmpty(i) { // This is guaranteed by RangeOfValuesUsed returning the smallest offset // of valid and non-empty list-views. - debug.Assert(offset+displacement >= 0, "putListViewOffsets64: offset underflow while concatenating arrays") + debug.Assert(offset+displacement >= 0, "putListViewOffsets32: offset underflow while concatenating arrays") dstOffsets[outOff+i] = offset + displacement } else { dstOffsets[outOff+i] = 0 @@ -415,7 +415,7 @@ func putListViewOffsets64(in arrow.ArrayData, displacement int64, out *memory.Bu if isValidAndNonEmpty(i) { // This is guaranteed by RangeOfValuesUsed returning the smallest offset // of valid and non-empty list-views. 
- debug.Assert(offset+displacement >= 0, "putListViewOffsets32: offset underflow while concatenating arrays") + debug.Assert(offset+displacement >= 0, "putListViewOffsets64: offset underflow while concatenating arrays") dstOffsets[outOff+i] = offset + displacement } else { dstOffsets[outOff+i] = 0 From 83bbd6a4a5f531979b775e9952bb6e23af7957a5 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Tue, 10 Oct 2023 13:03:36 -0300 Subject: [PATCH 36/38] concat.go: Rename getListView{Offsets->BufferValues} --- go/arrow/array/concat.go | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/go/arrow/array/concat.go b/go/arrow/array/concat.go index aaad32c31a65d..9d815023c4b76 100644 --- a/go/arrow/array/concat.go +++ b/go/arrow/array/concat.go @@ -364,7 +364,7 @@ func sumArraySizes(data []arrow.ArrayData) int { return outSize } -func getListViewOffsets[T int32 | int64](data arrow.ArrayData, i int) []T { +func getListViewBufferValues[T int32 | int64](data arrow.ArrayData, i int) []T { bytes := data.Buffers()[i].Bytes() base := (*T)(unsafe.Pointer(&bytes[0])) ret := unsafe.Slice(base, data.Offset()+data.Len()) @@ -378,8 +378,8 @@ func putListViewOffsets32(in arrow.ArrayData, displacement int32, out *memory.Bu return } bitmap := in.Buffers()[0] - srcOffsets := getListViewOffsets[int32](in, 1) - srcSizes := getListViewOffsets[int32](in, 2) + srcOffsets := getListViewBufferValues[int32](in, 1) + srcSizes := getListViewBufferValues[int32](in, 2) isValidAndNonEmpty := func(i int) bool { return (bitmap == nil || bitutil.BitIsSet(bitmap.Bytes(), inOff+i)) && srcSizes[i] > 0 } @@ -404,8 +404,8 @@ func putListViewOffsets64(in arrow.ArrayData, displacement int64, out *memory.Bu return } bitmap := in.Buffers()[0] - srcOffsets := getListViewOffsets[int64](in, 1) - srcSizes := getListViewOffsets[int64](in, 2) + srcOffsets := getListViewBufferValues[int64](in, 1) + srcSizes := getListViewBufferValues[int64](in, 2) isValidAndNonEmpty := func(i int) bool { 
return (bitmap == nil || bitutil.BitIsSet(bitmap.Bytes(), inOff+i)) && srcSizes[i] > 0 } @@ -459,7 +459,7 @@ func zeroNullListViewSizes[T int32 | int64](data arrow.ArrayData) { return } validity := data.Buffers()[0].Bytes() - sizes := getListViewOffsets[T](data, 2) + sizes := getListViewBufferValues[T](data, 2) for i := 0; i < data.Len(); i++ { if !bitutil.BitIsSet(validity, data.Offset()+i) { From a5ccd73f838155470826e9cc84ffd0b2ec399414 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Tue, 10 Oct 2023 16:59:44 -0300 Subject: [PATCH 37/38] datatype.go: Add STRING_VIEW/BINARY_VIEW and re-generate type_string.go --- go/arrow/datatype.go | 7 +++++++ go/arrow/type_string.go | 10 ++++++---- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/go/arrow/datatype.go b/go/arrow/datatype.go index 2588bf430fcb2..f0fb24ec873c5 100644 --- a/go/arrow/datatype.go +++ b/go/arrow/datatype.go @@ -152,6 +152,13 @@ const ( RUN_END_ENCODED + // String (UTF8) view type with 4-byte prefix and inline + // small string optimizations + STRING_VIEW + + // Bytes view with 4-byte prefix and inline small byte arrays optimization + BINARY_VIEW + // LIST_VIEW is a list of some logical data type represented with offsets and sizes LIST_VIEW diff --git a/go/arrow/type_string.go b/go/arrow/type_string.go index a79ea7919908a..ee3ccb7ef9f0a 100644 --- a/go/arrow/type_string.go +++ b/go/arrow/type_string.go @@ -47,13 +47,15 @@ func _() { _ = x[LARGE_LIST-36] _ = x[INTERVAL_MONTH_DAY_NANO-37] _ = x[RUN_END_ENCODED-38] - _ = x[LIST_VIEW-39] - _ = x[LARGE_LIST_VIEW-40] + _ = x[STRING_VIEW-39] + _ = x[BINARY_VIEW-40] + _ = x[LIST_VIEW-41] + _ = x[LARGE_LIST_VIEW-42] } -const _Type_name = 
"NULLBOOLUINT8INT8UINT16INT16UINT32INT32UINT64INT64FLOAT16FLOAT32FLOAT64STRINGBINARYFIXED_SIZE_BINARYDATE32DATE64TIMESTAMPTIME32TIME64INTERVAL_MONTHSINTERVAL_DAY_TIMEDECIMAL128DECIMAL256LISTSTRUCTSPARSE_UNIONDENSE_UNIONDICTIONARYMAPEXTENSIONFIXED_SIZE_LISTDURATIONLARGE_STRINGLARGE_BINARYLARGE_LISTINTERVAL_MONTH_DAY_NANORUN_END_ENCODEDLIST_VIEWLARGE_LIST_VIEW" +const _Type_name = "NULLBOOLUINT8INT8UINT16INT16UINT32INT32UINT64INT64FLOAT16FLOAT32FLOAT64STRINGBINARYFIXED_SIZE_BINARYDATE32DATE64TIMESTAMPTIME32TIME64INTERVAL_MONTHSINTERVAL_DAY_TIMEDECIMAL128DECIMAL256LISTSTRUCTSPARSE_UNIONDENSE_UNIONDICTIONARYMAPEXTENSIONFIXED_SIZE_LISTDURATIONLARGE_STRINGLARGE_BINARYLARGE_LISTINTERVAL_MONTH_DAY_NANORUN_END_ENCODEDSTRING_VIEWBINARY_VIEWLIST_VIEWLARGE_LIST_VIEW" -var _Type_index = [...]uint16{0, 4, 8, 13, 17, 23, 28, 34, 39, 45, 50, 57, 64, 71, 77, 83, 100, 106, 112, 121, 127, 133, 148, 165, 175, 185, 189, 195, 207, 218, 228, 231, 240, 255, 263, 275, 287, 297, 320, 335, 344, 359} +var _Type_index = [...]uint16{0, 4, 8, 13, 17, 23, 28, 34, 39, 45, 50, 57, 64, 71, 77, 83, 100, 106, 112, 121, 127, 133, 148, 165, 175, 185, 189, 195, 207, 218, 228, 231, 240, 255, 263, 275, 287, 297, 320, 335, 346, 357, 366, 381} func (i Type) String() string { if i < 0 || i >= Type(len(_Type_index)-1) { From b83beffc86e3c12ff35e64c188188187015086b7 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Tue, 10 Oct 2023 17:37:35 -0300 Subject: [PATCH 38/38] fixup! 
list.go: Add validation for ListView and LargeListView --- go/arrow/array/list.go | 26 ++++++++++---------------- 1 file changed, 10 insertions(+), 16 deletions(-) diff --git a/go/arrow/array/list.go b/go/arrow/array/list.go index 7ad3c22745325..d8d8b8c76165a 100644 --- a/go/arrow/array/list.go +++ b/go/arrow/array/list.go @@ -952,10 +952,9 @@ func outOfBoundsListViewSize(l offsetsAndSizes, slot int64, offsetLimit int64) e size := l.sizeAt(slot) if size < 0 { return fmt.Errorf("%w: Offset invariant failure: size for slot %d out of bounds: %d < 0", arrow.ErrInvalid, slot, size) - } else { - offset := l.offsetAt(slot) - return fmt.Errorf("%w: Offset invariant failure: size for slot %d out of bounds: %d + %d > %d", arrow.ErrInvalid, slot, offset, size, offsetLimit) } + offset := l.offsetAt(slot) + return fmt.Errorf("%w: Offset invariant failure: size for slot %d out of bounds: %d + %d > %d", arrow.ErrInvalid, slot, offset, size, offsetLimit) } // Pre-condition: Basic validation has already been performed @@ -984,18 +983,14 @@ func (a *array) validateOffsetsAndMaybeSizes(l offsetsAndSizes, offsetByteWidth // For length 0, an empty offsets buffer is accepted (ARROW-544). 
if nonEmpty { return fmt.Errorf("non-empty array but offsets are null") - } else { - return nil } + return nil } - if isListView { - if a.data.buffers[2] == nil { - if nonEmpty { - return fmt.Errorf("non-empty array but sizes are null") - } else { - return nil - } + if isListView && a.data.buffers[2] == nil { + if nonEmpty { + return fmt.Errorf("non-empty array but sizes are null") } + return nil } var requiredOffsets int @@ -1024,11 +1019,10 @@ func (a *array) validateOffsetsAndMaybeSizes(l offsetsAndSizes, offsetByteWidth if fullValidation && requiredOffsets > 0 { if isListView { return a.fullyValidateOffsetsAndSizes(l, offsetLimit) - } else { - // TODO: implement validation of List and LargeList - // return fullyValidateOffsets(offset_limit) - return nil } + // TODO: implement validation of List and LargeList + // return fullyValidateOffsets(offset_limit) + return nil } return nil }