Skip to content

Commit

Permalink
apacheGH-38718: [Go][Format][Integration] Add StringView/BinaryView t…
Browse files Browse the repository at this point in the history
…o Go implementation (apache#35769)

### Rationale for this change
See apache#35628 for the rationale and description of the StringView/BinaryView array types.

This change adds Go as the second implementation of these array types.

### What changes are included in this PR?

Add array types for `StringView` and `BinaryView`, along with `StringViewType` and `BinaryViewType` and the necessary enums and builders. These arrays can be round-tripped through JSON and IPC.

### Are these changes tested?
Yes, unit tests have been added and the integration tests have been run.

* Closes: [apache#38718](apache#38718)
* Closes: apache#38718

Lead-authored-by: Matt Topol <zotthewizard@gmail.com>
Co-authored-by: Alex Shcherbakov <candiduslynx@users.noreply.github.com>
Signed-off-by: Benjamin Kietzman <bengilgit@gmail.com>
  • Loading branch information
2 people authored and dgreiss committed Feb 17, 2024
1 parent e1abc6f commit a3a83ce
Show file tree
Hide file tree
Showing 33 changed files with 2,011 additions and 33 deletions.
3 changes: 3 additions & 0 deletions .gitattributes
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,9 @@ cpp/src/generated/*.cpp linguist-generated=true
cpp/src/generated/*.h linguist-generated=true
go/**/*.s linguist-generated=true
go/arrow/unionmode_string.go linguist-generated=true
go/arrow/internal/flatbuf/*.go linguist-generated=true
go/**/*.pb.go linguist-generated=true
go/parquet/internal/gen-go/parquet/*.go linguist-generated=true
r/R/RcppExports.R linguist-generated=true
r/R/arrowExports.R linguist-generated=true
r/src/RcppExports.cpp linguist-generated=true
Expand Down
4 changes: 4 additions & 0 deletions docs/source/status.rst
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,10 @@ Data Types
+-------------------+-------+-------+-------+------------+-------+-------+-------+-------+
| Large Utf8 |||| | ||| |
+-------------------+-------+-------+-------+------------+-------+-------+-------+-------+
| Binary View || || | | | | |
+-------------------+-------+-------+-------+------------+-------+-------+-------+-------+
| String View || || | | | | |
+-------------------+-------+-------+-------+------------+-------+-------+-------+-------+

+-------------------+-------+-------+-------+------------+-------+-------+-------+-------+
| Data type | C++ | Java | Go | JavaScript | C# | Rust | Julia | Swift |
Expand Down
2 changes: 1 addition & 1 deletion format/Schema.fbs
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ enum MetadataVersion:short {
/// >= 0.8.0 (December 2017). Non-backwards compatible with V3.
V4,

/// >= 1.0.0 (July 2020. Backwards compatible with V4 (V5 readers can read V4
/// >= 1.0.0 (July 2020). Backwards compatible with V4 (V5 readers can read V4
/// metadata and IPC messages). Implementations are recommended to provide a
/// V4 compatibility mode with V5 format changes disabled.
///
Expand Down
3 changes: 2 additions & 1 deletion go/arrow/array/array.go
Original file line number Diff line number Diff line change
Expand Up @@ -178,7 +178,8 @@ func init() {
arrow.RUN_END_ENCODED: func(data arrow.ArrayData) arrow.Array { return NewRunEndEncodedData(data) },
arrow.LIST_VIEW: func(data arrow.ArrayData) arrow.Array { return NewListViewData(data) },
arrow.LARGE_LIST_VIEW: func(data arrow.ArrayData) arrow.Array { return NewLargeListViewData(data) },

arrow.BINARY_VIEW: func(data arrow.ArrayData) arrow.Array { return NewBinaryViewData(data) },
arrow.STRING_VIEW: func(data arrow.ArrayData) arrow.Array { return NewStringViewData(data) },
// invalid data types to fill out array to size 2^6 - 1
63: invalidDataType,
}
Expand Down
121 changes: 121 additions & 0 deletions go/arrow/array/binary.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ import (
"unsafe"

"github.com/apache/arrow/go/v15/arrow"
"github.com/apache/arrow/go/v15/arrow/memory"
"github.com/apache/arrow/go/v15/internal/json"
)

Expand Down Expand Up @@ -318,6 +319,126 @@ func arrayEqualLargeBinary(left, right *LargeBinary) bool {
return true
}

// ViewLike is an arrow.Array that additionally exposes a view header
// for an element through ValueHeader.
type ViewLike interface {
arrow.Array
// ValueHeader returns a pointer to a view header; in BinaryView the
// int argument is the element index.
ValueHeader(int) *arrow.ViewHeader
}

// BinaryView is an array of byte-string values described by view headers.
// A value is either read inline from its header or sliced out of one of
// the data buffers (see Value).
type BinaryView struct {
array
// values is the header slice cast from data.buffers[1] by setData.
values []arrow.ViewHeader
// dataBuffers is data.buffers[2:]; Value indexes it with a header's
// BufferIndex.
dataBuffers []*memory.Buffer
}

// NewBinaryViewData wraps data in a new BinaryView whose reference count
// starts at 1. The argument must be a *Data; any other arrow.ArrayData
// implementation makes the type assertion panic.
func NewBinaryViewData(data arrow.ArrayData) *BinaryView {
	view := new(BinaryView)
	view.refCount = 1
	view.setData(data.(*Data))
	return view
}

// setData attaches data to the array and caches the view headers found in
// buffer 1 together with every buffer from index 2 onwards. It panics when
// data carries fewer than two buffers.
func (a *BinaryView) setData(data *Data) {
	if len(data.buffers) < 2 {
		panic("len(data.buffers) < 2")
	}
	a.array.setData(data)

	// Buffer 1 may be nil, in which case the cached headers are left as-is.
	headerBuf := data.buffers[1]
	if headerBuf != nil {
		a.values = arrow.ViewHeaderTraits.CastFromBytes(headerBuf.Bytes())
	}

	a.dataBuffers = data.buffers[2:]
}

// ValueHeader returns a pointer to the view header of element i.
// It panics when i is negative or not smaller than the array length.
func (a *BinaryView) ValueHeader(i int) *arrow.ViewHeader {
	if n := a.array.data.length; !(0 <= i && i < n) {
		panic("arrow/array: index out of range")
	}
	// Headers are stored relative to the start of the underlying data,
	// so the array's offset has to be applied.
	pos := a.array.data.offset + i
	return &a.values[pos]
}

// Value returns the bytes of element i.
//
// When the header reports the value as inline, the header's inline bytes
// are returned. Otherwise the value is sliced out of the data buffer the
// header points at, without copying, so the result aliases that buffer.
//
// It panics if i is out of range (see ValueHeader) or if the header refers
// to a buffer index or byte range that does not exist.
func (a *BinaryView) Value(i int) []byte {
	hdr := a.ValueHeader(i)
	if hdr.IsInline() {
		return hdr.InlineBytes()
	}

	buf := a.dataBuffers[hdr.BufferIndex()]

	// Compute the end position in int rather than int32: offset + length
	// can reach 2^31, which would wrap to a negative int32 and make the
	// slice expression panic for otherwise valid data.
	start := int(hdr.BufferOffset())
	end := start + int(hdr.Len())
	return buf.Bytes()[start:end]
}

// ValueString returns element i as a string rather than a byte slice.
// The bytes are not copied: the slice header obtained from Value is
// reinterpreted as a string header.
func (a *BinaryView) ValueString(i int) string {
	raw := a.Value(i)
	asString := (*string)(unsafe.Pointer(&raw))
	return *asString
}

// String renders the array as a bracketed, space-separated list. Null
// elements are written as NullValueStr and every other element as a
// Go-quoted string (%q).
func (a *BinaryView) String() string {
	var sb strings.Builder
	sb.WriteByte('[')
	for idx := 0; idx < a.Len(); idx++ {
		if idx != 0 {
			sb.WriteByte(' ')
		}
		if a.IsNull(idx) {
			sb.WriteString(NullValueStr)
			continue
		}
		fmt.Fprintf(&sb, "%q", a.ValueString(idx))
	}
	sb.WriteByte(']')
	return sb.String()
}

// ValueStr is the counterpart of AppendValueFromString: it returns the
// element at index i in string form. A null element yields NullValueStr;
// any other element is returned base64-encoded (standard encoding), which
// makes it suitable for CSV/JSON.
//
// It is slower than ValueString and exists to satisfy the Array interface
// with a method that produces a human readable string for a given index.
func (a *BinaryView) ValueStr(i int) string {
	if !a.IsNull(i) {
		return base64.StdEncoding.EncodeToString(a.Value(i))
	}
	return NullValueStr
}

// GetOneForMarshal returns the value to marshal for element i: a nil
// interface for a null element, otherwise the element's bytes.
func (a *BinaryView) GetOneForMarshal(i int) interface{} {
	var out interface{} // stays a nil interface for null elements
	if !a.IsNull(i) {
		out = a.Value(i)
	}
	return out
}

// MarshalJSON encodes the array as a JSON list with one entry per element,
// each produced by GetOneForMarshal.
func (a *BinaryView) MarshalJSON() ([]byte, error) {
	out := make([]interface{}, a.Len())
	for idx := range out {
		out[idx] = a.GetOneForMarshal(idx)
	}
	// Per the Go marshalling rules a []byte is emitted as a
	// base64-encoded string.
	return json.Marshal(out)
}

// arrayEqualBinaryView reports whether every non-null element of left has
// a header equal to the header at the same index in right, comparing
// through each array's own data buffers.
//
// Only left's validity and length are consulted here.
// NOTE(review): presumably the caller has already checked that both arrays
// agree on length and null positions; confirm against the call site.
func arrayEqualBinaryView(left, right *BinaryView) bool {
	lhsBufs, rhsBufs := left.dataBuffers, right.dataBuffers
	for idx := 0; idx < left.Len(); idx++ {
		if !left.IsNull(idx) && !left.ValueHeader(idx).Equals(lhsBufs, right.ValueHeader(idx), rhsBufs) {
			return false
		}
	}
	return true
}

// Compile-time assertions that the binary array types implement arrow.Array.
var (
_ arrow.Array = (*Binary)(nil)
_ arrow.Array = (*LargeBinary)(nil)
_ arrow.Array = (*BinaryView)(nil)
)
24 changes: 24 additions & 0 deletions go/arrow/array/binary_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -700,3 +700,27 @@ func TestBinaryStringRoundTrip(t *testing.T) {

assert.True(t, Equal(arr, arr1))
}

// TestBinaryViewStringRoundTrip builds a BinaryView, feeds each element's
// ValueStr back through AppendValueFromString, and checks that the rebuilt
// array equals the original. The checked allocator asserts that nothing is
// left allocated at the end.
func TestBinaryViewStringRoundTrip(t *testing.T) {
	alloc := memory.NewCheckedAllocator(memory.DefaultAllocator)
	defer alloc.AssertSize(t, 0)

	var (
		values = []string{"a", "bc", "", "", "supercalifragilistic", "", "expeallodocious"}
		valid  = []bool{true, true, false, false, true, true, true}
	)

	bldr := NewBinaryViewBuilder(alloc)
	defer bldr.Release()

	bldr.AppendStringValues(values, valid)
	want := bldr.NewArray().(*BinaryView)
	defer want.Release()

	// Re-append every element from its string form using the same builder.
	for idx := 0; idx < want.Len(); idx++ {
		err := bldr.AppendValueFromString(want.ValueStr(idx))
		assert.NoError(t, err)
	}

	got := bldr.NewArray().(*BinaryView)
	defer got.Release()

	assert.True(t, Equal(want, got))
}
Loading

0 comments on commit a3a83ce

Please sign in to comment.