Skip to content

Commit

Permalink
GH-38836: [Go] Add SizeInBytes() for ArrayData (#38839)
Browse files Browse the repository at this point in the history
### Rationale for this change

Address #38836

### What changes are included in this PR?

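Add a new method, SizeInBytes(), to the ArrayData interface (implemented by array.Data). It returns the combined size in bytes of the ArrayData's buffers, its children, and its dictionary.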

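### Are these changes tested?

Yes. `TestSizeInBytes` in `go/arrow/array/data_test.go` covers buffers only, child and nested child data, a dictionary, and sliced data.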

### Are there any user-facing changes?

No

* Closes: #38836

Lead-authored-by: Yifeng Wu <yifeng@sigmacomputing.com>
Co-authored-by: Matt Topol <zotthewizard@gmail.com>
Co-authored-by: Yifeng-Sigma <yifeng@sigmacomputing.com>
Signed-off-by: Matt Topol <zotthewizard@gmail.com>
  • Loading branch information
Yifeng-Sigma and zeroshade authored Nov 28, 2023
1 parent 83b2c5f commit 143b475
Show file tree
Hide file tree
Showing 3 changed files with 103 additions and 1 deletion.
2 changes: 2 additions & 0 deletions go/arrow/array.go
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,8 @@ type ArrayData interface {
// Dictionary returns the ArrayData object for the dictionary if this is a
// dictionary array, otherwise it will be nil.
Dictionary() ArrayData
// SizeInBytes returns the size of the ArrayData buffers and any children and/or dictionary in bytes.
SizeInBytes() uint64
}

// Array represents an immutable sequence of values using the Arrow in-memory format.
Expand Down
27 changes: 26 additions & 1 deletion go/arrow/array/data.go
Original file line number Diff line number Diff line change
Expand Up @@ -190,9 +190,34 @@ func (d *Data) SetDictionary(dict arrow.ArrayData) {
}
}

// SizeInBytes returns the total number of bytes held by the buffers of d plus
// the sizes of its children and its dictionary, computed recursively.
//
// The result is an upper bound: every buffer contributes its full length, so
// the offset of d (and therefore any slicing) is not taken into account.
// A nil receiver reports 0, and nil entries among the buffers or children are
// skipped rather than dereferenced.
func (d *Data) SizeInBytes() uint64 {
	if d == nil {
		return 0
	}

	var size uint64
	for _, b := range d.Buffers() {
		// A buffer slot can be nil (for example when no validity bitmap was
		// supplied); such a slot holds no bytes and must not be dereferenced.
		if b != nil {
			size += uint64(b.Len())
		}
	}
	for _, c := range d.Children() {
		// Guard against a nil interface value, which would panic on the call.
		if c != nil {
			size += c.SizeInBytes()
		}
	}
	// NOTE(review): Dictionary may return an interface wrapping a nil *Data,
	// in which case this check passes and the nil-receiver guard above yields
	// 0 — confirm against the Dictionary implementation.
	if dict := d.Dictionary(); dict != nil {
		size += dict.SizeInBytes()
	}

	return size
}

// NewSliceData returns a new slice that shares backing data with the input.
// The returned Data slice starts at i and extends j-i elements, such as:
//
//	slice := data[i:j]
//
// The returned value must be Release'd after use.
//
// NewSliceData panics if the slice is outside the valid range of the input Data.
Expand Down
75 changes: 75 additions & 0 deletions go/arrow/array/data_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -49,3 +49,78 @@ func TestDataReset(t *testing.T) {
data.Reset(&arrow.Int64Type{}, 5, data.Buffers(), nil, 1, 2)
}
}

// TestSizeInBytes checks that SizeInBytes adds up the buffer lengths of a Data
// value together with those of its children (recursively) and its dictionary,
// and that slicing does not change the reported size.
//
// Every Data value built here is released when it is no longer needed; the
// NewSliceData documentation requires this for the slices it returns.
func TestSizeInBytes(t *testing.T) {
	// Three 15-byte buffers, so each Data built from them contributes 45 bytes
	// of its own before children or a dictionary are counted.
	var buffers1 = make([]*memory.Buffer, 0, 3)

	for i := 0; i < cap(buffers1); i++ {
		buffers1 = append(buffers1, memory.NewBufferBytes([]byte("15-bytes-buffer")))
	}
	data := NewData(&arrow.StringType{}, 10, buffers1, nil, 0, 0)
	defer data.Release()
	var arrayData arrow.ArrayData = data
	dataWithChild := NewData(&arrow.StringType{}, 10, buffers1, []arrow.ArrayData{arrayData}, 0, 0)
	defer dataWithChild.Release()

	// checkSize reports a test error when d.SizeInBytes() differs from
	// expectedSize.
	checkSize := func(t *testing.T, d arrow.ArrayData, expectedSize uint64) {
		t.Helper()
		if actualSize := d.SizeInBytes(); actualSize != expectedSize {
			t.Errorf("expected size %d, got %d", expectedSize, actualSize)
		}
	}

	t.Run("buffers only", func(t *testing.T) {
		checkSize(t, data, 45)
	})

	t.Run("buffers and child data", func(t *testing.T) {
		// 45 bytes in buffers, 45 bytes in child data
		checkSize(t, dataWithChild, 90)
	})

	t.Run("buffers and nested child data", func(t *testing.T) {
		var dataWithChildArrayData arrow.ArrayData = dataWithChild
		var dataWithNestedChild arrow.ArrayData = NewData(&arrow.StringType{}, 10, buffers1, []arrow.ArrayData{dataWithChildArrayData}, 0, 0)
		defer dataWithNestedChild.Release()
		// 45 bytes in buffers, 90 bytes in nested child data
		checkSize(t, dataWithNestedChild, 135)
	})

	t.Run("buffers and dictionary", func(t *testing.T) {
		dictData := data
		dataWithDict := NewDataWithDictionary(&arrow.StringType{}, 10, buffers1, 0, 0, dictData)
		defer dataWithDict.Release()
		// 45 bytes in buffers, 45 bytes in dictionary
		checkSize(t, dataWithDict, 90)
	})

	t.Run("sliced data", func(t *testing.T) {
		sliceData := NewSliceData(arrayData, 3, 5)
		defer sliceData.Release()
		// offset is not taken into account in SizeInBytes()
		checkSize(t, sliceData, 45)
	})

	t.Run("sliced data with children", func(t *testing.T) {
		var dataWithChildArrayData arrow.ArrayData = dataWithChild
		sliceData := NewSliceData(dataWithChildArrayData, 3, 5)
		defer sliceData.Release()
		// offset is not taken into account in SizeInBytes()
		checkSize(t, sliceData, 90)
	})

	t.Run("buffers with children which are sliced data", func(t *testing.T) {
		sliceData := NewSliceData(arrayData, 3, 5)
		defer sliceData.Release()
		dataWithSlicedChildren := NewData(&arrow.StringType{}, 10, buffers1, []arrow.ArrayData{sliceData}, 0, 0)
		defer dataWithSlicedChildren.Release()
		// offset is not taken into account in SizeInBytes()
		checkSize(t, dataWithSlicedChildren, 90)
	})
}

0 comments on commit 143b475

Please sign in to comment.