From 4e0d2263d27caef58447a49b139f76be08994fb6 Mon Sep 17 00:00:00 2001
From: Yifeng-Sigma
Date: Tue, 28 Nov 2023 11:09:24 -0600
Subject: [PATCH] GH-38836: [Go] Add SizeInBytes() for ArrayData (#38839)

### Rationale for this change
Addresses https://github.com/apache/arrow/issues/38836

### What changes are included in this PR?
Add a new method SizeInBytes() to the ArrayData interface, implemented by array.Data, to calculate the size in bytes of an ArrayData's buffers, children and dictionary.

### Are these changes tested?
Yes, unit tests are added in arrow/array/data_test.go (TestSizeInBytes).

### Are there any user-facing changes?
No changes to existing behavior; the exported ArrayData interface gains the SizeInBytes() method.

* Closes: #38836

Lead-authored-by: Yifeng Wu
Co-authored-by: Matt Topol
Co-authored-by: Yifeng-Sigma
Signed-off-by: Matt Topol
---
 arrow/array.go           |  2 ++
 arrow/array/data.go      | 27 ++++++++++++++-
 arrow/array/data_test.go | 75 ++++++++++++++++++++++++++++++++++++++++
 3 files changed, 103 insertions(+), 1 deletion(-)

diff --git a/arrow/array.go b/arrow/array.go
index e07fa478..eed859cf 100644
--- a/arrow/array.go
+++ b/arrow/array.go
@@ -81,6 +81,8 @@ type ArrayData interface {
 	// Dictionary returns the ArrayData object for the dictionary if this is a
 	// dictionary array, otherwise it will be nil.
 	Dictionary() ArrayData
+	// SizeInBytes returns the size of the ArrayData buffers and any children and/or dictionary in bytes.
+	SizeInBytes() uint64
 }
 
 // Array represents an immutable sequence of values using the Arrow in-memory format.
diff --git a/arrow/array/data.go b/arrow/array/data.go
index 8cce4918..3c859ec3 100644
--- a/arrow/array/data.go
+++ b/arrow/array/data.go
@@ -190,9 +190,34 @@ func (d *Data) SetDictionary(dict arrow.ArrayData) {
 	}
 }
 
+// SizeInBytes returns the size of the Data and any children and/or dictionary in bytes by
+// recursively examining the nested structures of children and/or dictionary.
+// The value returned is an upper-bound since offset is not taken into account.
+func (d *Data) SizeInBytes() uint64 { + var size uint64 + + if d == nil { + return 0 + } + + for _, b := range d.Buffers() { + size += uint64(b.Len()) + } + for _, c := range d.Children() { + size += c.SizeInBytes() + } + if dict := d.Dictionary(); dict != nil { + size += dict.SizeInBytes() + } + + return size +} + // NewSliceData returns a new slice that shares backing data with the input. // The returned Data slice starts at i and extends j-i elements, such as: -// slice := data[i:j] +// +// slice := data[i:j] +// // The returned value must be Release'd after use. // // NewSliceData panics if the slice is outside the valid range of the input Data. diff --git a/arrow/array/data_test.go b/arrow/array/data_test.go index b7b0f396..dd4793a7 100644 --- a/arrow/array/data_test.go +++ b/arrow/array/data_test.go @@ -49,3 +49,78 @@ func TestDataReset(t *testing.T) { data.Reset(&arrow.Int64Type{}, 5, data.Buffers(), nil, 1, 2) } } + +func TestSizeInBytes(t *testing.T) { + var buffers1 = make([]*memory.Buffer, 0, 3) + + for i := 0; i < cap(buffers1); i++ { + buffers1 = append(buffers1, memory.NewBufferBytes([]byte("15-bytes-buffer"))) + } + data := NewData(&arrow.StringType{}, 10, buffers1, nil, 0, 0) + var arrayData arrow.ArrayData = data + dataWithChild := NewData(&arrow.StringType{}, 10, buffers1, []arrow.ArrayData{arrayData}, 0, 0) + + t.Run("buffers only", func(t *testing.T) { + expectedSize := uint64(45) + if actualSize := data.SizeInBytes(); actualSize != expectedSize { + t.Errorf("expected size %d, got %d", expectedSize, actualSize) + } + }) + + t.Run("buffers and child data", func(t *testing.T) { + // 45 bytes in buffers, 45 bytes in child data + expectedSize := uint64(90) + if actualSize := dataWithChild.SizeInBytes(); actualSize != expectedSize { + t.Errorf("expected size %d, got %d", expectedSize, actualSize) + } + }) + + t.Run("buffers and nested child data", func(t *testing.T) { + var dataWithChildArrayData arrow.ArrayData = dataWithChild + var 
dataWithNestedChild arrow.ArrayData = NewData(&arrow.StringType{}, 10, buffers1, []arrow.ArrayData{dataWithChildArrayData}, 0, 0) + // 45 bytes in buffers, 90 bytes in nested child data + expectedSize := uint64(135) + if actualSize := dataWithNestedChild.SizeInBytes(); actualSize != expectedSize { + t.Errorf("expected size %d, got %d", expectedSize, actualSize) + } + }) + + t.Run("buffers and dictionary", func(t *testing.T) { + dictData := data + dataWithDict := NewDataWithDictionary(&arrow.StringType{}, 10, buffers1, 0, 0, dictData) + // 45 bytes in buffers, 45 bytes in dictionary + expectedSize := uint64(90) + if actualSize := dataWithDict.SizeInBytes(); actualSize != expectedSize { + t.Errorf("expected size %d, got %d", expectedSize, actualSize) + } + }) + + t.Run("sliced data", func(t *testing.T) { + sliceData := NewSliceData(arrayData, 3, 5) + // offset is not taken into account in SizeInBytes() + expectedSize := uint64(45) + if actualSize := sliceData.SizeInBytes(); actualSize != expectedSize { + t.Errorf("expected size %d, got %d", expectedSize, actualSize) + } + }) + + t.Run("sliced data with children", func(t *testing.T) { + var dataWithChildArrayData arrow.ArrayData = dataWithChild + sliceData := NewSliceData(dataWithChildArrayData, 3, 5) + // offset is not taken into account in SizeInBytes() + expectedSize := uint64(90) + if actualSize := sliceData.SizeInBytes(); actualSize != expectedSize { + t.Errorf("expected size %d, got %d", expectedSize, actualSize) + } + }) + + t.Run("buffers with children which are sliced data", func(t *testing.T) { + sliceData := NewSliceData(arrayData, 3, 5) + dataWithSlicedChildren := NewData(&arrow.StringType{}, 10, buffers1, []arrow.ArrayData{sliceData}, 0, 0) + // offset is not taken into account in SizeInBytes() + expectedSize := uint64(90) + if actualSize := dataWithSlicedChildren.SizeInBytes(); actualSize != expectedSize { + t.Errorf("expected size %d, got %d", expectedSize, actualSize) + } + }) +}