Skip to content

Commit

Permalink
Merge branch 'apache:main' into GAC-odbc-driver
Browse files Browse the repository at this point in the history
  • Loading branch information
alinaliBQ authored Nov 28, 2023
2 parents 39f8308 + 143b475 commit ef032c0
Show file tree
Hide file tree
Showing 25 changed files with 180 additions and 35 deletions.
2 changes: 1 addition & 1 deletion ci/scripts/PKGBUILD
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
_realname=arrow
pkgbase=mingw-w64-${_realname}
pkgname="${MINGW_PACKAGE_PREFIX}-${_realname}"
pkgver=14.0.0.9000
pkgver=14.0.1.9000
pkgrel=8000
pkgdesc="Apache Arrow is a cross-language development platform for in-memory data (mingw-w64)"
arch=("any")
Expand Down
14 changes: 7 additions & 7 deletions cpp/src/parquet/statistics.cc
Original file line number Diff line number Diff line change
Expand Up @@ -438,9 +438,9 @@ class TypedComparatorImpl
return Helper::Compare(type_length_, a, b);
}

bool Compare(const T& a, const T& b) override { return CompareInline(a, b); }
bool Compare(const T& a, const T& b) const override { return CompareInline(a, b); }

std::pair<T, T> GetMinMax(const T* values, int64_t length) override {
std::pair<T, T> GetMinMax(const T* values, int64_t length) const override {
DCHECK_GT(length, 0);

T min = Helper::DefaultMin();
Expand All @@ -457,7 +457,7 @@ class TypedComparatorImpl

std::pair<T, T> GetMinMaxSpaced(const T* values, int64_t length,
const uint8_t* valid_bits,
int64_t valid_bits_offset) override {
int64_t valid_bits_offset) const override {
DCHECK_GT(length, 0);

T min = Helper::DefaultMin();
Expand All @@ -477,7 +477,7 @@ class TypedComparatorImpl
return {min, max};
}

std::pair<T, T> GetMinMax(const ::arrow::Array& values) override {
std::pair<T, T> GetMinMax(const ::arrow::Array& values) const override {
ParquetException::NYI(values.type()->ToString());
}

Expand All @@ -491,7 +491,7 @@ class TypedComparatorImpl
template <>
std::pair<int32_t, int32_t>
TypedComparatorImpl</*is_signed=*/false, Int32Type>::GetMinMax(const int32_t* values,
int64_t length) {
int64_t length) const {
DCHECK_GT(length, 0);

const uint32_t* unsigned_values = reinterpret_cast<const uint32_t*>(values);
Expand Down Expand Up @@ -537,13 +537,13 @@ std::pair<ByteArray, ByteArray> GetMinMaxBinaryHelper(

template <>
std::pair<ByteArray, ByteArray> TypedComparatorImpl<true, ByteArrayType>::GetMinMax(
const ::arrow::Array& values) {
const ::arrow::Array& values) const {
return GetMinMaxBinaryHelper<true>(*this, values);
}

template <>
std::pair<ByteArray, ByteArray> TypedComparatorImpl<false, ByteArrayType>::GetMinMax(
const ::arrow::Array& values) {
const ::arrow::Array& values) const {
return GetMinMaxBinaryHelper<false>(*this, values);
}

Expand Down
8 changes: 4 additions & 4 deletions cpp/src/parquet/statistics.h
Original file line number Diff line number Diff line change
Expand Up @@ -73,16 +73,16 @@ class TypedComparator : public Comparator {

/// \brief Scalar comparison of two elements, return true if first
/// is strictly less than the second
virtual bool Compare(const T& a, const T& b) = 0;
virtual bool Compare(const T& a, const T& b) const = 0;

/// \brief Compute maximum and minimum elements in a batch of
/// elements without any nulls
virtual std::pair<T, T> GetMinMax(const T* values, int64_t length) = 0;
virtual std::pair<T, T> GetMinMax(const T* values, int64_t length) const = 0;

/// \brief Compute minimum and maximum elements from an Arrow array. Only
/// valid for certain Parquet Type / Arrow Type combinations, like BYTE_ARRAY
/// / arrow::BinaryArray
virtual std::pair<T, T> GetMinMax(const ::arrow::Array& values) = 0;
virtual std::pair<T, T> GetMinMax(const ::arrow::Array& values) const = 0;

/// \brief Compute maximum and minimum elements in a batch of
/// elements with accompanying bitmap indicating which elements are
Expand All @@ -96,7 +96,7 @@ class TypedComparator : public Comparator {
/// the first element in the sequence
virtual std::pair<T, T> GetMinMaxSpaced(const T* values, int64_t length,
const uint8_t* valid_bits,
int64_t valid_bits_offset) = 0;
int64_t valid_bits_offset) const = 0;
};

/// \brief Typed version of Comparator::Make
Expand Down
Original file line number Diff line number Diff line change
@@ -1,3 +1,9 @@
apache-arrow-apt-source (14.0.1-1) unstable; urgency=low

* New upstream release.

-- Raúl Cumplido <raulcumplido@gmail.com> Mon, 06 Nov 2023 22:23:27 -0000

apache-arrow-apt-source (14.0.0-1) unstable; urgency=low

* New upstream release.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,9 @@ else
fi

%changelog
* Mon Nov 06 2023 Raúl Cumplido <raulcumplido@gmail.com> - 14.0.1-1
- New upstream release.

* Thu Oct 19 2023 Raúl Cumplido <raulcumplido@gmail.com> - 14.0.0-1
- New upstream release.

Expand Down
6 changes: 6 additions & 0 deletions dev/tasks/linux-packages/apache-arrow/debian/changelog
Original file line number Diff line number Diff line change
@@ -1,3 +1,9 @@
apache-arrow (14.0.1-1) unstable; urgency=low

* New upstream release.

-- Raúl Cumplido <raulcumplido@gmail.com> Mon, 06 Nov 2023 22:23:27 -0000

apache-arrow (14.0.0-1) unstable; urgency=low

* New upstream release.
Expand Down
3 changes: 3 additions & 0 deletions dev/tasks/linux-packages/apache-arrow/yum/arrow.spec.in
Original file line number Diff line number Diff line change
Expand Up @@ -864,6 +864,9 @@ Documentation for Apache Parquet GLib.
%{_datadir}/gtk-doc/html/parquet-glib/

%changelog
* Mon Nov 06 2023 Raúl Cumplido <raulcumplido@gmail.com> - 14.0.1-1
- New upstream release.

* Thu Oct 19 2023 Raúl Cumplido <raulcumplido@gmail.com> - 14.0.0-1
- New upstream release.

Expand Down
5 changes: 5 additions & 0 deletions docs/source/_static/versions.json
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,11 @@
"url": "https://arrow.apache.org/docs/",
"preferred": true
},
{
"name": "14.0",
"version": "14.0/",
"url": "https://arrow.apache.org/docs/14.0/"
},
{
"name": "13.0",
"version": "13.0/",
Expand Down
3 changes: 3 additions & 0 deletions docs/source/developers/release.rst
Original file line number Diff line number Diff line change
Expand Up @@ -567,6 +567,9 @@ Be sure to go through on the following checklist:

.. code-block:: Bash
# You can run the script with BUMP_TAG=0 and BUMP_PUSH=0
# this will avoid default pushing to main and pushing the tag
# but you will require to push manually after reviewing the commits.
# dev/release/post-11-bump-versions.sh 10.0.0 11.0.0
dev/release/post-11-bump-versions.sh X.Y.Z NEXT_X.NEXT_Y.NEXT_Z
Expand Down
4 changes: 2 additions & 2 deletions docs/source/format/CDataInterface.rst
Original file line number Diff line number Diff line change
Expand Up @@ -207,9 +207,9 @@ names and types of child fields are read from the child arrays.
+------------------------+---------------------------------------------------+------------+
| ``+L`` | large list | |
+------------------------+---------------------------------------------------+------------+
| ``+lv`` | list-view | |
| ``+vl`` | list-view | |
+------------------------+---------------------------------------------------+------------+
| ``+Lv`` | large list-view | |
| ``+vL`` | large list-view | |
+------------------------+---------------------------------------------------+------------+
| ``+w:123`` | fixed-sized list [123 items] | |
+------------------------+---------------------------------------------------+------------+
Expand Down
2 changes: 1 addition & 1 deletion docs/source/format/Columnar.rst
Original file line number Diff line number Diff line change
Expand Up @@ -715,7 +715,7 @@ A struct array has its own validity bitmap that is independent of its
child arrays' validity bitmaps. The validity bitmap for the struct
array might indicate a null when one or more of its child arrays has
a non-null value in its corresponding slot; or conversely, a child
array might have a null in its validity bitmap while the struct array's
array might indicate a null in its validity bitmap while the struct array's
validity bitmap shows a non-null value.

Therefore, to know whether a particular child entry is valid, one must
Expand Down
2 changes: 2 additions & 0 deletions go/arrow/array.go
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,8 @@ type ArrayData interface {
// Dictionary returns the ArrayData object for the dictionary if this is a
// dictionary array, otherwise it will be nil.
Dictionary() ArrayData
// SizeInBytes returns the size of the ArrayData buffers and any children and/or dictionary in bytes.
SizeInBytes() uint64
}

// Array represents an immutable sequence of values using the Arrow in-memory format.
Expand Down
27 changes: 26 additions & 1 deletion go/arrow/array/data.go
Original file line number Diff line number Diff line change
Expand Up @@ -190,9 +190,34 @@ func (d *Data) SetDictionary(dict arrow.ArrayData) {
}
}

// SizeInBytes reports the total byte size of this Data, computed by
// recursively summing the sizes of its buffers, its children, and its
// dictionary (when present).
// The result is an upper bound, because the slice offset is ignored.
func (d *Data) SizeInBytes() uint64 {
	// Safe to call on a nil receiver: a nil Data occupies no bytes.
	if d == nil {
		return 0
	}

	total := uint64(0)

	// Bytes held directly by this node's buffers.
	for _, buf := range d.Buffers() {
		total += uint64(buf.Len())
	}
	// Recurse into nested child data.
	for _, child := range d.Children() {
		total += child.SizeInBytes()
	}
	// Recurse into the dictionary, if this is dictionary-encoded data.
	if dict := d.Dictionary(); dict != nil {
		total += dict.SizeInBytes()
	}

	return total
}

// NewSliceData returns a new slice that shares backing data with the input.
// The returned Data slice starts at i and extends j-i elements, such as:
// slice := data[i:j]
//
// slice := data[i:j]
//
// The returned value must be Release'd after use.
//
// NewSliceData panics if the slice is outside the valid range of the input Data.
Expand Down
75 changes: 75 additions & 0 deletions go/arrow/array/data_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -49,3 +49,78 @@ func TestDataReset(t *testing.T) {
data.Reset(&arrow.Int64Type{}, 5, data.Buffers(), nil, 1, 2)
}
}

// TestSizeInBytes verifies Data.SizeInBytes for plain buffers, nested
// children, dictionaries, and sliced data (where the offset is deliberately
// not subtracted, making the result an upper bound).
func TestSizeInBytes(t *testing.T) {
// Three 15-byte buffers => 45 bytes per Data node built from buffers1.
var buffers1 = make([]*memory.Buffer, 0, 3)

for i := 0; i < cap(buffers1); i++ {
buffers1 = append(buffers1, memory.NewBufferBytes([]byte("15-bytes-buffer")))
}
data := NewData(&arrow.StringType{}, 10, buffers1, nil, 0, 0)
var arrayData arrow.ArrayData = data
// Same buffers plus one child sharing them: 45 (own) + 45 (child) bytes.
dataWithChild := NewData(&arrow.StringType{}, 10, buffers1, []arrow.ArrayData{arrayData}, 0, 0)

t.Run("buffers only", func(t *testing.T) {
expectedSize := uint64(45)
if actualSize := data.SizeInBytes(); actualSize != expectedSize {
t.Errorf("expected size %d, got %d", expectedSize, actualSize)
}
})

t.Run("buffers and child data", func(t *testing.T) {
// 45 bytes in buffers, 45 bytes in child data
expectedSize := uint64(90)
if actualSize := dataWithChild.SizeInBytes(); actualSize != expectedSize {
t.Errorf("expected size %d, got %d", expectedSize, actualSize)
}
})

t.Run("buffers and nested child data", func(t *testing.T) {
var dataWithChildArrayData arrow.ArrayData = dataWithChild
var dataWithNestedChild arrow.ArrayData = NewData(&arrow.StringType{}, 10, buffers1, []arrow.ArrayData{dataWithChildArrayData}, 0, 0)
// 45 bytes in buffers, 90 bytes in nested child data
expectedSize := uint64(135)
if actualSize := dataWithNestedChild.SizeInBytes(); actualSize != expectedSize {
t.Errorf("expected size %d, got %d", expectedSize, actualSize)
}
})

t.Run("buffers and dictionary", func(t *testing.T) {
// Dictionary size is counted in addition to the node's own buffers.
dictData := data
dataWithDict := NewDataWithDictionary(&arrow.StringType{}, 10, buffers1, 0, 0, dictData)
// 45 bytes in buffers, 45 bytes in dictionary
expectedSize := uint64(90)
if actualSize := dataWithDict.SizeInBytes(); actualSize != expectedSize {
t.Errorf("expected size %d, got %d", expectedSize, actualSize)
}
})

t.Run("sliced data", func(t *testing.T) {
// A 2-element slice still reports the full backing-buffer size.
sliceData := NewSliceData(arrayData, 3, 5)
// offset is not taken into account in SizeInBytes()
expectedSize := uint64(45)
if actualSize := sliceData.SizeInBytes(); actualSize != expectedSize {
t.Errorf("expected size %d, got %d", expectedSize, actualSize)
}
})

t.Run("sliced data with children", func(t *testing.T) {
var dataWithChildArrayData arrow.ArrayData = dataWithChild
sliceData := NewSliceData(dataWithChildArrayData, 3, 5)
// offset is not taken into account in SizeInBytes()
expectedSize := uint64(90)
if actualSize := sliceData.SizeInBytes(); actualSize != expectedSize {
t.Errorf("expected size %d, got %d", expectedSize, actualSize)
}
})

t.Run("buffers with children which are sliced data", func(t *testing.T) {
// Sliced children contribute their full (unsliced) size as well.
sliceData := NewSliceData(arrayData, 3, 5)
dataWithSlicedChildren := NewData(&arrow.StringType{}, 10, buffers1, []arrow.ArrayData{sliceData}, 0, 0)
// offset is not taken into account in SizeInBytes()
expectedSize := uint64(90)
if actualSize := dataWithSlicedChildren.SizeInBytes(); actualSize != expectedSize {
t.Errorf("expected size %d, got %d", expectedSize, actualSize)
}
})
}
6 changes: 3 additions & 3 deletions js/src/enum.ts
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
// v4 doesn't seem to be able to tree-shake the rest of those exports.
//
// We will have to keep these enums in sync when we re-generate the flatbuffers
// code from the shchemas. See js/DEVELOP.md for info on how to run flatbuffers
// code from the schemas. See js/DEVELOP.md for info on how to run flatbuffers
// code generation.
//
////
Expand Down Expand Up @@ -174,7 +174,7 @@ export enum Type {
FixedSizeBinary = 15, /** Fixed-size binary. Each value occupies the same number of bytes */
FixedSizeList = 16, /** Fixed-size list. Each value occupies the same number of bytes */
Map = 17, /** Map of named logical types */
Duration = 18, /** Measure of elapsed time in either seconds, miliseconds, microseconds or nanoseconds. */
Duration = 18, /** Measure of elapsed time in either seconds, milliseconds, microseconds or nanoseconds. */

Dictionary = -1, /** Dictionary aka Category type */
Int8 = -2,
Expand Down Expand Up @@ -215,7 +215,7 @@ export enum BufferType {
OFFSET = 0,

/**
* actual data, either wixed width primitive types in slots or variable width delimited by an OFFSET vector
* actual data, either fixed width primitive types in slots or variable width delimited by an OFFSET vector
*/
DATA = 1,

Expand Down
2 changes: 1 addition & 1 deletion js/src/fb/timestamp.ts
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ import { TimeUnit } from './time-unit.js';
* no indication of how to map this information to a physical point in time.
* Naive date-times must be handled with care because of this missing
* information, and also because daylight saving time (DST) may make
* some values ambiguous or non-existent. A naive date-time may be
* some values ambiguous or nonexistent. A naive date-time may be
* stored as a struct with Date and Time fields. However, it may also be
* encoded into a Timestamp column with an empty timezone. The timestamp
* values should be computed "as if" the timezone of the date-time values
Expand Down
2 changes: 1 addition & 1 deletion js/src/ipc/reader.ts
Original file line number Diff line number Diff line change
Expand Up @@ -185,7 +185,7 @@ export class RecordBatchReader<T extends TypeMap = any> extends ReadableInterop<

//
// Since TS is a structural type system, we define the following subclass stubs
// so that concrete types exist to associate with with the interfaces below.
// so that concrete types exist to associate with the interfaces below.
//
// The implementation for each RecordBatchReader is hidden away in the set of
// `RecordBatchReaderImpl` classes in the second half of this file. This allows
Expand Down
4 changes: 2 additions & 2 deletions js/src/vector.ts
Original file line number Diff line number Diff line change
Expand Up @@ -302,8 +302,8 @@ export class Vector<T extends DataType = any> {
* values.
*
* Memoization is very useful when decoding a value is expensive such as
* Uft8. The memoization creates a cache of the size of the Vector and
* therfore increases memory usage.
* Utf8. The memoization creates a cache of the size of the Vector and
* therefore increases memory usage.
*
* @returns A new vector that memoizes calls to {@link get}.
*/
Expand Down
2 changes: 1 addition & 1 deletion js/src/visitor/builderctor.ts
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ export class GetBuilderCtor extends Visitor {
public visitDurationSecond() { return DurationSecondBuilder; }
public visitDurationMillisecond() { return DurationMillisecondBuilder; }
public visitDurationMicrosecond() { return DurationMicrosecondBuilder; }
public visistDurationNanosecond() { return DurationNanosecondBuilder; }
public visitDurationNanosecond() { return DurationNanosecondBuilder; }
public visitFixedSizeList() { return FixedSizeListBuilder; }
public visitMap() { return MapBuilder; }
}
Expand Down
2 changes: 1 addition & 1 deletion js/src/visitor/indexof.ts
Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,7 @@ function indexOfValue<T extends DataType>(data: Data<T>, searchElement?: T['TVal
function indexOfUnion<T extends DataType>(data: Data<T>, searchElement?: T['TValue'] | null, fromIndex?: number): number {
// Unions are special -- they do have a nullBitmap, but so can their children.
// If the searchElement is null, we don't know whether it came from the Union's
// bitmap or one of its childrens'. So we don't interrogate the Union's bitmap,
// bitmap or one of its children's. So we don't interrogate the Union's bitmap,
// since that will report the wrong index if a child has a null before the Union.
const get = getVisitor.getVisitFn(data);
const compare = createElementComparator(searchElement);
Expand Down
4 changes: 2 additions & 2 deletions js/src/visitor/jsonvectorassembler.ts
Original file line number Diff line number Diff line change
Expand Up @@ -62,9 +62,9 @@ export class JSONVectorAssembler extends Visitor {

/** @nocollapse */
public static assemble<T extends RecordBatch>(...batches: T[]) {
const assemlber = new JSONVectorAssembler();
const assembler = new JSONVectorAssembler();
return batches.map(({ schema, data }) => {
return assemlber.visitMany(schema.fields, data.children);
return assembler.visitMany(schema.fields, data.children);
});
}

Expand Down
2 changes: 1 addition & 1 deletion r/DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Package: arrow
Title: Integration to 'Apache' 'Arrow'
Version: 14.0.0.9000
Version: 14.0.1.9000
Authors@R: c(
person("Neal", "Richardson", email = "neal.p.richardson@gmail.com", role = c("aut")),
person("Ian", "Cook", email = "ianmcook@gmail.com", role = c("aut")),
Expand Down
Loading

0 comments on commit ef032c0

Please sign in to comment.