From 525881987d0b9b4f464c3e3593a9a7b4e3c767d0 Mon Sep 17 00:00:00 2001 From: Joel Lubinitsky <33523178+joellubi@users.noreply.github.com> Date: Tue, 20 Aug 2024 20:25:19 -0400 Subject: [PATCH 001/186] GH-17682: [C++][Python] Bool8 Extension Type Implementation (#43488) ### Rationale for this change C++ and Python implementations of #43234 ### What changes are included in this PR? - Implement C++ `Bool8Type`, `Bool8Array`, `Bool8Scalar`, and tests - Implement Python bindings to C++, as well as zero-copy numpy conversion methods - TODO: docs waiting for rebase on #43458 ### Are these changes tested? Yes ### Are there any user-facing changes? Bool8 extension type will be available in C++ and Python libraries * GitHub Issue: #17682 Authored-by: Joel Lubinitsky Signed-off-by: Felipe Oliveira Carvalho --- cpp/src/arrow/CMakeLists.txt | 1 + cpp/src/arrow/extension/CMakeLists.txt | 6 + cpp/src/arrow/extension/bool8.cc | 61 ++++++++ cpp/src/arrow/extension/bool8.h | 58 ++++++++ cpp/src/arrow/extension/bool8_test.cc | 91 ++++++++++++ cpp/src/arrow/extension_type.cc | 7 +- python/pyarrow/__init__.py | 7 +- python/pyarrow/array.pxi | 114 ++++++++++++++- python/pyarrow/includes/libarrow.pxd | 9 ++ python/pyarrow/lib.pxd | 3 + python/pyarrow/public-api.pxi | 2 + python/pyarrow/scalar.pxi | 23 ++- python/pyarrow/tests/test_extension_type.py | 152 ++++++++++++++++++++ python/pyarrow/tests/test_misc.py | 3 + python/pyarrow/types.pxi | 74 ++++++++++ 15 files changed, 604 insertions(+), 7 deletions(-) create mode 100644 cpp/src/arrow/extension/bool8.cc create mode 100644 cpp/src/arrow/extension/bool8.h create mode 100644 cpp/src/arrow/extension/bool8_test.cc diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index fb785e1e9571b..fb7253b6fd69d 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -906,6 +906,7 @@ endif() if(ARROW_JSON) arrow_add_object_library(ARROW_JSON + extension/bool8.cc extension/fixed_shape_tensor.cc extension/opaque.cc json/options.cc diff --git a/cpp/src/arrow/extension/CMakeLists.txt b/cpp/src/arrow/extension/CMakeLists.txt index 6741ab602f50b..fcd5fa529ab56 100644 --- a/cpp/src/arrow/extension/CMakeLists.txt +++ b/cpp/src/arrow/extension/CMakeLists.txt @@ -15,6 +15,12 @@ # specific language governing permissions and limitations # under the License. +add_arrow_test(test + SOURCES + bool8_test.cc + PREFIX + "arrow-extension-bool8") + add_arrow_test(test SOURCES fixed_shape_tensor_test.cc diff --git a/cpp/src/arrow/extension/bool8.cc b/cpp/src/arrow/extension/bool8.cc new file mode 100644 index 0000000000000..c081f0c2b2866 --- /dev/null +++ b/cpp/src/arrow/extension/bool8.cc @@ -0,0 +1,61 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
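+// Illustrative usage sketch (mirrors bool8_test.cc; for orientation only,
+// not part of the public docs):
+//
+//   auto type = extension::bool8();
+//   auto storage = ArrayFromJSON(int8(), "[-1,0,1,2,null]");
+//   auto array = ExtensionType::WrapArray(type, storage);
+//
+// Any non-zero int8 storage value reads back as true; nulls are preserved.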
+ +#include + +#include "arrow/extension/bool8.h" +#include "arrow/util/logging.h" + +namespace arrow::extension { + +bool Bool8Type::ExtensionEquals(const ExtensionType& other) const { + return extension_name() == other.extension_name(); +} + +std::string Bool8Type::ToString(bool show_metadata) const { + std::stringstream ss; + ss << "extension<" << this->extension_name() << ">"; + return ss.str(); +} + +std::string Bool8Type::Serialize() const { return ""; } + +Result> Bool8Type::Deserialize( + std::shared_ptr storage_type, const std::string& serialized_data) const { + if (storage_type->id() != Type::INT8) { + return Status::Invalid("Expected INT8 storage type, got ", storage_type->ToString()); + } + if (serialized_data != "") { + return Status::Invalid("Serialize data must be empty, got ", serialized_data); + } + return bool8(); +} + +std::shared_ptr Bool8Type::MakeArray(std::shared_ptr data) const { + DCHECK_EQ(data->type->id(), Type::EXTENSION); + DCHECK_EQ("arrow.bool8", + internal::checked_cast(*data->type).extension_name()); + return std::make_shared(data); +} + +Result> Bool8Type::Make() { + return std::make_shared(); +} + +std::shared_ptr bool8() { return std::make_shared(); } + +} // namespace arrow::extension diff --git a/cpp/src/arrow/extension/bool8.h b/cpp/src/arrow/extension/bool8.h new file mode 100644 index 0000000000000..02e629b28a867 --- /dev/null +++ b/cpp/src/arrow/extension/bool8.h @@ -0,0 +1,58 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/extension_type.h" + +namespace arrow::extension { + +/// \brief Bool8 is an alternate representation for boolean +/// arrays using 8 bits instead of 1 bit per value. The underlying +/// storage type is int8. +class ARROW_EXPORT Bool8Array : public ExtensionArray { + public: + using ExtensionArray::ExtensionArray; +}; + +/// \brief Bool8 is an alternate representation for boolean +/// arrays using 8 bits instead of 1 bit per value. The underlying +/// storage type is int8. +class ARROW_EXPORT Bool8Type : public ExtensionType { + public: + /// \brief Construct a Bool8Type. + Bool8Type() : ExtensionType(int8()) {} + + std::string extension_name() const override { return "arrow.bool8"; } + std::string ToString(bool show_metadata = false) const override; + + bool ExtensionEquals(const ExtensionType& other) const override; + + std::string Serialize() const override; + + Result> Deserialize( + std::shared_ptr storage_type, + const std::string& serialized_data) const override; + + /// Create a Bool8Array from ArrayData + std::shared_ptr MakeArray(std::shared_ptr data) const override; + + static Result> Make(); +}; + +/// \brief Return a Bool8Type instance. 
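+///
+/// Unlike Bool8Type::Make(), this does not return a Result. Instances compare
+/// equal to any other Bool8Type (equality is by extension name, "arrow.bool8").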
+ARROW_EXPORT std::shared_ptr bool8(); + +} // namespace arrow::extension diff --git a/cpp/src/arrow/extension/bool8_test.cc b/cpp/src/arrow/extension/bool8_test.cc new file mode 100644 index 0000000000000..eabcfcf62d32c --- /dev/null +++ b/cpp/src/arrow/extension/bool8_test.cc @@ -0,0 +1,91 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/extension/bool8.h" +#include "arrow/io/memory.h" +#include "arrow/ipc/reader.h" +#include "arrow/ipc/writer.h" +#include "arrow/testing/extension_type.h" +#include "arrow/testing/gtest_util.h" + +namespace arrow { + +TEST(Bool8Type, Basics) { + auto type = internal::checked_pointer_cast(extension::bool8()); + auto type2 = internal::checked_pointer_cast(extension::bool8()); + ASSERT_EQ("arrow.bool8", type->extension_name()); + ASSERT_EQ(*type, *type); + ASSERT_NE(*arrow::null(), *type); + ASSERT_EQ(*type, *type2); + ASSERT_EQ(*arrow::int8(), *type->storage_type()); + ASSERT_EQ("", type->Serialize()); + ASSERT_EQ("extension", type->ToString(false)); +} + +TEST(Bool8Type, CreateFromArray) { + auto type = internal::checked_pointer_cast(extension::bool8()); + auto storage = ArrayFromJSON(int8(), "[-1,0,1,2,null]"); + auto array = ExtensionType::WrapArray(type, storage); + ASSERT_EQ(5, array->length()); + ASSERT_EQ(1, array->null_count()); +} + +TEST(Bool8Type, Deserialize) { + auto type = internal::checked_pointer_cast(extension::bool8()); + ASSERT_OK_AND_ASSIGN(auto deserialized, type->Deserialize(type->storage_type(), "")); + ASSERT_EQ(*type, *deserialized); + ASSERT_NOT_OK(type->Deserialize(type->storage_type(), "must be empty")); + ASSERT_EQ(*type, *deserialized); + ASSERT_NOT_OK(type->Deserialize(uint8(), "")); + ASSERT_EQ(*type, *deserialized); +} + +TEST(Bool8Type, MetadataRoundTrip) { + auto type = internal::checked_pointer_cast(extension::bool8()); + std::string serialized = type->Serialize(); + ASSERT_OK_AND_ASSIGN(auto deserialized, + type->Deserialize(type->storage_type(), serialized)); + ASSERT_EQ(*type, *deserialized); +} + +TEST(Bool8Type, BatchRoundTrip) { + auto type = internal::checked_pointer_cast(extension::bool8()); + + auto storage = ArrayFromJSON(int8(), "[-1,0,1,2,null]"); + auto array = ExtensionType::WrapArray(type, storage); + auto batch = + RecordBatch::Make(schema({field("field", type)}), array->length(), {array}); + + std::shared_ptr written; + { + ASSERT_OK_AND_ASSIGN(auto out_stream, io::BufferOutputStream::Create()); + ASSERT_OK(ipc::WriteRecordBatchStream({batch}, ipc::IpcWriteOptions::Defaults(), + out_stream.get())); + + ASSERT_OK_AND_ASSIGN(auto complete_ipc_stream, out_stream->Finish()); + + io::BufferReader reader(complete_ipc_stream); + std::shared_ptr batch_reader; + ASSERT_OK_AND_ASSIGN(batch_reader, ipc::RecordBatchStreamReader::Open(&reader)); + 
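+    // The bool8 extension type is registered globally (see extension_type.cc in
+    // this patch), so the reader reconstructs Bool8Type rather than plain int8.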
ASSERT_OK(batch_reader->ReadNext(&written)); + } + + ASSERT_EQ(*batch->schema(), *written->schema()); + ASSERT_BATCHES_EQUAL(*batch, *written); +} + +} // namespace arrow diff --git a/cpp/src/arrow/extension_type.cc b/cpp/src/arrow/extension_type.cc index cf8dda7a85df4..685018f7de7b8 100644 --- a/cpp/src/arrow/extension_type.cc +++ b/cpp/src/arrow/extension_type.cc @@ -28,6 +28,7 @@ #include "arrow/chunked_array.h" #include "arrow/config.h" #ifdef ARROW_JSON +#include "arrow/extension/bool8.h" #include "arrow/extension/fixed_shape_tensor.h" #endif #include "arrow/status.h" @@ -146,10 +147,12 @@ static void CreateGlobalRegistry() { #ifdef ARROW_JSON // Register canonical extension types - auto ext_type = + auto fst_ext_type = checked_pointer_cast(extension::fixed_shape_tensor(int64(), {})); + ARROW_CHECK_OK(g_registry->RegisterType(fst_ext_type)); - ARROW_CHECK_OK(g_registry->RegisterType(ext_type)); + auto bool8_ext_type = checked_pointer_cast(extension::bool8()); + ARROW_CHECK_OK(g_registry->RegisterType(bool8_ext_type)); #endif } diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py index aa7bab9f97e05..807bcdc315036 100644 --- a/python/pyarrow/__init__.py +++ b/python/pyarrow/__init__.py @@ -174,6 +174,7 @@ def print_entry(label, value): run_end_encoded, fixed_shape_tensor, opaque, + bool8, field, type_for_alias, DataType, DictionaryType, StructType, @@ -184,7 +185,7 @@ def print_entry(label, value): FixedSizeBinaryType, Decimal128Type, Decimal256Type, BaseExtensionType, ExtensionType, RunEndEncodedType, FixedShapeTensorType, OpaqueType, - PyExtensionType, UnknownExtensionType, + Bool8Type, PyExtensionType, UnknownExtensionType, register_extension_type, unregister_extension_type, DictionaryMemo, KeyValueMetadata, @@ -218,7 +219,7 @@ def print_entry(label, value): MonthDayNanoIntervalArray, Decimal128Array, Decimal256Array, StructArray, ExtensionArray, RunEndEncodedArray, FixedShapeTensorArray, OpaqueArray, - scalar, NA, _NULL as NULL, Scalar, + Bool8Array, scalar, NA, _NULL as NULL, Scalar, NullScalar, BooleanScalar, Int8Scalar, Int16Scalar, Int32Scalar, Int64Scalar, UInt8Scalar, UInt16Scalar, UInt32Scalar, UInt64Scalar, @@ -235,7 +236,7 @@ def print_entry(label, value): FixedSizeBinaryScalar, DictionaryScalar, MapScalar, StructScalar, UnionScalar, RunEndEncodedScalar, ExtensionScalar, - FixedShapeTensorScalar, OpaqueScalar) + FixedShapeTensorScalar, OpaqueScalar, Bool8Scalar) # Buffers, allocation from pyarrow.lib import (DeviceAllocationType, Device, MemoryManager, diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 6c40a21db96ca..4c3eb93232634 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -1581,7 +1581,7 @@ cdef class Array(_PandasConvertible): def to_numpy(self, zero_copy_only=True, writable=False): """ - Return a NumPy view or copy of this array (experimental). + Return a NumPy view or copy of this array. By default, tries to return a view of this array. This is only supported for primitive arrays with the same memory layout as NumPy @@ -4476,6 +4476,118 @@ cdef class OpaqueArray(ExtensionArray): """ +cdef class Bool8Array(ExtensionArray): + """ + Concrete class for bool8 extension arrays. 
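+
+    Values are stored as int8: any non-zero value reads back as True, zero as
+    False, and nulls are preserved.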
+ + Examples + -------- + Define the extension type for an bool8 array + + >>> import pyarrow as pa + >>> bool8_type = pa.bool8() + + Create an extension array + + >>> arr = [-1, 0, 1, 2, None] + >>> storage = pa.array(arr, pa.int8()) + >>> pa.ExtensionArray.from_storage(bool8_type, storage) + + [ + -1, + 0, + 1, + 2, + null + ] + """ + + def to_numpy(self, zero_copy_only=True, writable=False): + """ + Return a NumPy bool view or copy of this array. + + By default, tries to return a view of this array. This is only + supported for arrays without any nulls. + + Parameters + ---------- + zero_copy_only : bool, default True + If True, an exception will be raised if the conversion to a numpy + array would require copying the underlying data (e.g. in presence + of nulls). + writable : bool, default False + For numpy arrays created with zero copy (view on the Arrow data), + the resulting array is not writable (Arrow data is immutable). + By setting this to True, a copy of the array is made to ensure + it is writable. + + Returns + ------- + array : numpy.ndarray + """ + if not writable: + try: + return self.storage.to_numpy().view(np.bool_) + except ArrowInvalid as e: + if zero_copy_only: + raise e + + return _pc().not_equal(self.storage, 0).to_numpy(zero_copy_only=zero_copy_only, writable=writable) + + @staticmethod + def from_storage(Int8Array storage): + """ + Construct Bool8Array from Int8Array storage. + + Parameters + ---------- + storage : Int8Array + The underlying storage for the result array. + + Returns + ------- + bool8_array : Bool8Array + """ + return ExtensionArray.from_storage(bool8(), storage) + + @staticmethod + def from_numpy(obj): + """ + Convert numpy array to a bool8 extension array without making a copy. + The input array must be 1-dimensional, with either bool_ or int8 dtype. 
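+    The result is backed by the same memory as the input (a zero-copy int8
+    view), and since numpy inputs cannot carry nulls it contains no nulls.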
+ + Parameters + ---------- + obj : numpy.ndarray + + Returns + ------- + bool8_array : Bool8Array + + Examples + -------- + >>> import pyarrow as pa + >>> import numpy as np + >>> arr = np.array([True, False, True], dtype=np.bool_) + >>> pa.Bool8Array.from_numpy(arr) + + [ + 1, + 0, + 1 + ] + """ + + if obj.ndim != 1: + raise ValueError(f"Cannot convert {obj.ndim}-D array to bool8 array") + + if obj.dtype not in [np.bool_, np.int8]: + raise TypeError(f"Array dtype {obj.dtype} incompatible with bool8 storage") + + storage_arr = array(obj.view(np.int8), type=int8()) + return Bool8Array.from_storage(storage_arr) + + cdef dict _array_classes = { _Type_NA: NullArray, _Type_BOOL: BooleanArray, diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 9b008d150f1f1..a54a1db292f70 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -2895,6 +2895,15 @@ cdef extern from "arrow/extension/opaque.h" namespace "arrow::extension" nogil: pass +cdef extern from "arrow/extension/bool8.h" namespace "arrow::extension" nogil: + cdef cppclass CBool8Type" arrow::extension::Bool8Type"(CExtensionType): + + @staticmethod + CResult[shared_ptr[CDataType]] Make() + + cdef cppclass CBool8Array" arrow::extension::Bool8Array"(CExtensionArray): + pass + cdef extern from "arrow/util/compression.h" namespace "arrow" nogil: cdef enum CCompressionType" arrow::Compression::type": CCompressionType_UNCOMPRESSED" arrow::Compression::UNCOMPRESSED" diff --git a/python/pyarrow/lib.pxd b/python/pyarrow/lib.pxd index 2cb302d20a8ac..e3625c1815274 100644 --- a/python/pyarrow/lib.pxd +++ b/python/pyarrow/lib.pxd @@ -214,6 +214,9 @@ cdef class FixedShapeTensorType(BaseExtensionType): cdef: const CFixedShapeTensorType* tensor_ext_type +cdef class Bool8Type(BaseExtensionType): + cdef: + const CBool8Type* bool8_ext_type cdef class OpaqueType(BaseExtensionType): cdef: diff --git a/python/pyarrow/public-api.pxi b/python/pyarrow/public-api.pxi index 2f9fc1c554209..19a26bd6c683d 100644 --- a/python/pyarrow/public-api.pxi +++ b/python/pyarrow/public-api.pxi @@ -126,6 +126,8 @@ cdef api object pyarrow_wrap_data_type( out = FixedShapeTensorType.__new__(FixedShapeTensorType) elif ext_type.extension_name() == b"arrow.opaque": out = OpaqueType.__new__(OpaqueType) + elif ext_type.extension_name() == b"arrow.bool8": + out = Bool8Type.__new__(Bool8Type) else: out = BaseExtensionType.__new__(BaseExtensionType) else: diff --git a/python/pyarrow/scalar.pxi b/python/pyarrow/scalar.pxi index 12a99c2aece63..72ae2aee5f8b3 100644 --- a/python/pyarrow/scalar.pxi +++ b/python/pyarrow/scalar.pxi @@ -1091,6 +1091,18 @@ cdef class OpaqueScalar(ExtensionScalar): """ +cdef class Bool8Scalar(ExtensionScalar): + """ + Concrete class for bool8 extension scalar. + """ + + def as_py(self): + """ + Return this scalar as a Python object. 
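+
+        Non-zero storage values map to True, zero maps to False, and null
+        maps to None, matching the semantics of the arrow.bool8 extension.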
+ """ + py_val = super().as_py() + return None if py_val is None else py_val != 0 + cdef dict _scalar_classes = { _Type_BOOL: BooleanScalar, _Type_UINT8: UInt8Scalar, @@ -1199,6 +1211,11 @@ def scalar(value, type=None, *, from_pandas=None, MemoryPool memory_pool=None): type = ensure_type(type, allow_none=True) pool = maybe_unbox_memory_pool(memory_pool) + extension_type = None + if type is not None and type.id == _Type_EXTENSION: + extension_type = type + type = type.storage_type + if _is_array_like(value): value = get_values(value, &is_pandas_object) @@ -1223,4 +1240,8 @@ def scalar(value, type=None, *, from_pandas=None, MemoryPool memory_pool=None): # retrieve the scalar from the first position scalar = GetResultValue(array.get().GetScalar(0)) - return Scalar.wrap(scalar) + result = Scalar.wrap(scalar) + + if extension_type is not None: + result = ExtensionScalar.from_storage(extension_type, result) + return result diff --git a/python/pyarrow/tests/test_extension_type.py b/python/pyarrow/tests/test_extension_type.py index 58c54189f223e..b04ee85ec99ad 100644 --- a/python/pyarrow/tests/test_extension_type.py +++ b/python/pyarrow/tests/test_extension_type.py @@ -1707,3 +1707,155 @@ def test_opaque_type(pickle_module, storage_type, storage): # cast extension type -> storage type inner = arr.cast(storage_type) assert inner == storage + + +def test_bool8_type(pickle_module): + bool8_type = pa.bool8() + storage_type = pa.int8() + assert bool8_type.extension_name == "arrow.bool8" + assert bool8_type.storage_type == storage_type + assert str(bool8_type) == "extension" + + assert bool8_type == bool8_type + assert bool8_type == pa.bool8() + assert bool8_type != storage_type + + # Pickle roundtrip + result = pickle_module.loads(pickle_module.dumps(bool8_type)) + assert result == bool8_type + + # IPC roundtrip + storage = pa.array([-1, 0, 1, 2, None], storage_type) + arr = pa.ExtensionArray.from_storage(bool8_type, storage) + assert isinstance(arr, pa.Bool8Array) + + # extension is registered by default + buf = ipc_write_batch(pa.RecordBatch.from_arrays([arr], ["ext"])) + batch = ipc_read_batch(buf) + + assert batch.column(0).type.extension_name == "arrow.bool8" + assert isinstance(batch.column(0), pa.Bool8Array) + + # cast storage -> extension type + result = storage.cast(bool8_type) + assert result == arr + + # cast extension type -> storage type + inner = arr.cast(storage_type) + assert inner == storage + + +def test_bool8_to_bool_conversion(): + bool_arr = pa.array([True, False, True, True, None], pa.bool_()) + bool8_arr = pa.ExtensionArray.from_storage( + pa.bool8(), + pa.array([-1, 0, 1, 2, None], pa.int8()), + ) + + # cast extension type -> arrow boolean type + assert bool8_arr.cast(pa.bool_()) == bool_arr + + # cast arrow boolean type -> extension type, expecting canonical values + canonical_storage = pa.array([1, 0, 1, 1, None], pa.int8()) + canonical_bool8_arr = pa.ExtensionArray.from_storage(pa.bool8(), canonical_storage) + assert bool_arr.cast(pa.bool8()) == canonical_bool8_arr + + +def test_bool8_to_numpy_conversion(): + arr = pa.ExtensionArray.from_storage( + pa.bool8(), + pa.array([-1, 0, 1, 2, None], pa.int8()), + ) + + # cannot zero-copy with nulls + with pytest.raises( + pa.ArrowInvalid, + match="Needed to copy 1 chunks with 1 nulls, but zero_copy_only was True", + ): + arr.to_numpy() + + # nullable conversion possible with a copy, but dest dtype is object + assert np.array_equal( + arr.to_numpy(zero_copy_only=False), + np.array([True, False, True, True, None], dtype=np.object_), + 
) + + # zero-copy possible with non-null array + np_arr_no_nulls = np.array([True, False, True, True], dtype=np.bool_) + arr_no_nulls = pa.ExtensionArray.from_storage( + pa.bool8(), + pa.array([-1, 0, 1, 2], pa.int8()), + ) + + arr_to_np = arr_no_nulls.to_numpy() + assert np.array_equal(arr_to_np, np_arr_no_nulls) + + # same underlying buffer + assert arr_to_np.ctypes.data == arr_no_nulls.buffers()[1].address + + # if the user requests a writable array, a copy should be performed + arr_to_np_writable = arr_no_nulls.to_numpy(zero_copy_only=False, writable=True) + assert np.array_equal(arr_to_np_writable, np_arr_no_nulls) + + # different underlying buffer + assert arr_to_np_writable.ctypes.data != arr_no_nulls.buffers()[1].address + + +def test_bool8_from_numpy_conversion(): + np_arr_no_nulls = np.array([True, False, True, True], dtype=np.bool_) + canonical_bool8_arr_no_nulls = pa.ExtensionArray.from_storage( + pa.bool8(), + pa.array([1, 0, 1, 1], pa.int8()), + ) + + arr_from_np = pa.Bool8Array.from_numpy(np_arr_no_nulls) + assert arr_from_np == canonical_bool8_arr_no_nulls + + # same underlying buffer + assert arr_from_np.buffers()[1].address == np_arr_no_nulls.ctypes.data + + # conversion only valid for 1-D arrays + with pytest.raises( + ValueError, + match="Cannot convert 2-D array to bool8 array", + ): + pa.Bool8Array.from_numpy( + np.array([[True, False], [False, True]], dtype=np.bool_), + ) + + with pytest.raises( + ValueError, + match="Cannot convert 0-D array to bool8 array", + ): + pa.Bool8Array.from_numpy(np.bool_()) + + # must use compatible storage type + with pytest.raises( + TypeError, + match="Array dtype float64 incompatible with bool8 storage", + ): + pa.Bool8Array.from_numpy(np.array([1, 2, 3], dtype=np.float64)) + + +def test_bool8_scalar(): + assert pa.ExtensionScalar.from_storage(pa.bool8(), -1).as_py() is True + assert pa.ExtensionScalar.from_storage(pa.bool8(), 0).as_py() is False + assert pa.ExtensionScalar.from_storage(pa.bool8(), 1).as_py() is True + assert pa.ExtensionScalar.from_storage(pa.bool8(), 2).as_py() is True + assert pa.ExtensionScalar.from_storage(pa.bool8(), None).as_py() is None + + arr = pa.ExtensionArray.from_storage( + pa.bool8(), + pa.array([-1, 0, 1, 2, None], pa.int8()), + ) + assert arr[0].as_py() is True + assert arr[1].as_py() is False + assert arr[2].as_py() is True + assert arr[3].as_py() is True + assert arr[4].as_py() is None + + assert pa.scalar(-1, type=pa.bool8()).as_py() is True + assert pa.scalar(0, type=pa.bool8()).as_py() is False + assert pa.scalar(1, type=pa.bool8()).as_py() is True + assert pa.scalar(2, type=pa.bool8()).as_py() is True + assert pa.scalar(None, type=pa.bool8()).as_py() is None diff --git a/python/pyarrow/tests/test_misc.py b/python/pyarrow/tests/test_misc.py index 9a55a38177fc8..5d3471c7c35db 100644 --- a/python/pyarrow/tests/test_misc.py +++ b/python/pyarrow/tests/test_misc.py @@ -250,6 +250,9 @@ def test_set_timezone_db_path_non_windows(): pa.OpaqueArray, pa.OpaqueScalar, pa.OpaqueType, + pa.Bool8Array, + pa.Bool8Scalar, + pa.Bool8Type, ]) def test_extension_type_constructor_errors(klass): # ARROW-2638: prevent calling extension class constructors directly diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi index dcd2b61c33411..563782f0c2643 100644 --- a/python/pyarrow/types.pxi +++ b/python/pyarrow/types.pxi @@ -1837,6 +1837,37 @@ cdef class FixedShapeTensorType(BaseExtensionType): return FixedShapeTensorScalar +cdef class Bool8Type(BaseExtensionType): + """ + Concrete class for bool8 extension 
type. + + Bool8 is an alternate representation for boolean + arrays using 8 bits instead of 1 bit per value. The underlying + storage type is int8. + + Examples + -------- + Create an instance of bool8 extension type: + + >>> import pyarrow as pa + >>> pa.bool8() + Bool8Type(extension) + """ + + cdef void init(self, const shared_ptr[CDataType]& type) except *: + BaseExtensionType.init(self, type) + self.bool8_ext_type = type.get() + + def __arrow_ext_class__(self): + return Bool8Array + + def __reduce__(self): + return bool8, () + + def __arrow_ext_scalar_class__(self): + return Bool8Scalar + + cdef class OpaqueType(BaseExtensionType): """ Concrete class for opaque extension type. @@ -5278,6 +5309,49 @@ def fixed_shape_tensor(DataType value_type, shape, dim_names=None, permutation=N return out +def bool8(): + """ + Create instance of bool8 extension type. + + Examples + -------- + Create an instance of bool8 extension type: + + >>> import pyarrow as pa + >>> type = pa.bool8() + >>> type + Bool8Type(extension) + + Inspect the data type: + + >>> type.storage_type + DataType(int8) + + Create a table with a bool8 array: + + >>> arr = [-1, 0, 1, 2, None] + >>> storage = pa.array(arr, pa.int8()) + >>> other = pa.ExtensionArray.from_storage(type, storage) + >>> pa.table([other], names=["unknown_col"]) + pyarrow.Table + unknown_col: extension + ---- + unknown_col: [[-1,0,1,2,null]] + + Returns + ------- + type : Bool8Type + """ + + cdef Bool8Type out = Bool8Type.__new__(Bool8Type) + + c_type = GetResultValue(CBool8Type.Make()) + + out.init(c_type) + + return out + + def opaque(DataType storage_type, str type_name not None, str vendor_name not None): """ Create instance of opaque extension type. From 27c22389579dd773d9701f5d3c743bbfca3bdb8e Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 21 Aug 2024 14:38:12 +0900 Subject: [PATCH 002/186] MINOR: [Java] Bump org.codehaus.mojo:exec-maven-plugin from 3.3.0 to 3.4.1 in /java (#43692) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bumps [org.codehaus.mojo:exec-maven-plugin](https://github.com/mojohaus/exec-maven-plugin) from 3.3.0 to 3.4.1.
Release notes

Sourced from org.codehaus.mojo:exec-maven-plugin's releases.

3.4.1

  • 🐛 Bug fixes, 📦 dependency updates, 👻 maintenance, 🔧 build

3.4.0

🚀 New features and improvements

  • Allow <includePluginDependencies> to be specified for the exec:exec goal (#432) @sebthom

  • 🐛 Bug fixes, 📦 dependency updates, 👻 maintenance, 🔧 build
Commits
  • 7b0be2c [maven-release-plugin] prepare release 3.4.1
  • 5ac4f80 Environment variable Path should be used as case-insensitive
  • cfb3a9f Use Maven4 enabled with GH Action
  • d0ded48 Use shared release drafter GH Action
  • 4c22954 Bump org.codehaus.mojo:mojo-parent from 84 to 85
  • a8c4f94 fix: NPE because declared MavenSession field hides field of superclass
  • a2b735f Remove redundant spotless configuration
  • 8e0e83c [maven-release-plugin] prepare for next development iteration
  • 6c4996f [maven-release-plugin] prepare release 3.4.0
  • c7ad671 Remove Log4j 1.2.x from ITs
  • Additional commits viewable in compare view

Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: David Li --- java/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/java/pom.xml b/java/pom.xml index 1524dc3257997..0f3e5760f2b82 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -504,7 +504,7 @@ under the License. org.codehaus.mojo exec-maven-plugin - 3.3.0 + 3.4.1 org.codehaus.mojo From 4af1e491df7ac22217656668b65c3e8d55f5b5ab Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 21 Aug 2024 14:56:44 +0900 Subject: [PATCH 003/186] MINOR: [Java] Bump io.grpc:grpc-bom from 1.65.0 to 1.66.0 in /java (#43657) Bumps [io.grpc:grpc-bom](https://github.com/grpc/grpc-java) from 1.65.0 to 1.66.0.
Release notes

Sourced from io.grpc:grpc-bom's releases.

v1.65.1

What's Changed

  • netty: Restore old behavior of NettyAdaptiveCumulator, but avoid using that class if Netty is on version 4.1.111 or later
Commits
  • cf78406 Bump version to 1.66.0
  • 33af0a7 Update README etc to reference 1.66.0
  • 19c9b99 xds: XdsClient should unsubscribe on last resource (#11264)
  • 752a045 Revert "Start 1.67.0 development cycle (#11416)" (#11428)
  • ef09d94 Revert "Introduce onResult2 in NameResolver Listener2 that returns Status (#1...
  • c37fb18 Start 1.67.0 development cycle
  • 9ba2f9d Introduce onResult2 in NameResolver Listener2 that returns Status (#11313)
  • 786523d xds: WRR rr_fallback should trigger with one endpoint weight
  • b108ed3 api: Give instruments a toString() including their name
  • eb4cdf7 Update MAINTAINERS.md (#11241)
  • Additional commits viewable in compare view

Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: David Li --- java/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/java/pom.xml b/java/pom.xml index 0f3e5760f2b82..a73453df68fd2 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -97,7 +97,7 @@ under the License. 2.0.13 33.2.1-jre 4.1.112.Final - 1.65.0 + 1.66.0 3.25.4 2.17.2 3.4.0 From 9fc03015463a8f1cb616b088342b104fbc767a0c Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 21 Aug 2024 09:22:53 +0200 Subject: [PATCH 004/186] GH-43069: [Python] Use Py_IsFinalizing from pythoncapi_compat.h (#43767) ### Rationale for this change https://github.com/apache/arrow/pull/43540 already vendored `pythoncapi_compat.h`, so closing https://github.com/apache/arrow/issues/43069 by using this as well for `Py_IsFinalizing` (which was added in https://github.com/apache/arrow/pull/42034, and for which we opened that follow-up issue to use `pythoncapi_compat.h` instead) Authored-by: Joris Van den Bossche Signed-off-by: Joris Van den Bossche --- python/pyarrow/src/arrow/python/udf.cc | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/python/pyarrow/src/arrow/python/udf.cc b/python/pyarrow/src/arrow/python/udf.cc index 2c1e97c3ea03d..74f16899c47eb 100644 --- a/python/pyarrow/src/arrow/python/udf.cc +++ b/python/pyarrow/src/arrow/python/udf.cc @@ -24,14 +24,11 @@ #include "arrow/compute/kernel.h" #include "arrow/compute/row/grouper.h" #include "arrow/python/common.h" +#include "arrow/python/vendored/pythoncapi_compat.h" #include "arrow/table.h" #include "arrow/util/checked_cast.h" #include "arrow/util/logging.h" -// Py_IsFinalizing added in Python 3.13.0a4 -#if PY_VERSION_HEX < 0x030D00A4 -#define Py_IsFinalizing() _Py_IsFinalizing() -#endif namespace arrow { using compute::ExecSpan; using compute::Grouper; From e1e7c501019ac26c896d61fa0c129eee83da9b55 Mon Sep 17 00:00:00 2001 From: Oliver Layer Date: Wed, 21 Aug 2024 13:22:57 +0200 Subject: [PATCH 005/186] GH-40036: [C++] Azure file system write buffering & async writes (#43096) ### Rationale for this change See #40036. ### What changes are included in this PR? Write buffering and async writes (similar to what the S3 file system does) in the `ObjectAppendStream` for the Azure file system. With write buffering and async writes, the input scenario creation runtime in the tests (which uses the `ObjectAppendStream` against Azurite) decreased from ~25s (see [here](https://github.com/apache/arrow/issues/40036)) to ~800ms: ``` [ RUN ] TestAzuriteFileSystem.OpenInputFileMixedReadVsReadAt [ OK ] TestAzuriteFileSystem.OpenInputFileMixedReadVsReadAt (787 ms) ``` ### Are these changes tested? Added some tests with background writes enabled and disabled (some were taken from the S3 tests). Everything changed should be covered. ### Are there any user-facing changes? `AzureOptions` now allows for `background_writes` to be set (default: true). No breaking changes. ### Notes - The code in `DoWrite` is very similar to [the code in the S3 FS](https://github.com/apache/arrow/blob/edfa343eeca008513f0300924380e1b187cc976b/cpp/src/arrow/filesystem/s3fs.cc#L1753). Maybe this could be unified? I didn't see this in the scope of the PR though. 
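A minimal sketch of opting out of the new buffering behavior, using only the `AzureOptions` field added in this PR (illustrative, not code from the diff):

```cpp
// Assumes the AzureOptions::background_writes member introduced in azurefs.h.
arrow::fs::AzureOptions options;
options.background_writes = false;  // stage each block synchronously in DoWrite
// The same switch is exposed as a URI query parameter, as exercised by the tests:
// "abfs://account:password@127.0.0.1:10000/container/dir/blob?background_writes=false"
```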
* GitHub Issue: #40036 Lead-authored-by: Oliver Layer Co-authored-by: Antoine Pitrou Signed-off-by: Antoine Pitrou --- cpp/src/arrow/filesystem/azurefs.cc | 276 ++++++++++++++++++++--- cpp/src/arrow/filesystem/azurefs.h | 3 + cpp/src/arrow/filesystem/azurefs_test.cc | 264 ++++++++++++++++++---- 3 files changed, 471 insertions(+), 72 deletions(-) diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index 9b3c0c0c1d703..0bad856339729 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -22,6 +22,7 @@ #include "arrow/filesystem/azurefs.h" #include "arrow/filesystem/azurefs_internal.h" +#include "arrow/io/memory.h" // idenfity.hpp triggers -Wattributes warnings cause -Werror builds to fail, // so disable it for this file with pragmas. @@ -144,6 +145,9 @@ Status AzureOptions::ExtractFromUriQuery(const Uri& uri) { blob_storage_scheme = "http"; dfs_storage_scheme = "http"; } + } else if (kv.first == "background_writes") { + ARROW_ASSIGN_OR_RAISE(background_writes, + ::arrow::internal::ParseBoolean(kv.second)); } else { return Status::Invalid( "Unexpected query parameter in Azure Blob File System URI: '", kv.first, "'"); @@ -937,8 +941,8 @@ Status CommitBlockList(std::shared_ptr block_bl const std::vector& block_ids, const Blobs::CommitBlockListOptions& options) { try { - // CommitBlockList puts all block_ids in the latest element. That means in the case of - // overlapping block_ids the newly staged block ids will always replace the + // CommitBlockList puts all block_ids in the latest element. That means in the case + // of overlapping block_ids the newly staged block ids will always replace the // previously committed blocks. // https://learn.microsoft.com/en-us/rest/api/storageservices/put-block-list?tabs=microsoft-entra-id#request-body block_blob_client->CommitBlockList(block_ids, options); @@ -950,7 +954,34 @@ Status CommitBlockList(std::shared_ptr block_bl return Status::OK(); } +Status StageBlock(Blobs::BlockBlobClient* block_blob_client, const std::string& id, + Core::IO::MemoryBodyStream& content) { + try { + block_blob_client->StageBlock(id, content); + } catch (const Storage::StorageException& exception) { + return ExceptionToStatus( + exception, "StageBlock failed for '", block_blob_client->GetUrl(), + "' new_block_id: '", id, + "'. Staging new blocks is fundamental to streaming writes to blob storage."); + } + + return Status::OK(); +} + +/// Writes will be buffered up to this size (in bytes) before actually uploading them. +static constexpr int64_t kBlockUploadSizeBytes = 10 * 1024 * 1024; +/// The maximum size of a block in Azure Blob (as per docs). +static constexpr int64_t kMaxBlockSizeBytes = 4UL * 1024 * 1024 * 1024; + +/// This output stream, similar to other arrow OutputStreams, is not thread-safe. 
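+///
+/// Writes are buffered in memory up to kBlockUploadSizeBytes before being
+/// staged as a block. With AzureOptions::background_writes enabled, staging
+/// is submitted to the I/O context's executor, and Flush()/Close() wait for
+/// all pending blocks before committing the block list.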
class ObjectAppendStream final : public io::OutputStream { + private: + struct UploadState; + + std::shared_ptr Self() { + return std::dynamic_pointer_cast(shared_from_this()); + } + public: ObjectAppendStream(std::shared_ptr block_blob_client, const io::IOContext& io_context, const AzureLocation& location, @@ -958,7 +989,8 @@ class ObjectAppendStream final : public io::OutputStream { const AzureOptions& options) : block_blob_client_(std::move(block_blob_client)), io_context_(io_context), - location_(location) { + location_(location), + background_writes_(options.background_writes) { if (metadata && metadata->size() != 0) { ArrowMetadataToCommitBlockListOptions(metadata, commit_block_list_options_); } else if (options.default_metadata && options.default_metadata->size() != 0) { @@ -1008,10 +1040,13 @@ class ObjectAppendStream final : public io::OutputStream { content_length_ = 0; } } + + upload_state_ = std::make_shared(); + if (content_length_ > 0) { ARROW_ASSIGN_OR_RAISE(auto block_list, GetBlockList(block_blob_client_)); for (auto block : block_list.CommittedBlocks) { - block_ids_.push_back(block.Name); + upload_state_->block_ids.push_back(block.Name); } } initialised_ = true; @@ -1031,12 +1066,34 @@ class ObjectAppendStream final : public io::OutputStream { if (closed_) { return Status::OK(); } + + if (current_block_) { + // Upload remaining buffer + RETURN_NOT_OK(AppendCurrentBlock()); + } + RETURN_NOT_OK(Flush()); block_blob_client_ = nullptr; closed_ = true; return Status::OK(); } + Future<> CloseAsync() override { + if (closed_) { + return Status::OK(); + } + + if (current_block_) { + // Upload remaining buffer + RETURN_NOT_OK(AppendCurrentBlock()); + } + + return FlushAsync().Then([self = Self()]() { + self->block_blob_client_ = nullptr; + self->closed_ = true; + }); + } + bool closed() const override { return closed_; } Status CheckClosed(const char* action) const { @@ -1052,11 +1109,11 @@ class ObjectAppendStream final : public io::OutputStream { } Status Write(const std::shared_ptr& buffer) override { - return DoAppend(buffer->data(), buffer->size(), buffer); + return DoWrite(buffer->data(), buffer->size(), buffer); } Status Write(const void* data, int64_t nbytes) override { - return DoAppend(data, nbytes); + return DoWrite(data, nbytes); } Status Flush() override { @@ -1066,20 +1123,111 @@ class ObjectAppendStream final : public io::OutputStream { // flush. This also avoids some unhandled errors when flushing in the destructor. return Status::OK(); } - return CommitBlockList(block_blob_client_, block_ids_, commit_block_list_options_); + + Future<> pending_blocks_completed; + { + std::unique_lock lock(upload_state_->mutex); + pending_blocks_completed = upload_state_->pending_blocks_completed; + } + + RETURN_NOT_OK(pending_blocks_completed.status()); + std::unique_lock lock(upload_state_->mutex); + return CommitBlockList(block_blob_client_, upload_state_->block_ids, + commit_block_list_options_); } - private: - Status DoAppend(const void* data, int64_t nbytes, - std::shared_ptr owned_buffer = nullptr) { - RETURN_NOT_OK(CheckClosed("append")); - auto append_data = reinterpret_cast(data); - Core::IO::MemoryBodyStream block_content(append_data, nbytes); - if (block_content.Length() == 0) { + Future<> FlushAsync() { + RETURN_NOT_OK(CheckClosed("flush async")); + if (!initialised_) { + // If the stream has not been successfully initialized then there is nothing to + // flush. This also avoids some unhandled errors when flushing in the destructor. 
return Status::OK(); } - const auto n_block_ids = block_ids_.size(); + Future<> pending_blocks_completed; + { + std::unique_lock lock(upload_state_->mutex); + pending_blocks_completed = upload_state_->pending_blocks_completed; + } + + return pending_blocks_completed.Then([self = Self()] { + std::unique_lock lock(self->upload_state_->mutex); + return CommitBlockList(self->block_blob_client_, self->upload_state_->block_ids, + self->commit_block_list_options_); + }); + } + + private: + Status AppendCurrentBlock() { + ARROW_ASSIGN_OR_RAISE(auto buf, current_block_->Finish()); + current_block_.reset(); + current_block_size_ = 0; + return AppendBlock(buf); + } + + Status DoWrite(const void* data, int64_t nbytes, + std::shared_ptr owned_buffer = nullptr) { + if (closed_) { + return Status::Invalid("Operation on closed stream"); + } + + const auto* data_ptr = reinterpret_cast(data); + auto advance_ptr = [this, &data_ptr, &nbytes](const int64_t offset) { + data_ptr += offset; + nbytes -= offset; + pos_ += offset; + content_length_ += offset; + }; + + // Handle case where we have some bytes buffered from prior calls. + if (current_block_size_ > 0) { + // Try to fill current buffer + const int64_t to_copy = + std::min(nbytes, kBlockUploadSizeBytes - current_block_size_); + RETURN_NOT_OK(current_block_->Write(data_ptr, to_copy)); + current_block_size_ += to_copy; + advance_ptr(to_copy); + + // If buffer isn't full, break + if (current_block_size_ < kBlockUploadSizeBytes) { + return Status::OK(); + } + + // Upload current buffer + RETURN_NOT_OK(AppendCurrentBlock()); + } + + // We can upload chunks without copying them into a buffer + while (nbytes >= kBlockUploadSizeBytes) { + const auto upload_size = std::min(nbytes, kMaxBlockSizeBytes); + RETURN_NOT_OK(AppendBlock(data_ptr, upload_size)); + advance_ptr(upload_size); + } + + // Buffer remaining bytes + if (nbytes > 0) { + current_block_size_ = nbytes; + + if (current_block_ == nullptr) { + ARROW_ASSIGN_OR_RAISE( + current_block_, + io::BufferOutputStream::Create(kBlockUploadSizeBytes, io_context_.pool())); + } else { + // Re-use the allocation from before. + RETURN_NOT_OK(current_block_->Reset(kBlockUploadSizeBytes, io_context_.pool())); + } + + RETURN_NOT_OK(current_block_->Write(data_ptr, current_block_size_)); + pos_ += current_block_size_; + content_length_ += current_block_size_; + } + + return Status::OK(); + } + + std::string CreateBlock() { + std::unique_lock lock(upload_state_->mutex); + const auto n_block_ids = upload_state_->block_ids.size(); // New block ID must always be distinct from the existing block IDs. Otherwise we // will accidentally replace the content of existing blocks, causing corruption. @@ -1093,36 +1241,106 @@ class ObjectAppendStream final : public io::OutputStream { new_block_id.insert(0, required_padding_digits, '0'); // There is a small risk when appending to a blob created by another client that // `new_block_id` may overlapping with an existing block id. Adding the `-arrow` - // suffix significantly reduces the risk, but does not 100% eliminate it. For example - // if the blob was previously created with one block, with id `00001-arrow` then the - // next block we append will conflict with that, and cause corruption. + // suffix significantly reduces the risk, but does not 100% eliminate it. For + // example if the blob was previously created with one block, with id `00001-arrow` + // then the next block we append will conflict with that, and cause corruption. 
new_block_id += "-arrow"; new_block_id = Core::Convert::Base64Encode( std::vector(new_block_id.begin(), new_block_id.end())); - try { - block_blob_client_->StageBlock(new_block_id, block_content); - } catch (const Storage::StorageException& exception) { - return ExceptionToStatus( - exception, "StageBlock failed for '", block_blob_client_->GetUrl(), - "' new_block_id: '", new_block_id, - "'. Staging new blocks is fundamental to streaming writes to blob storage."); + upload_state_->block_ids.push_back(new_block_id); + + // We only use the future if we have background writes enabled. Without background + // writes the future is initialized as finished and not mutated any more. + if (background_writes_ && upload_state_->blocks_in_progress++ == 0) { + upload_state_->pending_blocks_completed = Future<>::Make(); } - block_ids_.push_back(new_block_id); - pos_ += nbytes; - content_length_ += nbytes; + + return new_block_id; + } + + Status AppendBlock(const void* data, int64_t nbytes, + std::shared_ptr owned_buffer = nullptr) { + RETURN_NOT_OK(CheckClosed("append")); + + if (nbytes == 0) { + return Status::OK(); + } + + const auto block_id = CreateBlock(); + + if (background_writes_) { + if (owned_buffer == nullptr) { + ARROW_ASSIGN_OR_RAISE(owned_buffer, AllocateBuffer(nbytes, io_context_.pool())); + memcpy(owned_buffer->mutable_data(), data, nbytes); + } else { + DCHECK_EQ(data, owned_buffer->data()); + DCHECK_EQ(nbytes, owned_buffer->size()); + } + + // The closure keeps the buffer and the upload state alive + auto deferred = [owned_buffer, block_id, block_blob_client = block_blob_client_, + state = upload_state_]() mutable -> Status { + Core::IO::MemoryBodyStream block_content(owned_buffer->data(), + owned_buffer->size()); + + auto status = StageBlock(block_blob_client.get(), block_id, block_content); + HandleUploadOutcome(state, status); + return Status::OK(); + }; + RETURN_NOT_OK(io::internal::SubmitIO(io_context_, std::move(deferred))); + } else { + auto append_data = reinterpret_cast(data); + Core::IO::MemoryBodyStream block_content(append_data, nbytes); + + RETURN_NOT_OK(StageBlock(block_blob_client_.get(), block_id, block_content)); + } + return Status::OK(); } + Status AppendBlock(std::shared_ptr buffer) { + return AppendBlock(buffer->data(), buffer->size(), buffer); + } + + static void HandleUploadOutcome(const std::shared_ptr& state, + const Status& status) { + std::unique_lock lock(state->mutex); + if (!status.ok()) { + state->status &= status; + } + // Notify completion + if (--state->blocks_in_progress == 0) { + auto fut = state->pending_blocks_completed; + lock.unlock(); + fut.MarkFinished(state->status); + } + } + std::shared_ptr block_blob_client_; const io::IOContext io_context_; const AzureLocation location_; + const bool background_writes_; int64_t content_length_ = kNoSize; + std::shared_ptr current_block_; + int64_t current_block_size_ = 0; + bool closed_ = false; bool initialised_ = false; int64_t pos_ = 0; - std::vector block_ids_; + + // This struct is kept alive through background writes to avoid problems + // in the completion handler. 
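+  // Each deferred upload closure captures a shared_ptr to this state, so the
+  // state outlives the stream if the stream is destroyed mid-upload.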
+ struct UploadState { + std::mutex mutex; + std::vector block_ids; + int64_t blocks_in_progress = 0; + Status status; + Future<> pending_blocks_completed = Future<>::MakeFinished(Status::OK()); + }; + std::shared_ptr upload_state_; + Blobs::CommitBlockListOptions commit_block_list_options_; }; diff --git a/cpp/src/arrow/filesystem/azurefs.h b/cpp/src/arrow/filesystem/azurefs.h index 072b061eeb2a9..ebbe00c4ee784 100644 --- a/cpp/src/arrow/filesystem/azurefs.h +++ b/cpp/src/arrow/filesystem/azurefs.h @@ -112,6 +112,9 @@ struct ARROW_EXPORT AzureOptions { /// This will be ignored if non-empty metadata is passed to OpenOutputStream. std::shared_ptr default_metadata; + /// Whether OutputStream writes will be issued in the background, without blocking. + bool background_writes = true; + private: enum class CredentialKind { kDefault, diff --git a/cpp/src/arrow/filesystem/azurefs_test.cc b/cpp/src/arrow/filesystem/azurefs_test.cc index 5ff241b17ff58..9d437d1f83aac 100644 --- a/cpp/src/arrow/filesystem/azurefs_test.cc +++ b/cpp/src/arrow/filesystem/azurefs_test.cc @@ -39,6 +39,7 @@ #include #include #include +#include #include #include @@ -53,6 +54,7 @@ #include "arrow/status.h" #include "arrow/testing/gtest_util.h" #include "arrow/testing/util.h" +#include "arrow/util/future.h" #include "arrow/util/io_util.h" #include "arrow/util/key_value_metadata.h" #include "arrow/util/logging.h" @@ -566,6 +568,7 @@ class TestAzureOptions : public ::testing::Test { ASSERT_EQ(options.dfs_storage_scheme, default_options.dfs_storage_scheme); ASSERT_EQ(options.credential_kind_, AzureOptions::CredentialKind::kDefault); ASSERT_EQ(path, "container/dir/blob"); + ASSERT_EQ(options.background_writes, true); } void TestFromUriDfsStorage() { @@ -582,6 +585,7 @@ class TestAzureOptions : public ::testing::Test { ASSERT_EQ(options.dfs_storage_scheme, default_options.dfs_storage_scheme); ASSERT_EQ(options.credential_kind_, AzureOptions::CredentialKind::kDefault); ASSERT_EQ(path, "file_system/dir/file"); + ASSERT_EQ(options.background_writes, true); } void TestFromUriAbfs() { @@ -597,6 +601,7 @@ class TestAzureOptions : public ::testing::Test { ASSERT_EQ(options.dfs_storage_scheme, "https"); ASSERT_EQ(options.credential_kind_, AzureOptions::CredentialKind::kStorageSharedKey); ASSERT_EQ(path, "container/dir/blob"); + ASSERT_EQ(options.background_writes, true); } void TestFromUriAbfss() { @@ -612,6 +617,7 @@ class TestAzureOptions : public ::testing::Test { ASSERT_EQ(options.dfs_storage_scheme, "https"); ASSERT_EQ(options.credential_kind_, AzureOptions::CredentialKind::kStorageSharedKey); ASSERT_EQ(path, "container/dir/blob"); + ASSERT_EQ(options.background_writes, true); } void TestFromUriEnableTls() { @@ -628,6 +634,17 @@ class TestAzureOptions : public ::testing::Test { ASSERT_EQ(options.dfs_storage_scheme, "http"); ASSERT_EQ(options.credential_kind_, AzureOptions::CredentialKind::kStorageSharedKey); ASSERT_EQ(path, "container/dir/blob"); + ASSERT_EQ(options.background_writes, true); + } + + void TestFromUriDisableBackgroundWrites() { + std::string path; + ASSERT_OK_AND_ASSIGN(auto options, + AzureOptions::FromUri( + "abfs://account:password@127.0.0.1:10000/container/dir/blob?" 
+ "background_writes=false", + &path)); + ASSERT_EQ(options.background_writes, false); } void TestFromUriCredentialDefault() { @@ -773,6 +790,9 @@ TEST_F(TestAzureOptions, FromUriDfsStorage) { TestFromUriDfsStorage(); } TEST_F(TestAzureOptions, FromUriAbfs) { TestFromUriAbfs(); } TEST_F(TestAzureOptions, FromUriAbfss) { TestFromUriAbfss(); } TEST_F(TestAzureOptions, FromUriEnableTls) { TestFromUriEnableTls(); } +TEST_F(TestAzureOptions, FromUriDisableBackgroundWrites) { + TestFromUriDisableBackgroundWrites(); +} TEST_F(TestAzureOptions, FromUriCredentialDefault) { TestFromUriCredentialDefault(); } TEST_F(TestAzureOptions, FromUriCredentialAnonymous) { TestFromUriCredentialAnonymous(); } TEST_F(TestAzureOptions, FromUriCredentialStorageSharedKey) { @@ -929,8 +949,9 @@ class TestAzureFileSystem : public ::testing::Test { void UploadLines(const std::vector& lines, const std::string& path, int total_size) { ASSERT_OK_AND_ASSIGN(auto output, fs()->OpenOutputStream(path, {})); - const auto all_lines = std::accumulate(lines.begin(), lines.end(), std::string("")); - ASSERT_OK(output->Write(all_lines)); + for (auto const& line : lines) { + ASSERT_OK(output->Write(line.data(), line.size())); + } ASSERT_OK(output->Close()); } @@ -1474,6 +1495,162 @@ class TestAzureFileSystem : public ::testing::Test { arrow::fs::AssertFileInfo(fs(), data.Path("dir/file0"), FileType::File); } + void AssertObjectContents(AzureFileSystem* fs, std::string_view path, + std::string_view expected) { + ASSERT_OK_AND_ASSIGN(auto input, fs->OpenInputStream(std::string{path})); + std::string contents; + std::shared_ptr buffer; + do { + ASSERT_OK_AND_ASSIGN(buffer, input->Read(128 * 1024)); + contents.append(buffer->ToString()); + } while (buffer->size() != 0); + + EXPECT_EQ(expected, contents); + } + + void TestOpenOutputStreamSmall() { + ASSERT_OK_AND_ASSIGN(auto fs, AzureFileSystem::Make(options_)); + + auto data = SetUpPreexistingData(); + const auto path = data.ContainerPath("test-write-object"); + ASSERT_OK_AND_ASSIGN(auto output, fs->OpenOutputStream(path, {})); + const std::string_view expected(PreexistingData::kLoremIpsum); + ASSERT_OK(output->Write(expected)); + ASSERT_OK(output->Close()); + + // Verify we can read the object back. 
+ AssertObjectContents(fs.get(), path, expected); + } + + void TestOpenOutputStreamLarge() { + ASSERT_OK_AND_ASSIGN(auto fs, AzureFileSystem::Make(options_)); + + auto data = SetUpPreexistingData(); + const auto path = data.ContainerPath("test-write-object"); + ASSERT_OK_AND_ASSIGN(auto output, fs->OpenOutputStream(path, {})); + + // Upload 5 MB, 4 MB und 2 MB and a very small write to test varying sizes + std::vector sizes{5 * 1024 * 1024, 4 * 1024 * 1024, 2 * 1024 * 1024, + 2000}; + + std::vector buffers{}; + char current_char = 'A'; + for (const auto size : sizes) { + buffers.emplace_back(size, current_char++); + } + + auto expected_size = std::int64_t{0}; + for (size_t i = 0; i < buffers.size(); ++i) { + ASSERT_OK(output->Write(buffers[i])); + expected_size += sizes[i]; + ASSERT_EQ(expected_size, output->Tell()); + } + ASSERT_OK(output->Close()); + + AssertObjectContents(fs.get(), path, + buffers[0] + buffers[1] + buffers[2] + buffers[3]); + } + + void TestOpenOutputStreamLargeSingleWrite() { + ASSERT_OK_AND_ASSIGN(auto fs, AzureFileSystem::Make(options_)); + + auto data = SetUpPreexistingData(); + const auto path = data.ContainerPath("test-write-object"); + ASSERT_OK_AND_ASSIGN(auto output, fs->OpenOutputStream(path, {})); + + constexpr std::int64_t size{12 * 1024 * 1024}; + const std::string large_string(size, 'X'); + + ASSERT_OK(output->Write(large_string)); + ASSERT_EQ(size, output->Tell()); + ASSERT_OK(output->Close()); + + AssertObjectContents(fs.get(), path, large_string); + } + + void TestOpenOutputStreamCloseAsync() { +#if defined(ADDRESS_SANITIZER) || defined(ARROW_VALGRIND) + // This false positive leak is similar to the one pinpointed in the + // have_false_positive_memory_leak_with_generator() comments above, + // though the stack trace is different. It happens when a block list + // is committed from a background thread. + // + // clang-format off + // Direct leak of 968 byte(s) in 1 object(s) allocated from: + // #0 calloc + // #1 (/lib/x86_64-linux-gnu/libxml2.so.2+0xe25a4) + // #2 __xmlDefaultBufferSize + // #3 xmlBufferCreate + // #4 Azure::Storage::_internal::XmlWriter::XmlWriter() + // #5 Azure::Storage::Blobs::_detail::BlockBlobClient::CommitBlockList + // #6 Azure::Storage::Blobs::BlockBlobClient::CommitBlockList + // #7 arrow::fs::(anonymous namespace)::CommitBlockList + // #8 arrow::fs::(anonymous namespace)::ObjectAppendStream::FlushAsync()::'lambda' + // clang-format on + // + // TODO perhaps remove this skip once we can rely on + // https://github.com/Azure/azure-sdk-for-cpp/pull/5767 + // + // Also note that ClickHouse has a workaround for a similar issue: + // https://github.com/ClickHouse/ClickHouse/pull/45796 + if (options_.background_writes) { + GTEST_SKIP() << "False positive memory leak in libxml2 with CloseAsync"; + } +#endif + ASSERT_OK_AND_ASSIGN(auto fs, AzureFileSystem::Make(options_)); + auto data = SetUpPreexistingData(); + const std::string path = data.ContainerPath("test-write-object"); + constexpr auto payload = PreexistingData::kLoremIpsum; + + ASSERT_OK_AND_ASSIGN(auto stream, fs->OpenOutputStream(path)); + ASSERT_OK(stream->Write(payload)); + auto close_fut = stream->CloseAsync(); + + ASSERT_OK(close_fut.MoveResult()); + + AssertObjectContents(fs.get(), path, payload); + } + + void TestOpenOutputStreamCloseAsyncDestructor() { +#if defined(ADDRESS_SANITIZER) || defined(ARROW_VALGRIND) + // See above. 
+ if (options_.background_writes) { + GTEST_SKIP() << "False positive memory leak in libxml2 with CloseAsync"; + } +#endif + ASSERT_OK_AND_ASSIGN(auto fs, AzureFileSystem::Make(options_)); + auto data = SetUpPreexistingData(); + const std::string path = data.ContainerPath("test-write-object"); + constexpr auto payload = PreexistingData::kLoremIpsum; + + ASSERT_OK_AND_ASSIGN(auto stream, fs->OpenOutputStream(path)); + ASSERT_OK(stream->Write(payload)); + // Destructor implicitly closes stream and completes the upload. + // Testing it doesn't matter whether flush is triggered asynchronously + // after CloseAsync or synchronously after stream.reset() since we're just + // checking that the future keeps the stream alive until completion + // rather than segfaulting on a dangling stream. + auto close_fut = stream->CloseAsync(); + stream.reset(); + ASSERT_OK(close_fut.MoveResult()); + + AssertObjectContents(fs.get(), path, payload); + } + + void TestOpenOutputStreamDestructor() { + ASSERT_OK_AND_ASSIGN(auto fs, AzureFileSystem::Make(options_)); + constexpr auto* payload = "new data"; + auto data = SetUpPreexistingData(); + const std::string path = data.ContainerPath("test-write-object"); + + ASSERT_OK_AND_ASSIGN(auto stream, fs->OpenOutputStream(path)); + ASSERT_OK(stream->Write(payload)); + // Destructor implicitly closes stream and completes the multipart upload. + stream.reset(); + + AssertObjectContents(fs.get(), path, payload); + } + private: using StringMatcher = ::testing::PolymorphicMatcher<::testing::internal::HasSubstrMatcher>; @@ -2704,53 +2881,27 @@ TEST_F(TestAzuriteFileSystem, WriteMetadataHttpHeaders) { ASSERT_EQ("text/plain", content_type); } -TEST_F(TestAzuriteFileSystem, OpenOutputStreamSmall) { - auto data = SetUpPreexistingData(); - const auto path = data.ContainerPath("test-write-object"); - ASSERT_OK_AND_ASSIGN(auto output, fs()->OpenOutputStream(path, {})); - const std::string_view expected(PreexistingData::kLoremIpsum); - ASSERT_OK(output->Write(expected)); - ASSERT_OK(output->Close()); - - // Verify we can read the object back. - ASSERT_OK_AND_ASSIGN(auto input, fs()->OpenInputStream(path)); +TEST_F(TestAzuriteFileSystem, OpenOutputStreamSmallNoBackgroundWrites) { + options_.background_writes = false; + TestOpenOutputStreamSmall(); +} - std::array inbuf{}; - ASSERT_OK_AND_ASSIGN(auto size, input->Read(inbuf.size(), inbuf.data())); +TEST_F(TestAzuriteFileSystem, OpenOutputStreamSmall) { TestOpenOutputStreamSmall(); } - EXPECT_EQ(expected, std::string_view(inbuf.data(), size)); +TEST_F(TestAzuriteFileSystem, OpenOutputStreamLargeNoBackgroundWrites) { + options_.background_writes = false; + TestOpenOutputStreamLarge(); } -TEST_F(TestAzuriteFileSystem, OpenOutputStreamLarge) { - auto data = SetUpPreexistingData(); - const auto path = data.ContainerPath("test-write-object"); - ASSERT_OK_AND_ASSIGN(auto output, fs()->OpenOutputStream(path, {})); - std::array sizes{257 * 1024, 258 * 1024, 259 * 1024}; - std::array buffers{ - std::string(sizes[0], 'A'), - std::string(sizes[1], 'B'), - std::string(sizes[2], 'C'), - }; - auto expected = std::int64_t{0}; - for (auto i = 0; i != 3; ++i) { - ASSERT_OK(output->Write(buffers[i])); - expected += sizes[i]; - ASSERT_EQ(expected, output->Tell()); - } - ASSERT_OK(output->Close()); - - // Verify we can read the object back. 
- ASSERT_OK_AND_ASSIGN(auto input, fs()->OpenInputStream(path)); +TEST_F(TestAzuriteFileSystem, OpenOutputStreamLarge) { TestOpenOutputStreamLarge(); } - std::string contents; - std::shared_ptr buffer; - do { - ASSERT_OK_AND_ASSIGN(buffer, input->Read(128 * 1024)); - ASSERT_TRUE(buffer); - contents.append(buffer->ToString()); - } while (buffer->size() != 0); +TEST_F(TestAzuriteFileSystem, OpenOutputStreamLargeSingleWriteNoBackgroundWrites) { + options_.background_writes = false; + TestOpenOutputStreamLargeSingleWrite(); +} - EXPECT_EQ(contents, buffers[0] + buffers[1] + buffers[2]); +TEST_F(TestAzuriteFileSystem, OpenOutputStreamLargeSingleWrite) { + TestOpenOutputStreamLargeSingleWrite(); } TEST_F(TestAzuriteFileSystem, OpenOutputStreamTruncatesExistingFile) { @@ -2820,6 +2971,33 @@ TEST_F(TestAzuriteFileSystem, OpenOutputStreamClosed) { ASSERT_RAISES(Invalid, output->Tell()); } +TEST_F(TestAzuriteFileSystem, OpenOutputStreamCloseAsync) { + TestOpenOutputStreamCloseAsync(); +} + +TEST_F(TestAzuriteFileSystem, OpenOutputStreamCloseAsyncNoBackgroundWrites) { + options_.background_writes = false; + TestOpenOutputStreamCloseAsync(); +} + +TEST_F(TestAzuriteFileSystem, OpenOutputStreamAsyncDestructor) { + TestOpenOutputStreamCloseAsyncDestructor(); +} + +TEST_F(TestAzuriteFileSystem, OpenOutputStreamAsyncDestructorNoBackgroundWrites) { + options_.background_writes = false; + TestOpenOutputStreamCloseAsyncDestructor(); +} + +TEST_F(TestAzuriteFileSystem, OpenOutputStreamDestructor) { + TestOpenOutputStreamDestructor(); +} + +TEST_F(TestAzuriteFileSystem, OpenOutputStreamDestructorNoBackgroundWrites) { + options_.background_writes = false; + TestOpenOutputStreamDestructor(); +} + TEST_F(TestAzuriteFileSystem, OpenOutputStreamUri) { auto data = SetUpPreexistingData(); const auto path = data.ContainerPath("open-output-stream-uri.txt"); From ffee537d88ab6d26614e2a1e85d4d18152695020 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 21 Aug 2024 14:18:45 +0200 Subject: [PATCH 006/186] GH-42222: [Python] Add bindings for CopyTo on RecordBatch and Array classes (#42223) ### Rationale for this change We have added bindings for the Device and MemoryManager classes (https://github.com/apache/arrow/issues/41126), and as a next step we can expose the functionality to copy a full Array or RecordBatch to a specific memory manager. ### What changes are included in this PR? This adds a `copy_to` method on pyarrow Array and RecordBatch. ### Are these changes tested? Yes * GitHub Issue: #42222 Authored-by: Joris Van den Bossche Signed-off-by: Joris Van den Bossche --- python/pyarrow/array.pxi | 36 ++++++++++++ python/pyarrow/device.pxi | 6 ++ python/pyarrow/includes/libarrow.pxd | 4 ++ python/pyarrow/lib.pxd | 4 ++ python/pyarrow/table.pxi | 35 ++++++++++++ python/pyarrow/tests/test_cuda.py | 82 +++++++++++----------------- python/pyarrow/tests/test_device.py | 26 +++++++++ 7 files changed, 143 insertions(+), 50 deletions(-) diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 4c3eb93232634..77d6c9c06d2de 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -1702,6 +1702,42 @@ cdef class Array(_PandasConvertible): _append_array_buffers(self.sp_array.get().data().get(), res) return res + def copy_to(self, destination): + """ + Construct a copy of the array with all buffers on destination + device. + + This method recursively copies the array's buffers and those of its + children onto the destination MemoryManager device and returns the + new Array. 
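The new Python `copy_to` bindings are thin wrappers over the C++ `CopyTo` methods declared in the `libarrow.pxd` changes below. As a minimal C++ sketch of the underlying call, assuming a device-backed `MemoryManager` is available (the `RoundTrip` helper is hypothetical, for illustration only):

```cpp
#include <memory>

#include "arrow/array.h"
#include "arrow/device.h"
#include "arrow/result.h"

// Hypothetical helper mirroring what the Python binding does: copy an
// array's buffers to a destination device, then bring them back to the
// CPU so the result can be inspected on the host.
arrow::Result<std::shared_ptr<arrow::Array>> RoundTrip(
    const std::shared_ptr<arrow::Array>& array,
    const std::shared_ptr<arrow::MemoryManager>& destination) {
  // CopyTo recursively copies the array's buffers (and its children's)
  // onto the destination memory manager's device.
  ARROW_ASSIGN_OR_RAISE(auto on_device, array->CopyTo(destination));
  // Copy back to default CPU memory.
  return on_device->CopyTo(arrow::default_cpu_memory_manager());
}
```

Copying back through `default_cpu_memory_manager()` mirrors the round-trip assertions used in the updated CUDA tests further down in this patch.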
+
+        Parameters
+        ----------
+        destination : pyarrow.MemoryManager or pyarrow.Device
+            The destination device to copy the array to.
+
+        Returns
+        -------
+        Array
+        """
+        cdef:
+            shared_ptr[CArray] c_array
+            shared_ptr[CMemoryManager] c_memory_manager
+
+        if isinstance(destination, Device):
+            c_memory_manager = (<Device>destination).unwrap().get().default_memory_manager()
+        elif isinstance(destination, MemoryManager):
+            c_memory_manager = (<MemoryManager>destination).unwrap()
+        else:
+            raise TypeError(
+                "Argument 'destination' has incorrect type (expected a "
+                f"pyarrow Device or MemoryManager, got {type(destination)})"
+            )
+
+        with nogil:
+            c_array = GetResultValue(self.ap.CopyTo(c_memory_manager))
+        return pyarrow_wrap_array(c_array)
+
     def _export_to_c(self, out_ptr, out_schema_ptr=0):
         """
         Export to a C ArrowArray struct, given its pointer.
diff --git a/python/pyarrow/device.pxi b/python/pyarrow/device.pxi
index 6e6034752085a..26256de62093e 100644
--- a/python/pyarrow/device.pxi
+++ b/python/pyarrow/device.pxi
@@ -64,6 +64,9 @@ cdef class Device(_Weakrefable):
         self.init(device)
         return self
 
+    cdef inline shared_ptr[CDevice] unwrap(self) nogil:
+        return self.device
+
     def __eq__(self, other):
         if not isinstance(other, Device):
             return False
@@ -130,6 +133,9 @@ cdef class MemoryManager(_Weakrefable):
         self.init(mm)
         return self
 
+    cdef inline shared_ptr[CMemoryManager] unwrap(self) nogil:
+        return self.memory_manager
+
     def __repr__(self):
         return "<pyarrow.MemoryManager device: {}>".format(
             frombytes(self.memory_manager.get().device().get().ToString())
diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd
index a54a1db292f70..6f510cfc0c06c 100644
--- a/python/pyarrow/includes/libarrow.pxd
+++ b/python/pyarrow/includes/libarrow.pxd
@@ -234,7 +234,9 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
         CStatus Validate() const
         CStatus ValidateFull() const
         CResult[shared_ptr[CArray]] View(const shared_ptr[CDataType]& type)
         CDeviceAllocationType device_type()
+
+        CResult[shared_ptr[CArray]] CopyTo(const shared_ptr[CMemoryManager]& to) const
 
     shared_ptr[CArray] MakeArray(const shared_ptr[CArrayData]& data)
     CResult[shared_ptr[CArray]] MakeArrayOfNull(
@@ -1027,6 +1029,8 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
         shared_ptr[CRecordBatch] Slice(int64_t offset)
         shared_ptr[CRecordBatch] Slice(int64_t offset, int64_t length)
 
+        CResult[shared_ptr[CRecordBatch]] CopyTo(const shared_ptr[CMemoryManager]& to) const
+
         CResult[shared_ptr[CTensor]] ToTensor(c_bool null_to_nan, c_bool row_major,
                                               CMemoryPool* pool) const
 
diff --git a/python/pyarrow/lib.pxd b/python/pyarrow/lib.pxd
index e3625c1815274..a7c3b496a0045 100644
--- a/python/pyarrow/lib.pxd
+++ b/python/pyarrow/lib.pxd
@@ -542,6 +542,8 @@ cdef class Device(_Weakrefable):
     @staticmethod
     cdef wrap(const shared_ptr[CDevice]& device)
 
+    cdef inline shared_ptr[CDevice] unwrap(self) nogil
+
 
 cdef class MemoryManager(_Weakrefable):
     cdef:
@@ -552,6 +554,8 @@ cdef class MemoryManager(_Weakrefable):
     @staticmethod
     cdef wrap(const shared_ptr[CMemoryManager]& mm)
 
+    cdef inline shared_ptr[CMemoryManager] unwrap(self) nogil
+
 
 cdef class Buffer(_Weakrefable):
     cdef:
diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi
index 8f7c44e55dc8d..6d34c71c9df40 100644
--- a/python/pyarrow/table.pxi
+++ b/python/pyarrow/table.pxi
@@ -3569,6 +3569,41 @@ cdef class RecordBatch(_Tabular):
                                                    row_major, pool))
         return pyarrow_wrap_tensor(c_tensor)
 
+    def copy_to(self, destination):
+        """
+        Copy the entire RecordBatch to destination device.
+
+        This copies each column of the record batch to create
+        a new record batch where all underlying buffers for the columns have
+        been copied to the destination MemoryManager.
+
+        Parameters
+        ----------
+        destination : pyarrow.MemoryManager or pyarrow.Device
+            The destination device to copy the batch to.
+
+        Returns
+        -------
+        RecordBatch
+        """
+        cdef:
+            shared_ptr[CRecordBatch] c_batch
+            shared_ptr[CMemoryManager] c_memory_manager
+
+        if isinstance(destination, Device):
+            c_memory_manager = (<Device>destination).unwrap().get().default_memory_manager()
+        elif isinstance(destination, MemoryManager):
+            c_memory_manager = (<MemoryManager>destination).unwrap()
+        else:
+            raise TypeError(
+                "Argument 'destination' has incorrect type (expected a "
+                f"pyarrow Device or MemoryManager, got {type(destination)})"
+            )
+
+        with nogil:
+            c_batch = GetResultValue(self.batch.CopyTo(c_memory_manager))
+        return pyarrow_wrap_batch(c_batch)
+
     def _export_to_c(self, out_ptr, out_schema_ptr=0):
         """
         Export to a C ArrowArray struct, given its pointer.
diff --git a/python/pyarrow/tests/test_cuda.py b/python/pyarrow/tests/test_cuda.py
index 36b97a6206463..d55be651b1571 100644
--- a/python/pyarrow/tests/test_cuda.py
+++ b/python/pyarrow/tests/test_cuda.py
@@ -827,21 +827,29 @@ def test_IPC(size):
     assert p.exitcode == 0
 
 
-def _arr_copy_to_host(carr):
-    # TODO replace below with copy to device when exposed in python
-    buffers = []
-    for cbuf in carr.buffers():
-        if cbuf is None:
-            buffers.append(None)
-        else:
-            buf = global_context.foreign_buffer(
-                cbuf.address, cbuf.size, cbuf
-            ).copy_to_host()
-            buffers.append(buf)
-
-    child = pa.Array.from_buffers(carr.type.value_type, 3, buffers[2:])
-    new = pa.Array.from_buffers(carr.type, 2, buffers[:2], children=[child])
-    return new
+def test_copy_to():
+    _, buf = make_random_buffer(size=10, target='device')
+    mm_cuda = buf.memory_manager
+
+    for dest in [mm_cuda, mm_cuda.device]:
+        arr = pa.array([0, 1, 2])
+        arr_cuda = arr.copy_to(dest)
+        assert not arr_cuda.buffers()[1].is_cpu
+        assert arr_cuda.buffers()[1].device_type == pa.DeviceAllocationType.CUDA
+        assert arr_cuda.buffers()[1].device == mm_cuda.device
+
+        arr_roundtrip = arr_cuda.copy_to(pa.default_cpu_memory_manager())
+        assert arr_roundtrip.equals(arr)
+
+        batch = pa.record_batch({"col": arr})
+        batch_cuda = batch.copy_to(dest)
+        buf_cuda = batch_cuda["col"].buffers()[1]
+        assert not buf_cuda.is_cpu
+        assert buf_cuda.device_type == pa.DeviceAllocationType.CUDA
+        assert buf_cuda.device == mm_cuda.device
+
+        batch_roundtrip = batch_cuda.copy_to(pa.default_cpu_memory_manager())
+        assert batch_roundtrip.equals(batch)
 
 
 def test_device_interface_array():
@@ -856,19 +864,10 @@ def test_device_interface_array():
     typ = pa.list_(pa.int32())
     arr = pa.array([[1], [2, 42]], type=typ)
 
-    # TODO replace below with copy to device when exposed in python
-    cbuffers = []
-    for buf in arr.buffers():
-        if buf is None:
-            cbuffers.append(None)
-        else:
-            cbuf = global_context.new_buffer(buf.size)
-            cbuf.copy_from_host(buf, position=0, nbytes=buf.size)
-            cbuffers.append(cbuf)
-
-    carr = pa.Array.from_buffers(typ, 2, cbuffers[:2], children=[
-        pa.Array.from_buffers(typ.value_type, 3, cbuffers[2:])
-    ])
+    # copy to device
+    _, buf = make_random_buffer(size=10, target='device')
+    mm_cuda = buf.memory_manager
+    carr = arr.copy_to(mm_cuda)
 
     # Type is known up front
     carr._export_to_c_device(ptr_array)
@@ -882,7 +881,7 @@
     del carr
     carr_new = pa.Array._import_from_c_device(ptr_array, typ)
    assert carr_new.type == pa.list_(pa.int32())
- 
arr_new = _arr_copy_to_host(carr_new) + arr_new = carr_new.copy_to(pa.default_cpu_memory_manager()) assert arr_new.equals(arr) del carr_new @@ -891,15 +890,13 @@ def test_device_interface_array(): pa.Array._import_from_c_device(ptr_array, typ) # Schema is exported and imported at the same time - carr = pa.Array.from_buffers(typ, 2, cbuffers[:2], children=[ - pa.Array.from_buffers(typ.value_type, 3, cbuffers[2:]) - ]) + carr = arr.copy_to(mm_cuda) carr._export_to_c_device(ptr_array, ptr_schema) # Delete and recreate C++ objects from exported pointers del carr carr_new = pa.Array._import_from_c_device(ptr_array, ptr_schema) assert carr_new.type == pa.list_(pa.int32()) - arr_new = _arr_copy_to_host(carr_new) + arr_new = carr_new.copy_to(pa.default_cpu_memory_manager()) assert arr_new.equals(arr) del carr_new @@ -908,21 +905,6 @@ def test_device_interface_array(): pa.Array._import_from_c_device(ptr_array, ptr_schema) -def _batch_copy_to_host(cbatch): - # TODO replace below with copy to device when exposed in python - arrs = [] - for col in cbatch.columns: - buffers = [ - global_context.foreign_buffer(buf.address, buf.size, buf).copy_to_host() - if buf is not None else None - for buf in col.buffers() - ] - new = pa.Array.from_buffers(col.type, len(col), buffers) - arrs.append(new) - - return pa.RecordBatch.from_arrays(arrs, schema=cbatch.schema) - - def test_device_interface_batch_array(): cffi = pytest.importorskip("pyarrow.cffi") ffi = cffi.ffi @@ -949,7 +931,7 @@ def test_device_interface_batch_array(): del cbatch cbatch_new = pa.RecordBatch._import_from_c_device(ptr_array, schema) assert cbatch_new.schema == schema - batch_new = _batch_copy_to_host(cbatch_new) + batch_new = cbatch_new.copy_to(pa.default_cpu_memory_manager()) assert batch_new.equals(batch) del cbatch_new @@ -964,7 +946,7 @@ def test_device_interface_batch_array(): del cbatch cbatch_new = pa.RecordBatch._import_from_c_device(ptr_array, ptr_schema) assert cbatch_new.schema == schema - batch_new = _batch_copy_to_host(cbatch_new) + batch_new = cbatch_new.copy_to(pa.default_cpu_memory_manager()) assert batch_new.equals(batch) del cbatch_new diff --git a/python/pyarrow/tests/test_device.py b/python/pyarrow/tests/test_device.py index 6bdb015be1a95..dc1a51e6d0092 100644 --- a/python/pyarrow/tests/test_device.py +++ b/python/pyarrow/tests/test_device.py @@ -17,6 +17,8 @@ import pyarrow as pa +import pytest + def test_device_memory_manager(): mm = pa.default_cpu_memory_manager() @@ -41,3 +43,27 @@ def test_buffer_device(): assert buf.device.is_cpu assert buf.device == pa.default_cpu_memory_manager().device assert buf.memory_manager.is_cpu + + +def test_copy_to(): + mm = pa.default_cpu_memory_manager() + + arr = pa.array([0, 1, 2]) + batch = pa.record_batch({"col": arr}) + + for dest in [mm, mm.device]: + arr_copied = arr.copy_to(dest) + assert arr_copied.equals(arr) + assert arr_copied.buffers()[1].device == mm.device + assert arr_copied.buffers()[1].address != arr.buffers()[1].address + + batch_copied = batch.copy_to(dest) + assert batch_copied.equals(batch) + assert batch_copied["col"].buffers()[1].device == mm.device + assert batch_copied["col"].buffers()[1].address != arr.buffers()[1].address + + with pytest.raises(TypeError, match="Argument 'destination' has incorrect type"): + arr.copy_to(mm.device.device_type) + + with pytest.raises(TypeError, match="Argument 'destination' has incorrect type"): + batch.copy_to(mm.device.device_type) From f9911ee2ffc62fa946b2e1198bcdd13a757181fe Mon Sep 17 00:00:00 2001 From: Antoine Pitrou 
Date: Wed, 21 Aug 2024 14:37:47 +0200 Subject: [PATCH 007/186] GH-43776: [C++] Add chunked Take benchmarks with a small selection factor (#43772) This should help exercise the performance of chunked Take implementation on more use cases. * GitHub Issue: #43776 Authored-by: Antoine Pitrou Signed-off-by: Antoine Pitrou --- .../kernels/vector_selection_benchmark.cc | 91 ++++++++++++++++--- 1 file changed, 80 insertions(+), 11 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/vector_selection_benchmark.cc b/cpp/src/arrow/compute/kernels/vector_selection_benchmark.cc index c2a27dfe43488..75affd32560f0 100644 --- a/cpp/src/arrow/compute/kernels/vector_selection_benchmark.cc +++ b/cpp/src/arrow/compute/kernels/vector_selection_benchmark.cc @@ -17,6 +17,7 @@ #include "benchmark/benchmark.h" +#include #include #include @@ -42,6 +43,9 @@ struct FilterParams { const double filter_null_proportion; }; +constexpr double kDefaultTakeSelectionFactor = 1.0; +constexpr double kSmallTakeSelectionFactor = 0.05; + std::vector g_data_sizes = {kL2Size}; // The benchmark state parameter references this vector of cases. Test high and @@ -104,14 +108,21 @@ struct TakeBenchmark { benchmark::State& state; RegressionArgs args; random::RandomArrayGenerator rand; + double selection_factor; bool indices_have_nulls; bool monotonic_indices = false; TakeBenchmark(benchmark::State& state, bool indices_have_nulls, bool monotonic_indices = false) + : TakeBenchmark(state, /*selection_factor=*/kDefaultTakeSelectionFactor, + indices_have_nulls, monotonic_indices) {} + + TakeBenchmark(benchmark::State& state, double selection_factor, bool indices_have_nulls, + bool monotonic_indices = false) : state(state), args(state, /*size_is_bytes=*/false), rand(kSeed), + selection_factor(selection_factor), indices_have_nulls(indices_have_nulls), monotonic_indices(monotonic_indices) {} @@ -185,10 +196,10 @@ struct TakeBenchmark { } void Bench(const std::shared_ptr& values) { - double indices_null_proportion = indices_have_nulls ? args.null_proportion : 0; - auto indices = - rand.Int32(values->length(), 0, static_cast(values->length() - 1), - indices_null_proportion); + const double indices_null_proportion = indices_have_nulls ? args.null_proportion : 0; + const int64_t num_indices = static_cast(selection_factor * values->length()); + auto indices = rand.Int32(num_indices, 0, static_cast(values->length() - 1), + indices_null_proportion); if (monotonic_indices) { auto arg_sorter = *SortIndices(*indices); @@ -198,14 +209,15 @@ struct TakeBenchmark { for (auto _ : state) { ABORT_NOT_OK(Take(values, indices).status()); } - state.SetItemsProcessed(state.iterations() * values->length()); + state.SetItemsProcessed(state.iterations() * num_indices); + state.counters["selection_factor"] = selection_factor; } void BenchChunked(const std::shared_ptr& values, bool chunk_indices_too) { double indices_null_proportion = indices_have_nulls ? 
args.null_proportion : 0;
-    auto indices =
-        rand.Int32(values->length(), 0, static_cast<int32_t>(values->length() - 1),
-                   indices_null_proportion);
+    const int64_t num_indices = static_cast<int64_t>(selection_factor * values->length());
+    auto indices = rand.Int32(num_indices, 0, static_cast<int32_t>(values->length() - 1),
+                              indices_null_proportion);
 
     if (monotonic_indices) {
       auto arg_sorter = *SortIndices(*indices);
@@ -213,14 +225,26 @@
     std::shared_ptr<ChunkedArray> chunked_indices;
     if (chunk_indices_too) {
+      // Here we choose for indices chunks to have roughly the same length
+      // as values chunks, but there may be fewer of them if selection_factor < 1.0.
+      // The alternative is to have the same number of chunks, but with a potentially
+      // much smaller (and unrealistic) length.
       std::vector<std::shared_ptr<Array>> indices_chunks;
+      // Make sure there are at least two chunks of indices
+      const auto max_chunk_length = indices->length() / 2 + 1;
       int64_t offset = 0;
       for (int i = 0; i < values->num_chunks(); ++i) {
-        auto chunk = indices->Slice(offset, values->chunk(i)->length());
+        const auto chunk_length = std::min(max_chunk_length, values->chunk(i)->length());
+        auto chunk = indices->Slice(offset, chunk_length);
         indices_chunks.push_back(std::move(chunk));
-        offset += values->chunk(i)->length();
+        offset += chunk_length;
+        if (offset >= indices->length()) {
+          break;
+        }
       }
       chunked_indices = std::make_shared<ChunkedArray>(std::move(indices_chunks));
+      ARROW_CHECK_EQ(chunked_indices->length(), num_indices);
+      ARROW_CHECK_GT(chunked_indices->num_chunks(), 1);
     }
 
     if (chunk_indices_too) {
@@ -232,7 +256,8 @@
         ABORT_NOT_OK(Take(values, indices).status());
       }
     }
-    state.SetItemsProcessed(state.iterations() * values->length());
+    state.SetItemsProcessed(state.iterations() * num_indices);
+    state.counters["selection_factor"] = selection_factor;
   }
 };
 
@@ -432,12 +457,25 @@ static void TakeChunkedChunkedInt64RandomIndicesWithNulls(benchmark::State& stat
       .ChunkedInt64(/*num_chunks=*/100, /*chunk_indices_too=*/true);
 }
 
+static void TakeChunkedChunkedInt64FewRandomIndicesWithNulls(benchmark::State& state) {
+  TakeBenchmark(state, /*selection_factor=*/kSmallTakeSelectionFactor,
+                /*indices_with_nulls=*/true)
+      .ChunkedInt64(/*num_chunks=*/100, /*chunk_indices_too=*/true);
+}
+
 static void TakeChunkedChunkedInt64MonotonicIndices(benchmark::State& state) {
   TakeBenchmark(state, /*indices_with_nulls=*/false, /*monotonic=*/true)
       .ChunkedInt64(
           /*num_chunks=*/100, /*chunk_indices_too=*/true);
 }
 
+static void TakeChunkedChunkedInt64FewMonotonicIndices(benchmark::State& state) {
+  TakeBenchmark(state, /*selection_factor=*/kSmallTakeSelectionFactor,
+                /*indices_with_nulls=*/false, /*monotonic=*/true)
+      .ChunkedInt64(
+          /*num_chunks=*/100, /*chunk_indices_too=*/true);
+}
+
 static void TakeChunkedChunkedFSBRandomIndicesNoNulls(benchmark::State& state) {
   TakeBenchmark(state, /*indices_with_nulls=*/false)
       .ChunkedFSB(/*num_chunks=*/100, /*chunk_indices_too=*/true);
@@ -463,11 +501,23 @@ static void TakeChunkedChunkedStringRandomIndicesWithNulls(benchmark::State& sta
       .ChunkedString(/*num_chunks=*/100, /*chunk_indices_too=*/true);
 }
 
+static void TakeChunkedChunkedStringFewRandomIndicesWithNulls(benchmark::State& state) {
+  TakeBenchmark(state, /*selection_factor=*/kSmallTakeSelectionFactor,
+                /*indices_with_nulls=*/true)
+      .ChunkedString(/*num_chunks=*/100, /*chunk_indices_too=*/true);
+}
+
 static void TakeChunkedChunkedStringMonotonicIndices(benchmark::State& state) {
   TakeBenchmark(state, /*indices_with_nulls=*/false, /*monotonic=*/true)
.ChunkedString(/*num_chunks=*/100, /*chunk_indices_too=*/true); } +static void TakeChunkedChunkedStringFewMonotonicIndices(benchmark::State& state) { + TakeBenchmark(state, /*selection_factor=*/kSmallTakeSelectionFactor, + /*indices_with_nulls=*/false, /*monotonic=*/true) + .ChunkedString(/*num_chunks=*/100, /*chunk_indices_too=*/true); +} + static void TakeChunkedFlatInt64RandomIndicesNoNulls(benchmark::State& state) { TakeBenchmark(state, /*indices_with_nulls=*/false) .ChunkedInt64(/*num_chunks=*/100, /*chunk_indices_too=*/false); @@ -478,12 +528,25 @@ static void TakeChunkedFlatInt64RandomIndicesWithNulls(benchmark::State& state) .ChunkedInt64(/*num_chunks=*/100, /*chunk_indices_too=*/false); } +static void TakeChunkedFlatInt64FewRandomIndicesWithNulls(benchmark::State& state) { + TakeBenchmark(state, /*selection_factor=*/kSmallTakeSelectionFactor, + /*indices_with_nulls=*/true) + .ChunkedInt64(/*num_chunks=*/100, /*chunk_indices_too=*/false); +} + static void TakeChunkedFlatInt64MonotonicIndices(benchmark::State& state) { TakeBenchmark(state, /*indices_with_nulls=*/false, /*monotonic=*/true) .ChunkedInt64( /*num_chunks=*/100, /*chunk_indices_too=*/false); } +static void TakeChunkedFlatInt64FewMonotonicIndices(benchmark::State& state) { + TakeBenchmark(state, /*selection_factor=*/kSmallTakeSelectionFactor, + /*indices_with_nulls=*/false, /*monotonic=*/true) + .ChunkedInt64( + /*num_chunks=*/100, /*chunk_indices_too=*/false); +} + void FilterSetArgs(benchmark::internal::Benchmark* bench) { for (int64_t size : g_data_sizes) { for (int i = 0; i < static_cast(g_filter_params.size()); ++i) { @@ -560,18 +623,24 @@ BENCHMARK(TakeStringMonotonicIndices)->Apply(TakeSetArgs); // Chunked values x Chunked indices BENCHMARK(TakeChunkedChunkedInt64RandomIndicesNoNulls)->Apply(TakeSetArgs); BENCHMARK(TakeChunkedChunkedInt64RandomIndicesWithNulls)->Apply(TakeSetArgs); +BENCHMARK(TakeChunkedChunkedInt64FewRandomIndicesWithNulls)->Apply(TakeSetArgs); BENCHMARK(TakeChunkedChunkedInt64MonotonicIndices)->Apply(TakeSetArgs); +BENCHMARK(TakeChunkedChunkedInt64FewMonotonicIndices)->Apply(TakeSetArgs); BENCHMARK(TakeChunkedChunkedFSBRandomIndicesNoNulls)->Apply(TakeFSBSetArgs); BENCHMARK(TakeChunkedChunkedFSBRandomIndicesWithNulls)->Apply(TakeFSBSetArgs); BENCHMARK(TakeChunkedChunkedFSBMonotonicIndices)->Apply(TakeFSBSetArgs); BENCHMARK(TakeChunkedChunkedStringRandomIndicesNoNulls)->Apply(TakeSetArgs); BENCHMARK(TakeChunkedChunkedStringRandomIndicesWithNulls)->Apply(TakeSetArgs); +BENCHMARK(TakeChunkedChunkedStringFewRandomIndicesWithNulls)->Apply(TakeSetArgs); BENCHMARK(TakeChunkedChunkedStringMonotonicIndices)->Apply(TakeSetArgs); +BENCHMARK(TakeChunkedChunkedStringFewMonotonicIndices)->Apply(TakeSetArgs); // Chunked values x Flat indices BENCHMARK(TakeChunkedFlatInt64RandomIndicesNoNulls)->Apply(TakeSetArgs); BENCHMARK(TakeChunkedFlatInt64RandomIndicesWithNulls)->Apply(TakeSetArgs); +BENCHMARK(TakeChunkedFlatInt64FewRandomIndicesWithNulls)->Apply(TakeSetArgs); BENCHMARK(TakeChunkedFlatInt64MonotonicIndices)->Apply(TakeSetArgs); +BENCHMARK(TakeChunkedFlatInt64FewMonotonicIndices)->Apply(TakeSetArgs); } // namespace compute } // namespace arrow From f078942ce2df68de8f48c3b4233132133601ca53 Mon Sep 17 00:00:00 2001 From: Adam Reeve Date: Thu, 22 Aug 2024 02:59:04 +1200 Subject: [PATCH 008/186] GH-43141: [C++][Parquet] Replace use of int with int32_t in the internal Parquet encryption APIs (#43413) ### Rationale for this change See #43141 ### What changes are included in this PR? 
* Changes uses of int to int32_t in the Encryptor and Decryptor APIs, except where interfacing with OpenSSL. * Also change RandBytes to use size_t instead of int and check for overflow. * Check the return code from OpenSSL's Rand_bytes in case there is a failure generating random bytes ### Are these changes tested? Yes, this doesn't change behaviour and is covered by existing tests. ### Are there any user-facing changes? No * GitHub Issue: #43141 Authored-by: Adam Reeve Signed-off-by: Antoine Pitrou --- cpp/src/parquet/column_reader.cc | 4 +- cpp/src/parquet/encryption/crypto_factory.cc | 6 +- .../parquet/encryption/encryption_internal.cc | 251 ++++++++++-------- .../parquet/encryption/encryption_internal.h | 46 ++-- .../encryption/encryption_internal_nossl.cc | 47 ++-- .../encryption/encryption_internal_test.cc | 22 +- .../parquet/encryption/file_key_wrapper.cc | 4 +- .../encryption/internal_file_decryptor.cc | 12 +- .../encryption/internal_file_decryptor.h | 8 +- .../encryption/internal_file_encryptor.cc | 10 +- .../encryption/internal_file_encryptor.h | 6 +- .../encryption/key_toolkit_internal.cc | 2 +- cpp/src/parquet/metadata.cc | 6 +- cpp/src/parquet/thrift_internal.h | 2 +- 14 files changed, 233 insertions(+), 193 deletions(-) diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc index 05ee6a16c5448..60a8a2176b0a8 100644 --- a/cpp/src/parquet/column_reader.cc +++ b/cpp/src/parquet/column_reader.cc @@ -468,8 +468,8 @@ std::shared_ptr SerializedPageReader::NextPage() { // Advance the stream offset PARQUET_THROW_NOT_OK(stream_->Advance(header_size)); - int compressed_len = current_page_header_.compressed_page_size; - int uncompressed_len = current_page_header_.uncompressed_page_size; + int32_t compressed_len = current_page_header_.compressed_page_size; + int32_t uncompressed_len = current_page_header_.uncompressed_page_size; if (compressed_len < 0 || uncompressed_len < 0) { throw ParquetException("Invalid page header"); } diff --git a/cpp/src/parquet/encryption/crypto_factory.cc b/cpp/src/parquet/encryption/crypto_factory.cc index 72506bdc014b6..56069d559771c 100644 --- a/cpp/src/parquet/encryption/crypto_factory.cc +++ b/cpp/src/parquet/encryption/crypto_factory.cc @@ -72,8 +72,7 @@ std::shared_ptr CryptoFactory::GetFileEncryptionProper int dek_length = dek_length_bits / 8; std::string footer_key(dek_length, '\0'); - RandBytes(reinterpret_cast(&footer_key[0]), - static_cast(footer_key.size())); + RandBytes(reinterpret_cast(footer_key.data()), footer_key.size()); std::string footer_key_metadata = key_wrapper.GetEncryptionKeyMetadata(footer_key, footer_key_id, true); @@ -148,8 +147,7 @@ ColumnPathToEncryptionPropertiesMap CryptoFactory::GetColumnEncryptionProperties } std::string column_key(dek_length, '\0'); - RandBytes(reinterpret_cast(&column_key[0]), - static_cast(column_key.size())); + RandBytes(reinterpret_cast(column_key.data()), column_key.size()); std::string column_key_key_metadata = key_wrapper->GetEncryptionKeyMetadata(column_key, column_key_id, false); diff --git a/cpp/src/parquet/encryption/encryption_internal.cc b/cpp/src/parquet/encryption/encryption_internal.cc index 99d1707f4a8d4..a0d9367b619c6 100644 --- a/cpp/src/parquet/encryption/encryption_internal.cc +++ b/cpp/src/parquet/encryption/encryption_internal.cc @@ -18,6 +18,7 @@ #include "parquet/encryption/encryption_internal.h" #include +#include #include #include @@ -36,10 +37,10 @@ using parquet::ParquetException; namespace parquet::encryption { -constexpr int kGcmMode = 0; -constexpr int 
kCtrMode = 1; -constexpr int kCtrIvLength = 16; -constexpr int kBufferSizeLength = 4; +constexpr int32_t kGcmMode = 0; +constexpr int32_t kCtrMode = 1; +constexpr int32_t kCtrIvLength = 16; +constexpr int32_t kBufferSizeLength = 4; #define ENCRYPT_INIT(CTX, ALG) \ if (1 != EVP_EncryptInit_ex(CTX, ALG, nullptr, nullptr, nullptr)) { \ @@ -53,17 +54,17 @@ constexpr int kBufferSizeLength = 4; class AesEncryptor::AesEncryptorImpl { public: - explicit AesEncryptorImpl(ParquetCipher::type alg_id, int key_len, bool metadata, + explicit AesEncryptorImpl(ParquetCipher::type alg_id, int32_t key_len, bool metadata, bool write_length); ~AesEncryptorImpl() { WipeOut(); } - int Encrypt(span plaintext, span key, - span aad, span ciphertext); + int32_t Encrypt(span plaintext, span key, + span aad, span ciphertext); - int SignedFooterEncrypt(span footer, span key, - span aad, span nonce, - span encrypted_footer); + int32_t SignedFooterEncrypt(span footer, span key, + span aad, span nonce, + span encrypted_footer); void WipeOut() { if (nullptr != ctx_) { EVP_CIPHER_CTX_free(ctx_); @@ -89,21 +90,22 @@ class AesEncryptor::AesEncryptorImpl { private: EVP_CIPHER_CTX* ctx_; - int aes_mode_; - int key_length_; - int ciphertext_size_delta_; - int length_buffer_length_; + int32_t aes_mode_; + int32_t key_length_; + int32_t ciphertext_size_delta_; + int32_t length_buffer_length_; - int GcmEncrypt(span plaintext, span key, - span nonce, span aad, - span ciphertext); + int32_t GcmEncrypt(span plaintext, span key, + span nonce, span aad, + span ciphertext); - int CtrEncrypt(span plaintext, span key, - span nonce, span ciphertext); + int32_t CtrEncrypt(span plaintext, span key, + span nonce, span ciphertext); }; -AesEncryptor::AesEncryptorImpl::AesEncryptorImpl(ParquetCipher::type alg_id, int key_len, - bool metadata, bool write_length) { +AesEncryptor::AesEncryptorImpl::AesEncryptorImpl(ParquetCipher::type alg_id, + int32_t key_len, bool metadata, + bool write_length) { openssl::EnsureInitialized(); ctx_ = nullptr; @@ -151,11 +153,9 @@ AesEncryptor::AesEncryptorImpl::AesEncryptorImpl(ParquetCipher::type alg_id, int } } -int AesEncryptor::AesEncryptorImpl::SignedFooterEncrypt(span footer, - span key, - span aad, - span nonce, - span encrypted_footer) { +int32_t AesEncryptor::AesEncryptorImpl::SignedFooterEncrypt( + span footer, span key, span aad, + span nonce, span encrypted_footer) { if (static_cast(key_length_) != key.size()) { std::stringstream ss; ss << "Wrong key length " << key.size() << ". Should be " << key_length_; @@ -176,10 +176,10 @@ int AesEncryptor::AesEncryptorImpl::SignedFooterEncrypt(span foot return GcmEncrypt(footer, key, nonce, aad, encrypted_footer); } -int AesEncryptor::AesEncryptorImpl::Encrypt(span plaintext, - span key, - span aad, - span ciphertext) { +int32_t AesEncryptor::AesEncryptorImpl::Encrypt(span plaintext, + span key, + span aad, + span ciphertext) { if (static_cast(key_length_) != key.size()) { std::stringstream ss; ss << "Wrong key length " << key.size() << ". 
Should be " << key_length_; @@ -205,13 +205,13 @@ int AesEncryptor::AesEncryptorImpl::Encrypt(span plaintext, return CtrEncrypt(plaintext, key, nonce, ciphertext); } -int AesEncryptor::AesEncryptorImpl::GcmEncrypt(span plaintext, - span key, - span nonce, - span aad, - span ciphertext) { +int32_t AesEncryptor::AesEncryptorImpl::GcmEncrypt(span plaintext, + span key, + span nonce, + span aad, + span ciphertext) { int len; - int ciphertext_len; + int32_t ciphertext_len; std::array tag{}; @@ -227,12 +227,22 @@ int AesEncryptor::AesEncryptorImpl::GcmEncrypt(span plaintext, } // Setting additional authenticated data + if (aad.size() > static_cast(std::numeric_limits::max())) { + std::stringstream ss; + ss << "AAD size " << aad.size() << " overflows int"; + throw ParquetException(ss.str()); + } if ((!aad.empty()) && (1 != EVP_EncryptUpdate(ctx_, nullptr, &len, aad.data(), static_cast(aad.size())))) { throw ParquetException("Couldn't set AAD"); } // Encryption + if (plaintext.size() > static_cast(std::numeric_limits::max())) { + std::stringstream ss; + ss << "Plaintext size " << plaintext.size() << " overflows int"; + throw ParquetException(ss.str()); + } if (1 != EVP_EncryptUpdate(ctx_, ciphertext.data() + length_buffer_length_ + kNonceLength, &len, plaintext.data(), static_cast(plaintext.size()))) { @@ -256,7 +266,7 @@ int AesEncryptor::AesEncryptorImpl::GcmEncrypt(span plaintext, } // Copying the buffer size, nonce and tag to ciphertext - int buffer_size = kNonceLength + ciphertext_len + kGcmTagLength; + int32_t buffer_size = kNonceLength + ciphertext_len + kGcmTagLength; if (length_buffer_length_ > 0) { ciphertext[3] = static_cast(0xff & (buffer_size >> 24)); ciphertext[2] = static_cast(0xff & (buffer_size >> 16)); @@ -271,12 +281,12 @@ int AesEncryptor::AesEncryptorImpl::GcmEncrypt(span plaintext, return length_buffer_length_ + buffer_size; } -int AesEncryptor::AesEncryptorImpl::CtrEncrypt(span plaintext, - span key, - span nonce, - span ciphertext) { +int32_t AesEncryptor::AesEncryptorImpl::CtrEncrypt(span plaintext, + span key, + span nonce, + span ciphertext) { int len; - int ciphertext_len; + int32_t ciphertext_len; if (nonce.size() != static_cast(kNonceLength)) { std::stringstream ss; @@ -298,6 +308,11 @@ int AesEncryptor::AesEncryptorImpl::CtrEncrypt(span plaintext, } // Encryption + if (plaintext.size() > static_cast(std::numeric_limits::max())) { + std::stringstream ss; + ss << "Plaintext size " << plaintext.size() << " overflows int"; + throw ParquetException(ss.str()); + } if (1 != EVP_EncryptUpdate(ctx_, ciphertext.data() + length_buffer_length_ + kNonceLength, &len, plaintext.data(), static_cast(plaintext.size()))) { @@ -316,7 +331,7 @@ int AesEncryptor::AesEncryptorImpl::CtrEncrypt(span plaintext, ciphertext_len += len; // Copying the buffer size and nonce to ciphertext - int buffer_size = kNonceLength + ciphertext_len; + int32_t buffer_size = kNonceLength + ciphertext_len; if (length_buffer_length_ > 0) { ciphertext[3] = static_cast(0xff & (buffer_size >> 24)); ciphertext[2] = static_cast(0xff & (buffer_size >> 16)); @@ -331,9 +346,11 @@ int AesEncryptor::AesEncryptorImpl::CtrEncrypt(span plaintext, AesEncryptor::~AesEncryptor() {} -int AesEncryptor::SignedFooterEncrypt(span footer, span key, - span aad, span nonce, - span encrypted_footer) { +int32_t AesEncryptor::SignedFooterEncrypt(span footer, + span key, + span aad, + span nonce, + span encrypted_footer) { return impl_->SignedFooterEncrypt(footer, key, aad, nonce, encrypted_footer); } @@ -343,25 +360,25 @@ int32_t 
AesEncryptor::CiphertextLength(int64_t plaintext_len) const { return impl_->CiphertextLength(plaintext_len); } -int AesEncryptor::Encrypt(span plaintext, span key, - span aad, span ciphertext) { +int32_t AesEncryptor::Encrypt(span plaintext, span key, + span aad, span ciphertext) { return impl_->Encrypt(plaintext, key, aad, ciphertext); } -AesEncryptor::AesEncryptor(ParquetCipher::type alg_id, int key_len, bool metadata, +AesEncryptor::AesEncryptor(ParquetCipher::type alg_id, int32_t key_len, bool metadata, bool write_length) : impl_{std::unique_ptr( new AesEncryptorImpl(alg_id, key_len, metadata, write_length))} {} class AesDecryptor::AesDecryptorImpl { public: - explicit AesDecryptorImpl(ParquetCipher::type alg_id, int key_len, bool metadata, + explicit AesDecryptorImpl(ParquetCipher::type alg_id, int32_t key_len, bool metadata, bool contains_length); ~AesDecryptorImpl() { WipeOut(); } - int Decrypt(span ciphertext, span key, - span aad, span plaintext); + int32_t Decrypt(span ciphertext, span key, + span aad, span plaintext); void WipeOut() { if (nullptr != ctx_) { @@ -370,7 +387,7 @@ class AesDecryptor::AesDecryptorImpl { } } - [[nodiscard]] int PlaintextLength(int ciphertext_len) const { + [[nodiscard]] int32_t PlaintextLength(int32_t ciphertext_len) const { if (ciphertext_len < ciphertext_size_delta_) { std::stringstream ss; ss << "Ciphertext length " << ciphertext_len << " is invalid, expected at least " @@ -380,12 +397,13 @@ class AesDecryptor::AesDecryptorImpl { return ciphertext_len - ciphertext_size_delta_; } - [[nodiscard]] int CiphertextLength(int plaintext_len) const { + [[nodiscard]] int32_t CiphertextLength(int32_t plaintext_len) const { if (plaintext_len < 0) { std::stringstream ss; ss << "Negative plaintext length " << plaintext_len; throw ParquetException(ss.str()); - } else if (plaintext_len > std::numeric_limits::max() - ciphertext_size_delta_) { + } else if (plaintext_len > + std::numeric_limits::max() - ciphertext_size_delta_) { std::stringstream ss; ss << "Plaintext length " << plaintext_len << " plus ciphertext size delta " << ciphertext_size_delta_ << " overflows int32"; @@ -396,24 +414,24 @@ class AesDecryptor::AesDecryptorImpl { private: EVP_CIPHER_CTX* ctx_; - int aes_mode_; - int key_length_; - int ciphertext_size_delta_; - int length_buffer_length_; + int32_t aes_mode_; + int32_t key_length_; + int32_t ciphertext_size_delta_; + int32_t length_buffer_length_; /// Get the actual ciphertext length, inclusive of the length buffer length, /// and validate that the provided buffer size is large enough. 
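To make the length bookkeeping above concrete: with the constants defined earlier in this patch, a GCM ciphertext is laid out as [optional 4-byte length prefix][12-byte nonce][body][16-byte tag], and a CTR ciphertext omits the tag. A rough sketch of the resulting arithmetic (`GcmCiphertextLength` is a hypothetical illustration, not part of the patch):

```cpp
#include <cstdint>

// Layout constants matching kBufferSizeLength, kNonceLength and
// kGcmTagLength in the patch above.
constexpr int32_t kLengthPrefix = 4;
constexpr int32_t kNonce = 12;
constexpr int32_t kGcmTag = 16;

// Hypothetical helper: total AES-GCM ciphertext size, i.e. the plaintext
// length plus the ciphertext_size_delta_ tracked by the encryptor.
int32_t GcmCiphertextLength(int32_t plaintext_len, bool write_length) {
  return (write_length ? kLengthPrefix : 0) + kNonce + plaintext_len + kGcmTag;
}
```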
- [[nodiscard]] int GetCiphertextLength(span ciphertext) const; + [[nodiscard]] int32_t GetCiphertextLength(span ciphertext) const; - int GcmDecrypt(span ciphertext, span key, - span aad, span plaintext); + int32_t GcmDecrypt(span ciphertext, span key, + span aad, span plaintext); - int CtrDecrypt(span ciphertext, span key, - span plaintext); + int32_t CtrDecrypt(span ciphertext, span key, + span plaintext); }; -int AesDecryptor::Decrypt(span ciphertext, span key, - span aad, span plaintext) { +int32_t AesDecryptor::Decrypt(span ciphertext, span key, + span aad, span plaintext) { return impl_->Decrypt(ciphertext, key, aad, plaintext); } @@ -421,8 +439,9 @@ void AesDecryptor::WipeOut() { impl_->WipeOut(); } AesDecryptor::~AesDecryptor() {} -AesDecryptor::AesDecryptorImpl::AesDecryptorImpl(ParquetCipher::type alg_id, int key_len, - bool metadata, bool contains_length) { +AesDecryptor::AesDecryptorImpl::AesDecryptorImpl(ParquetCipher::type alg_id, + int32_t key_len, bool metadata, + bool contains_length) { openssl::EnsureInitialized(); ctx_ = nullptr; @@ -469,13 +488,14 @@ AesDecryptor::AesDecryptorImpl::AesDecryptorImpl(ParquetCipher::type alg_id, int } } -std::unique_ptr AesEncryptor::Make(ParquetCipher::type alg_id, int key_len, - bool metadata) { +std::unique_ptr AesEncryptor::Make(ParquetCipher::type alg_id, + int32_t key_len, bool metadata) { return Make(alg_id, key_len, metadata, true /*write_length*/); } -std::unique_ptr AesEncryptor::Make(ParquetCipher::type alg_id, int key_len, - bool metadata, bool write_length) { +std::unique_ptr AesEncryptor::Make(ParquetCipher::type alg_id, + int32_t key_len, bool metadata, + bool write_length) { if (ParquetCipher::AES_GCM_V1 != alg_id && ParquetCipher::AES_GCM_CTR_V1 != alg_id) { std::stringstream ss; ss << "Crypto algorithm " << alg_id << " is not supported"; @@ -485,13 +505,13 @@ std::unique_ptr AesEncryptor::Make(ParquetCipher::type alg_id, int return std::make_unique(alg_id, key_len, metadata, write_length); } -AesDecryptor::AesDecryptor(ParquetCipher::type alg_id, int key_len, bool metadata, +AesDecryptor::AesDecryptor(ParquetCipher::type alg_id, int32_t key_len, bool metadata, bool contains_length) : impl_{std::unique_ptr( new AesDecryptorImpl(alg_id, key_len, metadata, contains_length))} {} std::shared_ptr AesDecryptor::Make( - ParquetCipher::type alg_id, int key_len, bool metadata, + ParquetCipher::type alg_id, int32_t key_len, bool metadata, std::vector>* all_decryptors) { if (ParquetCipher::AES_GCM_V1 != alg_id && ParquetCipher::AES_GCM_CTR_V1 != alg_id) { std::stringstream ss; @@ -506,15 +526,15 @@ std::shared_ptr AesDecryptor::Make( return decryptor; } -int AesDecryptor::PlaintextLength(int ciphertext_len) const { +int32_t AesDecryptor::PlaintextLength(int32_t ciphertext_len) const { return impl_->PlaintextLength(ciphertext_len); } -int AesDecryptor::CiphertextLength(int plaintext_len) const { +int32_t AesDecryptor::CiphertextLength(int32_t plaintext_len) const { return impl_->CiphertextLength(plaintext_len); } -int AesDecryptor::AesDecryptorImpl::GetCiphertextLength( +int32_t AesDecryptor::AesDecryptorImpl::GetCiphertextLength( span ciphertext) const { if (length_buffer_length_ > 0) { // Note: length_buffer_length_ must be either 0 or kBufferSizeLength @@ -533,10 +553,11 @@ int AesDecryptor::AesDecryptorImpl::GetCiphertextLength( (static_cast(ciphertext[0])); if (written_ciphertext_len > - static_cast(std::numeric_limits::max() - length_buffer_length_)) { + static_cast(std::numeric_limits::max() - + length_buffer_length_)) { 
std::stringstream ss; ss << "Written ciphertext length " << written_ciphertext_len - << " plus length buffer length " << length_buffer_length_ << " overflows int"; + << " plus length buffer length " << length_buffer_length_ << " overflows int32"; throw ParquetException(ss.str()); } else if (ciphertext.size() < static_cast(written_ciphertext_len) + length_buffer_length_) { @@ -548,28 +569,28 @@ int AesDecryptor::AesDecryptorImpl::GetCiphertextLength( throw ParquetException(ss.str()); } - return static_cast(written_ciphertext_len) + length_buffer_length_; + return static_cast(written_ciphertext_len) + length_buffer_length_; } else { - if (ciphertext.size() > static_cast(std::numeric_limits::max())) { + if (ciphertext.size() > static_cast(std::numeric_limits::max())) { std::stringstream ss; - ss << "Ciphertext buffer length " << ciphertext.size() << " overflows int"; + ss << "Ciphertext buffer length " << ciphertext.size() << " overflows int32"; throw ParquetException(ss.str()); } - return static_cast(ciphertext.size()); + return static_cast(ciphertext.size()); } } -int AesDecryptor::AesDecryptorImpl::GcmDecrypt(span ciphertext, - span key, - span aad, - span plaintext) { +int32_t AesDecryptor::AesDecryptorImpl::GcmDecrypt(span ciphertext, + span key, + span aad, + span plaintext) { int len; - int plaintext_len; + int32_t plaintext_len; std::array tag{}; std::array nonce{}; - int ciphertext_len = GetCiphertextLength(ciphertext); + int32_t ciphertext_len = GetCiphertextLength(ciphertext); if (plaintext.size() < static_cast(ciphertext_len) - ciphertext_size_delta_) { std::stringstream ss; @@ -597,16 +618,22 @@ int AesDecryptor::AesDecryptorImpl::GcmDecrypt(span ciphertext, } // Setting additional authenticated data + if (aad.size() > static_cast(std::numeric_limits::max())) { + std::stringstream ss; + ss << "AAD size " << aad.size() << " overflows int"; + throw ParquetException(ss.str()); + } if ((!aad.empty()) && (1 != EVP_DecryptUpdate(ctx_, nullptr, &len, aad.data(), static_cast(aad.size())))) { throw ParquetException("Couldn't set AAD"); } // Decryption - if (!EVP_DecryptUpdate( - ctx_, plaintext.data(), &len, - ciphertext.data() + length_buffer_length_ + kNonceLength, - ciphertext_len - length_buffer_length_ - kNonceLength - kGcmTagLength)) { + int decryption_length = + ciphertext_len - length_buffer_length_ - kNonceLength - kGcmTagLength; + if (!EVP_DecryptUpdate(ctx_, plaintext.data(), &len, + ciphertext.data() + length_buffer_length_ + kNonceLength, + decryption_length)) { throw ParquetException("Failed decryption update"); } @@ -626,15 +653,15 @@ int AesDecryptor::AesDecryptorImpl::GcmDecrypt(span ciphertext, return plaintext_len; } -int AesDecryptor::AesDecryptorImpl::CtrDecrypt(span ciphertext, - span key, - span plaintext) { +int32_t AesDecryptor::AesDecryptorImpl::CtrDecrypt(span ciphertext, + span key, + span plaintext) { int len; - int plaintext_len; + int32_t plaintext_len; std::array iv{}; - int ciphertext_len = GetCiphertextLength(ciphertext); + int32_t ciphertext_len = GetCiphertextLength(ciphertext); if (plaintext.size() < static_cast(ciphertext_len) - ciphertext_size_delta_) { std::stringstream ss; @@ -665,9 +692,10 @@ int AesDecryptor::AesDecryptorImpl::CtrDecrypt(span ciphertext, } // Decryption + int decryption_length = ciphertext_len - length_buffer_length_ - kNonceLength; if (!EVP_DecryptUpdate(ctx_, plaintext.data(), &len, ciphertext.data() + length_buffer_length_ + kNonceLength, - ciphertext_len - length_buffer_length_ - kNonceLength)) { + decryption_length)) { throw 
ParquetException("Failed decryption update"); } @@ -682,10 +710,10 @@ int AesDecryptor::AesDecryptorImpl::CtrDecrypt(span ciphertext, return plaintext_len; } -int AesDecryptor::AesDecryptorImpl::Decrypt(span ciphertext, - span key, - span aad, - span plaintext) { +int32_t AesDecryptor::AesDecryptorImpl::Decrypt(span ciphertext, + span key, + span aad, + span plaintext) { if (static_cast(key_length_) != key.size()) { std::stringstream ss; ss << "Wrong key length " << key.size() << ". Should be " << key_length_; @@ -758,9 +786,22 @@ void QuickUpdatePageAad(int32_t new_page_ordinal, std::string* AAD) { std::memcpy(AAD->data() + AAD->length() - 2, page_ordinal_bytes.data(), 2); } -void RandBytes(unsigned char* buf, int num) { +void RandBytes(unsigned char* buf, size_t num) { + if (num > static_cast(std::numeric_limits::max())) { + std::stringstream ss; + ss << "Length " << num << " for RandBytes overflows int"; + throw ParquetException(ss.str()); + } openssl::EnsureInitialized(); - RAND_bytes(buf, num); + int status = RAND_bytes(buf, static_cast(num)); + if (status != 1) { + const auto error_code = ERR_get_error(); + char buffer[256]; + ERR_error_string_n(error_code, buffer, sizeof(buffer)); + std::stringstream ss; + ss << "Failed to generate random bytes: " << buffer; + throw ParquetException(ss.str()); + } } void EnsureBackendInitialized() { openssl::EnsureInitialized(); } diff --git a/cpp/src/parquet/encryption/encryption_internal.h b/cpp/src/parquet/encryption/encryption_internal.h index c874b137ad1ad..d79ff56ad49be 100644 --- a/cpp/src/parquet/encryption/encryption_internal.h +++ b/cpp/src/parquet/encryption/encryption_internal.h @@ -29,8 +29,8 @@ using parquet::ParquetCipher; namespace parquet::encryption { -constexpr int kGcmTagLength = 16; -constexpr int kNonceLength = 12; +constexpr int32_t kGcmTagLength = 16; +constexpr int32_t kNonceLength = 12; // Module types constexpr int8_t kFooter = 0; @@ -49,13 +49,13 @@ class PARQUET_EXPORT AesEncryptor { public: /// Can serve one key length only. Possible values: 16, 24, 32 bytes. /// If write_length is true, prepend ciphertext length to the ciphertext - explicit AesEncryptor(ParquetCipher::type alg_id, int key_len, bool metadata, + explicit AesEncryptor(ParquetCipher::type alg_id, int32_t key_len, bool metadata, bool write_length = true); - static std::unique_ptr Make(ParquetCipher::type alg_id, int key_len, + static std::unique_ptr Make(ParquetCipher::type alg_id, int32_t key_len, bool metadata); - static std::unique_ptr Make(ParquetCipher::type alg_id, int key_len, + static std::unique_ptr Make(ParquetCipher::type alg_id, int32_t key_len, bool metadata, bool write_length); ~AesEncryptor(); @@ -65,17 +65,17 @@ class PARQUET_EXPORT AesEncryptor { /// Encrypts plaintext with the key and aad. Key length is passed only for validation. /// If different from value in constructor, exception will be thrown. - int Encrypt(::arrow::util::span plaintext, - ::arrow::util::span key, - ::arrow::util::span aad, - ::arrow::util::span ciphertext); + int32_t Encrypt(::arrow::util::span plaintext, + ::arrow::util::span key, + ::arrow::util::span aad, + ::arrow::util::span ciphertext); /// Encrypts plaintext footer, in order to compute footer signature (tag). 
- int SignedFooterEncrypt(::arrow::util::span footer, - ::arrow::util::span key, - ::arrow::util::span aad, - ::arrow::util::span nonce, - ::arrow::util::span encrypted_footer); + int32_t SignedFooterEncrypt(::arrow::util::span footer, + ::arrow::util::span key, + ::arrow::util::span aad, + ::arrow::util::span nonce, + ::arrow::util::span encrypted_footer); void WipeOut(); @@ -90,7 +90,7 @@ class PARQUET_EXPORT AesDecryptor { public: /// Can serve one key length only. Possible values: 16, 24, 32 bytes. /// If contains_length is true, expect ciphertext length prepended to the ciphertext - explicit AesDecryptor(ParquetCipher::type alg_id, int key_len, bool metadata, + explicit AesDecryptor(ParquetCipher::type alg_id, int32_t key_len, bool metadata, bool contains_length = true); /// \brief Factory function to create an AesDecryptor @@ -102,26 +102,26 @@ class PARQUET_EXPORT AesDecryptor { /// out when decryption is finished /// \return shared pointer to a new AesDecryptor static std::shared_ptr Make( - ParquetCipher::type alg_id, int key_len, bool metadata, + ParquetCipher::type alg_id, int32_t key_len, bool metadata, std::vector>* all_decryptors); ~AesDecryptor(); void WipeOut(); /// The size of the plaintext, for this cipher and the specified ciphertext length. - [[nodiscard]] int PlaintextLength(int ciphertext_len) const; + [[nodiscard]] int32_t PlaintextLength(int32_t ciphertext_len) const; /// The size of the ciphertext, for this cipher and the specified plaintext length. - [[nodiscard]] int CiphertextLength(int plaintext_len) const; + [[nodiscard]] int32_t CiphertextLength(int32_t plaintext_len) const; /// Decrypts ciphertext with the key and aad. Key length is passed only for /// validation. If different from value in constructor, exception will be thrown. /// The caller is responsible for ensuring that the plaintext buffer is at least as /// large as PlaintextLength(ciphertext_len). - int Decrypt(::arrow::util::span ciphertext, - ::arrow::util::span key, - ::arrow::util::span aad, - ::arrow::util::span plaintext); + int32_t Decrypt(::arrow::util::span ciphertext, + ::arrow::util::span key, + ::arrow::util::span aad, + ::arrow::util::span plaintext); private: // PIMPL Idiom @@ -139,7 +139,7 @@ std::string CreateFooterAad(const std::string& aad_prefix_bytes); void QuickUpdatePageAad(int32_t new_page_ordinal, std::string* AAD); // Wraps OpenSSL RAND_bytes function -void RandBytes(unsigned char* buf, int num); +void RandBytes(unsigned char* buf, size_t num); // Ensure OpenSSL is initialized. 
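Since `QuickUpdatePageAad` only rewrites the trailing page ordinal, its effect can be sketched in a few lines. This assumes the module AAD ends with the page ordinal encoded as a little-endian 16-bit value, matching the two-byte `std::memcpy` in the implementation earlier in this patch (`UpdatePageOrdinal` is an illustrative stand-in, not the real function):

```cpp
#include <cstdint>
#include <string>

// Illustrative stand-in for QuickUpdatePageAad: overwrite the last two
// bytes of the AAD (assumed length >= 2) with the new page ordinal,
// little-endian.
void UpdatePageOrdinal(std::string* aad, int32_t new_page_ordinal) {
  const auto ordinal = static_cast<uint16_t>(new_page_ordinal);
  (*aad)[aad->size() - 2] = static_cast<char>(ordinal & 0xFF);
  (*aad)[aad->size() - 1] = static_cast<char>((ordinal >> 8) & 0xFF);
}
```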
// diff --git a/cpp/src/parquet/encryption/encryption_internal_nossl.cc b/cpp/src/parquet/encryption/encryption_internal_nossl.cc index 2cce83915d7e5..2a8162ed3964b 100644 --- a/cpp/src/parquet/encryption/encryption_internal_nossl.cc +++ b/cpp/src/parquet/encryption/encryption_internal_nossl.cc @@ -29,11 +29,11 @@ class AesEncryptor::AesEncryptorImpl {}; AesEncryptor::~AesEncryptor() {} -int AesEncryptor::SignedFooterEncrypt(::arrow::util::span footer, - ::arrow::util::span key, - ::arrow::util::span aad, - ::arrow::util::span nonce, - ::arrow::util::span encrypted_footer) { +int32_t AesEncryptor::SignedFooterEncrypt(::arrow::util::span footer, + ::arrow::util::span key, + ::arrow::util::span aad, + ::arrow::util::span nonce, + ::arrow::util::span encrypted_footer) { ThrowOpenSSLRequiredException(); return -1; } @@ -45,25 +45,25 @@ int32_t AesEncryptor::CiphertextLength(int64_t plaintext_len) const { return -1; } -int AesEncryptor::Encrypt(::arrow::util::span plaintext, - ::arrow::util::span key, - ::arrow::util::span aad, - ::arrow::util::span ciphertext) { +int32_t AesEncryptor::Encrypt(::arrow::util::span plaintext, + ::arrow::util::span key, + ::arrow::util::span aad, + ::arrow::util::span ciphertext) { ThrowOpenSSLRequiredException(); return -1; } -AesEncryptor::AesEncryptor(ParquetCipher::type alg_id, int key_len, bool metadata, +AesEncryptor::AesEncryptor(ParquetCipher::type alg_id, int32_t key_len, bool metadata, bool write_length) { ThrowOpenSSLRequiredException(); } class AesDecryptor::AesDecryptorImpl {}; -int AesDecryptor::Decrypt(::arrow::util::span ciphertext, - ::arrow::util::span key, - ::arrow::util::span aad, - ::arrow::util::span plaintext) { +int32_t AesDecryptor::Decrypt(::arrow::util::span ciphertext, + ::arrow::util::span key, + ::arrow::util::span aad, + ::arrow::util::span plaintext) { ThrowOpenSSLRequiredException(); return -1; } @@ -72,36 +72,37 @@ void AesDecryptor::WipeOut() { ThrowOpenSSLRequiredException(); } AesDecryptor::~AesDecryptor() {} -std::unique_ptr AesEncryptor::Make(ParquetCipher::type alg_id, int key_len, - bool metadata) { +std::unique_ptr AesEncryptor::Make(ParquetCipher::type alg_id, + int32_t key_len, bool metadata) { ThrowOpenSSLRequiredException(); return NULLPTR; } -std::unique_ptr AesEncryptor::Make(ParquetCipher::type alg_id, int key_len, - bool metadata, bool write_length) { +std::unique_ptr AesEncryptor::Make(ParquetCipher::type alg_id, + int32_t key_len, bool metadata, + bool write_length) { ThrowOpenSSLRequiredException(); return NULLPTR; } -AesDecryptor::AesDecryptor(ParquetCipher::type alg_id, int key_len, bool metadata, +AesDecryptor::AesDecryptor(ParquetCipher::type alg_id, int32_t key_len, bool metadata, bool contains_length) { ThrowOpenSSLRequiredException(); } std::shared_ptr AesDecryptor::Make( - ParquetCipher::type alg_id, int key_len, bool metadata, + ParquetCipher::type alg_id, int32_t key_len, bool metadata, std::vector>* all_decryptors) { ThrowOpenSSLRequiredException(); return NULLPTR; } -int AesDecryptor::PlaintextLength(int ciphertext_len) const { +int32_t AesDecryptor::PlaintextLength(int32_t ciphertext_len) const { ThrowOpenSSLRequiredException(); return -1; } -int AesDecryptor::CiphertextLength(int plaintext_len) const { +int32_t AesDecryptor::CiphertextLength(int32_t plaintext_len) const { ThrowOpenSSLRequiredException(); return -1; } @@ -122,7 +123,7 @@ void QuickUpdatePageAad(int32_t new_page_ordinal, std::string* AAD) { ThrowOpenSSLRequiredException(); } -void RandBytes(unsigned char* buf, int num) { 
ThrowOpenSSLRequiredException(); } +void RandBytes(unsigned char* buf, size_t num) { ThrowOpenSSLRequiredException(); } void EnsureBackendInitialized() {} diff --git a/cpp/src/parquet/encryption/encryption_internal_test.cc b/cpp/src/parquet/encryption/encryption_internal_test.cc index 22e14663ea81f..bf6607e32877d 100644 --- a/cpp/src/parquet/encryption/encryption_internal_test.cc +++ b/cpp/src/parquet/encryption/encryption_internal_test.cc @@ -41,22 +41,22 @@ class TestAesEncryption : public ::testing::Test { encryptor.CiphertextLength(static_cast(plain_text_.size())); std::vector ciphertext(expected_ciphertext_len, '\0'); - int ciphertext_length = encryptor.Encrypt(str2span(plain_text_), str2span(key_), - str2span(aad_), ciphertext); + int32_t ciphertext_length = encryptor.Encrypt(str2span(plain_text_), str2span(key_), + str2span(aad_), ciphertext); ASSERT_EQ(ciphertext_length, expected_ciphertext_len); AesDecryptor decryptor(cipher_type, key_length_, metadata, write_length); - int expected_plaintext_length = decryptor.PlaintextLength(ciphertext_length); + int32_t expected_plaintext_length = decryptor.PlaintextLength(ciphertext_length); std::vector decrypted_text(expected_plaintext_length, '\0'); - int plaintext_length = + int32_t plaintext_length = decryptor.Decrypt(ciphertext, str2span(key_), str2span(aad_), decrypted_text); std::string decrypted_text_str(decrypted_text.begin(), decrypted_text.end()); - ASSERT_EQ(plaintext_length, static_cast(plain_text_.size())); + ASSERT_EQ(plaintext_length, static_cast(plain_text_.size())); ASSERT_EQ(plaintext_length, expected_plaintext_length); ASSERT_EQ(decrypted_text_str, plain_text_); } @@ -68,10 +68,10 @@ class TestAesEncryption : public ::testing::Test { AesDecryptor decryptor(cipher_type, key_length_, metadata, write_length); // Create ciphertext of all zeros, so the ciphertext length will be read as zero - const int ciphertext_length = 100; + constexpr int32_t ciphertext_length = 100; std::vector ciphertext(ciphertext_length, '\0'); - int expected_plaintext_length = decryptor.PlaintextLength(ciphertext_length); + int32_t expected_plaintext_length = decryptor.PlaintextLength(ciphertext_length); std::vector decrypted_text(expected_plaintext_length, '\0'); EXPECT_THROW( @@ -89,12 +89,12 @@ class TestAesEncryption : public ::testing::Test { encryptor.CiphertextLength(static_cast(plain_text_.size())); std::vector ciphertext(expected_ciphertext_len, '\0'); - int ciphertext_length = encryptor.Encrypt(str2span(plain_text_), str2span(key_), - str2span(aad_), ciphertext); + int32_t ciphertext_length = encryptor.Encrypt(str2span(plain_text_), str2span(key_), + str2span(aad_), ciphertext); AesDecryptor decryptor(cipher_type, key_length_, metadata, write_length); - int expected_plaintext_length = decryptor.PlaintextLength(ciphertext_length); + int32_t expected_plaintext_length = decryptor.PlaintextLength(ciphertext_length); std::vector decrypted_text(expected_plaintext_length, '\0'); ::arrow::util::span truncated_ciphertext(ciphertext.data(), @@ -105,7 +105,7 @@ class TestAesEncryption : public ::testing::Test { } private: - int key_length_ = 0; + int32_t key_length_ = 0; std::string key_; std::string aad_; std::string plain_text_; diff --git a/cpp/src/parquet/encryption/file_key_wrapper.cc b/cpp/src/parquet/encryption/file_key_wrapper.cc index 032ae45821a68..8ce563e60d752 100644 --- a/cpp/src/parquet/encryption/file_key_wrapper.cc +++ b/cpp/src/parquet/encryption/file_key_wrapper.cc @@ -112,10 +112,10 @@ std::string 
FileKeyWrapper::GetEncryptionKeyMetadata(const std::string& data_key KeyEncryptionKey FileKeyWrapper::CreateKeyEncryptionKey( const std::string& master_key_id) { std::string kek_bytes(kKeyEncryptionKeyLength, '\0'); - RandBytes(reinterpret_cast(&kek_bytes[0]), kKeyEncryptionKeyLength); + RandBytes(reinterpret_cast(kek_bytes.data()), kKeyEncryptionKeyLength); std::string kek_id(kKeyEncryptionKeyIdLength, '\0'); - RandBytes(reinterpret_cast(&kek_id[0]), kKeyEncryptionKeyIdLength); + RandBytes(reinterpret_cast(kek_id.data()), kKeyEncryptionKeyIdLength); // Encrypt KEK with Master key std::string encoded_wrapped_kek = kms_client_->WrapKey(kek_bytes, master_key_id); diff --git a/cpp/src/parquet/encryption/internal_file_decryptor.cc b/cpp/src/parquet/encryption/internal_file_decryptor.cc index fae5ce1f7a809..53a2f8c02168b 100644 --- a/cpp/src/parquet/encryption/internal_file_decryptor.cc +++ b/cpp/src/parquet/encryption/internal_file_decryptor.cc @@ -33,16 +33,16 @@ Decryptor::Decryptor(std::shared_ptr aes_decryptor, aad_(aad), pool_(pool) {} -int Decryptor::PlaintextLength(int ciphertext_len) const { +int32_t Decryptor::PlaintextLength(int32_t ciphertext_len) const { return aes_decryptor_->PlaintextLength(ciphertext_len); } -int Decryptor::CiphertextLength(int plaintext_len) const { +int32_t Decryptor::CiphertextLength(int32_t plaintext_len) const { return aes_decryptor_->CiphertextLength(plaintext_len); } -int Decryptor::Decrypt(::arrow::util::span ciphertext, - ::arrow::util::span plaintext) { +int32_t Decryptor::Decrypt(::arrow::util::span ciphertext, + ::arrow::util::span plaintext) { return aes_decryptor_->Decrypt(ciphertext, str2span(key_), str2span(aad_), plaintext); } @@ -143,7 +143,7 @@ std::shared_ptr InternalFileDecryptor::GetFooterDecryptor( // Create both data and metadata decryptors to avoid redundant retrieval of key // from the key_retriever. 
- int key_len = static_cast(footer_key.size()); + auto key_len = static_cast(footer_key.size()); std::shared_ptr aes_metadata_decryptor; std::shared_ptr aes_data_decryptor; @@ -197,7 +197,7 @@ std::shared_ptr InternalFileDecryptor::GetColumnDecryptor( throw HiddenColumnException("HiddenColumnException, path=" + column_path); } - int key_len = static_cast(column_key.size()); + auto key_len = static_cast(column_key.size()); std::lock_guard lock(mutex_); auto aes_decryptor = encryption::AesDecryptor::Make(algorithm_, key_len, metadata, &all_decryptors_); diff --git a/cpp/src/parquet/encryption/internal_file_decryptor.h b/cpp/src/parquet/encryption/internal_file_decryptor.h index 8af3587acf884..08423de7fe920 100644 --- a/cpp/src/parquet/encryption/internal_file_decryptor.h +++ b/cpp/src/parquet/encryption/internal_file_decryptor.h @@ -45,10 +45,10 @@ class PARQUET_EXPORT Decryptor { void UpdateAad(const std::string& aad) { aad_ = aad; } ::arrow::MemoryPool* pool() { return pool_; } - [[nodiscard]] int PlaintextLength(int ciphertext_len) const; - [[nodiscard]] int CiphertextLength(int plaintext_len) const; - int Decrypt(::arrow::util::span ciphertext, - ::arrow::util::span plaintext); + [[nodiscard]] int32_t PlaintextLength(int32_t ciphertext_len) const; + [[nodiscard]] int32_t CiphertextLength(int32_t plaintext_len) const; + int32_t Decrypt(::arrow::util::span ciphertext, + ::arrow::util::span plaintext); private: std::shared_ptr aes_decryptor_; diff --git a/cpp/src/parquet/encryption/internal_file_encryptor.cc b/cpp/src/parquet/encryption/internal_file_encryptor.cc index 285c2100be813..94094e6aca228 100644 --- a/cpp/src/parquet/encryption/internal_file_encryptor.cc +++ b/cpp/src/parquet/encryption/internal_file_encryptor.cc @@ -35,8 +35,8 @@ int32_t Encryptor::CiphertextLength(int64_t plaintext_len) const { return aes_encryptor_->CiphertextLength(plaintext_len); } -int Encryptor::Encrypt(::arrow::util::span plaintext, - ::arrow::util::span ciphertext) { +int32_t Encryptor::Encrypt(::arrow::util::span plaintext, + ::arrow::util::span ciphertext) { return aes_encryptor_->Encrypt(plaintext, str2span(key_), str2span(aad_), ciphertext); } @@ -143,7 +143,7 @@ InternalFileEncryptor::InternalFileEncryptor::GetColumnEncryptor( return encryptor; } -int InternalFileEncryptor::MapKeyLenToEncryptorArrayIndex(int key_len) const { +int InternalFileEncryptor::MapKeyLenToEncryptorArrayIndex(int32_t key_len) const { if (key_len == 16) return 0; else if (key_len == 24) @@ -155,7 +155,7 @@ int InternalFileEncryptor::MapKeyLenToEncryptorArrayIndex(int key_len) const { encryption::AesEncryptor* InternalFileEncryptor::GetMetaAesEncryptor( ParquetCipher::type algorithm, size_t key_size) { - int key_len = static_cast(key_size); + auto key_len = static_cast(key_size); int index = MapKeyLenToEncryptorArrayIndex(key_len); if (meta_encryptor_[index] == nullptr) { meta_encryptor_[index] = encryption::AesEncryptor::Make(algorithm, key_len, true); @@ -165,7 +165,7 @@ encryption::AesEncryptor* InternalFileEncryptor::GetMetaAesEncryptor( encryption::AesEncryptor* InternalFileEncryptor::GetDataAesEncryptor( ParquetCipher::type algorithm, size_t key_size) { - int key_len = static_cast(key_size); + auto key_len = static_cast(key_size); int index = MapKeyLenToEncryptorArrayIndex(key_len); if (data_encryptor_[index] == nullptr) { data_encryptor_[index] = encryption::AesEncryptor::Make(algorithm, key_len, false); diff --git a/cpp/src/parquet/encryption/internal_file_encryptor.h b/cpp/src/parquet/encryption/internal_file_encryptor.h 
index 91b6e9fe5aa2f..5a3d743ce5365 100644 --- a/cpp/src/parquet/encryption/internal_file_encryptor.h +++ b/cpp/src/parquet/encryption/internal_file_encryptor.h @@ -45,8 +45,8 @@ class PARQUET_EXPORT Encryptor { [[nodiscard]] int32_t CiphertextLength(int64_t plaintext_len) const; - int Encrypt(::arrow::util::span plaintext, - ::arrow::util::span ciphertext); + int32_t Encrypt(::arrow::util::span plaintext, + ::arrow::util::span ciphertext); bool EncryptColumnMetaData( bool encrypted_footer, @@ -103,7 +103,7 @@ class InternalFileEncryptor { encryption::AesEncryptor* GetDataAesEncryptor(ParquetCipher::type algorithm, size_t key_len); - int MapKeyLenToEncryptorArrayIndex(int key_len) const; + int MapKeyLenToEncryptorArrayIndex(int32_t key_len) const; }; } // namespace parquet diff --git a/cpp/src/parquet/encryption/key_toolkit_internal.cc b/cpp/src/parquet/encryption/key_toolkit_internal.cc index 5d7925aa0318f..89a52a2bcd632 100644 --- a/cpp/src/parquet/encryption/key_toolkit_internal.cc +++ b/cpp/src/parquet/encryption/key_toolkit_internal.cc @@ -53,7 +53,7 @@ std::string DecryptKeyLocally(const std::string& encoded_encrypted_key, static_cast(master_key.size()), false, false /*contains_length*/); - int decrypted_key_len = + int32_t decrypted_key_len = key_decryptor.PlaintextLength(static_cast(encrypted_key.size())); std::string decrypted_key(decrypted_key_len, '\0'); ::arrow::util::span decrypted_key_span( diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index 4f2aa6e37328c..423154f8641e5 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -751,7 +751,7 @@ class FileMetaData::FileMetaDataImpl { std::shared_ptr encrypted_buffer = AllocateBuffer( file_decryptor_->pool(), aes_encryptor->CiphertextLength(serialized_len)); - uint32_t encrypted_len = aes_encryptor->SignedFooterEncrypt( + int32_t encrypted_len = aes_encryptor->SignedFooterEncrypt( serialized_data_span, str2span(key), str2span(aad), nonce, encrypted_buffer->mutable_span_as()); // Delete AES encryptor object. It was created only to verify the footer signature. 
@@ -799,7 +799,7 @@ class FileMetaData::FileMetaDataImpl { // encrypt the footer key std::vector encrypted_data(encryptor->CiphertextLength(serialized_len)); - int encrypted_len = encryptor->Encrypt(serialized_data_span, encrypted_data); + int32_t encrypted_len = encryptor->Encrypt(serialized_data_span, encrypted_data); // write unencrypted footer PARQUET_THROW_NOT_OK(dst->Write(serialized_data, serialized_len)); @@ -1672,7 +1672,7 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { serialized_len); std::vector encrypted_data(encryptor->CiphertextLength(serialized_len)); - int encrypted_len = encryptor->Encrypt(serialized_data_span, encrypted_data); + int32_t encrypted_len = encryptor->Encrypt(serialized_data_span, encrypted_data); const char* temp = const_cast(reinterpret_cast(encrypted_data.data())); diff --git a/cpp/src/parquet/thrift_internal.h b/cpp/src/parquet/thrift_internal.h index b21b0e07afba2..e7bfd434c81a8 100644 --- a/cpp/src/parquet/thrift_internal.h +++ b/cpp/src/parquet/thrift_internal.h @@ -530,7 +530,7 @@ class ThriftSerializer { auto cipher_buffer = AllocateBuffer(encryptor->pool(), encryptor->CiphertextLength(out_length)); ::arrow::util::span out_span(out_buffer, out_length); - int cipher_buffer_len = + int32_t cipher_buffer_len = encryptor->Encrypt(out_span, cipher_buffer->mutable_span_as()); PARQUET_THROW_NOT_OK(out->Write(cipher_buffer->data(), cipher_buffer_len)); From 6a1d0520974355a749557c993841732d4fcf894c Mon Sep 17 00:00:00 2001 From: Devin Smith Date: Wed, 21 Aug 2024 18:12:45 -0700 Subject: [PATCH 009/186] GH-43717: [Java][FlightSQL] Add all ActionTypes to FlightSqlUtils.FLIGHT_SQL_ACTIONS (#43718) This adds all of the FlightSQL ActionTypes to FlightSqlUtils.FLIGHT_SQL_ACTIONS * GitHub Issue: #43717 Authored-by: Devin Smith Signed-off-by: David Li --- .../org/apache/arrow/flight/sql/FlightSqlUtils.java | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/java/flight/flight-sql/src/main/java/org/apache/arrow/flight/sql/FlightSqlUtils.java b/java/flight/flight-sql/src/main/java/org/apache/arrow/flight/sql/FlightSqlUtils.java index 9bb95047691ae..9e13e57d66c65 100644 --- a/java/flight/flight-sql/src/main/java/org/apache/arrow/flight/sql/FlightSqlUtils.java +++ b/java/flight/flight-sql/src/main/java/org/apache/arrow/flight/sql/FlightSqlUtils.java @@ -82,7 +82,15 @@ public final class FlightSqlUtils { + "Response Message: N/A"); public static final List FLIGHT_SQL_ACTIONS = - ImmutableList.of(FLIGHT_SQL_CREATE_PREPARED_STATEMENT, FLIGHT_SQL_CLOSE_PREPARED_STATEMENT); + ImmutableList.of( + FLIGHT_SQL_BEGIN_SAVEPOINT, + FLIGHT_SQL_BEGIN_TRANSACTION, + FLIGHT_SQL_CREATE_PREPARED_STATEMENT, + FLIGHT_SQL_CLOSE_PREPARED_STATEMENT, + FLIGHT_SQL_CREATE_PREPARED_SUBSTRAIT_PLAN, + FLIGHT_SQL_CANCEL_QUERY, + FLIGHT_SQL_END_SAVEPOINT, + FLIGHT_SQL_END_TRANSACTION); /** * Helper to parse {@link com.google.protobuf.Any} objects to the specific protobuf object. From 2e83aa63d95a6fa380efdd5e5cb720a3154f9c93 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 22 Aug 2024 09:57:02 +0200 Subject: [PATCH 010/186] GH-43690: [Python][CI] Simplify python/requirements-wheel-test.txt file (#43691) ### Rationale for this change The current [requirements-wheel-test.txt](https://github.com/apache/arrow/blob/7c8909a144f2e8d593dc8ad363ac95b2865b04ca/python/requirements-wheel-test.txt) file has quite complex and detailed version pinning, varying per architecture. 
I think this can be simplified because we just want to test with some older version of numpy and pandas (and the exact version isn't that important). * GitHub Issue: #43690 Authored-by: Joris Van den Bossche Signed-off-by: Joris Van den Bossche --- python/requirements-wheel-test.txt | 26 ++++++++------------------ 1 file changed, 8 insertions(+), 18 deletions(-) diff --git a/python/requirements-wheel-test.txt b/python/requirements-wheel-test.txt index 46bedc13ba1a7..c7ff63e339575 100644 --- a/python/requirements-wheel-test.txt +++ b/python/requirements-wheel-test.txt @@ -5,22 +5,12 @@ pytest pytz tzdata; sys_platform == 'win32' -numpy==1.21.3; platform_system == "Linux" and platform_machine == "aarch64" and python_version < "3.11" -numpy==1.23.4; python_version == "3.11" -numpy==1.26.0; python_version >= "3.12" -numpy==1.19.5; platform_system == "Linux" and platform_machine != "aarch64" and python_version < "3.9" -numpy==1.21.3; platform_system == "Linux" and platform_machine != "aarch64" and python_version >= "3.9" and python_version < "3.11" -numpy==1.21.3; platform_system == "Darwin" and platform_machine == "arm64" and python_version < "3.11" -numpy==1.19.5; platform_system == "Darwin" and platform_machine != "arm64" and python_version < "3.9" -numpy==1.21.3; platform_system == "Darwin" and platform_machine != "arm64" and python_version >= "3.9" and python_version < "3.11" -numpy==1.19.5; platform_system == "Windows" and python_version < "3.9" -numpy==1.21.3; platform_system == "Windows" and python_version >= "3.9" and python_version < "3.11" +# We generally test with the oldest numpy version that supports a given Python +# version. However, there is no need to make this strictly the oldest version, +# so it can be broadened to have a single version specification across platforms. +# (`~=x.y.z` specifies a compatible release as `>=x.y.z, == x.y.*`) +numpy~=1.21.3; python_version < "3.11" +numpy~=1.23.2; python_version == "3.11" +numpy~=1.26.0; python_version == "3.12" -pandas<1.1.0; platform_system == "Linux" and platform_machine != "aarch64" and python_version < "3.8" -pandas; platform_system == "Linux" and platform_machine != "aarch64" and python_version >= "3.8" -pandas; platform_system == "Linux" and platform_machine == "aarch64" -pandas<1.1.0; platform_system == "Darwin" and platform_machine != "arm64" and python_version < "3.8" -pandas; platform_system == "Darwin" and platform_machine != "arm64" and python_version >= "3.8" -pandas; platform_system == "Darwin" and platform_machine == "arm64" -pandas<1.1.0; platform_system == "Windows" and python_version < "3.8" -pandas; platform_system == "Windows" and python_version >= "3.8" +pandas From fc54eadb72791288fc9681bbcc6c8a9d8d6fff1d Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 22 Aug 2024 11:28:01 +0200 Subject: [PATCH 011/186] GH-43785: [Python][CI] Correct PARQUET_TEST_DATA path in wheel tests (#43786) ### Rationale for this change Starting with https://github.com/apache/arrow/pull/41580, the pyarrow tests now also rely on a file in the parquet-testing submodule. And the path to that directory is controlled by `PARQUET_TEST_DATA`, which appears to be set wrongly in the wheel test scripts, causing all wheel builds to fail at the moment. 
* GitHub Issue: #43785 Authored-by: Joris Van den Bossche Signed-off-by: Joris Van den Bossche --- ci/scripts/python_wheel_unix_test.sh | 2 +- ci/scripts/python_wheel_windows_test.bat | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ci/scripts/python_wheel_unix_test.sh b/ci/scripts/python_wheel_unix_test.sh index a25e5c51bddbc..cf87a17056783 100755 --- a/ci/scripts/python_wheel_unix_test.sh +++ b/ci/scripts/python_wheel_unix_test.sh @@ -54,7 +54,7 @@ export PYARROW_TEST_S3=${ARROW_S3} export PYARROW_TEST_TENSORFLOW=ON export ARROW_TEST_DATA=${source_dir}/testing/data -export PARQUET_TEST_DATA=${source_dir}/submodules/parquet-testing/data +export PARQUET_TEST_DATA=${source_dir}/cpp/submodules/parquet-testing/data if [ "${INSTALL_PYARROW}" == "ON" ]; then # Install the built wheels diff --git a/ci/scripts/python_wheel_windows_test.bat b/ci/scripts/python_wheel_windows_test.bat index a928c3571d0cb..87c0bb1252024 100755 --- a/ci/scripts/python_wheel_windows_test.bat +++ b/ci/scripts/python_wheel_windows_test.bat @@ -35,7 +35,7 @@ set PYARROW_TEST_TENSORFLOW=ON @REM set PYARROW_TEST_PANDAS=ON set ARROW_TEST_DATA=C:\arrow\testing\data -set PARQUET_TEST_DATA=C:\arrow\submodules\parquet-testing\data +set PARQUET_TEST_DATA=C:\arrow\cpp\submodules\parquet-testing\data @REM Install testing dependencies pip install -r C:\arrow\python\requirements-wheel-test.txt || exit /B 1 From b4f7efe5bdc2218bb595b130b4f65237caecfa76 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Thu, 22 Aug 2024 14:45:00 +0200 Subject: [PATCH 012/186] GH-43787: [C++] Register the new Opaque extension type by default (#43788) This is to resolve #43787 > The Opaque extension type implementation for C++ (plus python bindings) was added in https://github.com/apache/arrow/pull/43458, but it was not registered by default, which we should do for canonical extension types (see https://github.com/apache/arrow/pull/43458#issuecomment-2302551404) Additionally, this makes the `bool8` extension type build with `ARROW_JSON=false`, as discussed [here](https://github.com/apache/arrow/commit/525881987d0b9b4f464c3e3593a9a7b4e3c767d0#r145613657) ### Rationale for this change Canonical types should be registered by default if possible (except e.g. if they can't be compiled due to `ARROW_JSON=false`). ### What changes are included in this PR? This adds default registration for `opaque`, changes when `bool8` is built, and moves all canonical tests under the same test target. ### Are these changes tested? Changes are tested by previously existing tests. ### Are there any user-facing changes? `opaque` will now be registered by default and `bool8` will be present even when building with `ARROW_JSON=false`.
* GitHub Issue: #43787 Authored-by: Rok Mihevc Signed-off-by: Rok Mihevc --- cpp/src/arrow/CMakeLists.txt | 2 +- cpp/src/arrow/extension/CMakeLists.txt | 18 ++++++----------- cpp/src/arrow/extension/bool8.h | 2 ++ cpp/src/arrow/extension/bool8_test.cc | 1 - cpp/src/arrow/extension/fixed_shape_tensor.h | 2 ++ cpp/src/arrow/extension/opaque.h | 2 ++ cpp/src/arrow/extension/opaque_test.cc | 2 -- cpp/src/arrow/extension_type.cc | 21 ++++++++++++-------- python/pyarrow/tests/test_extension_type.py | 5 ++--- 9 files changed, 28 insertions(+), 27 deletions(-) diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index fb7253b6fd69d..89f28ee416ede 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -374,6 +374,7 @@ set(ARROW_SRCS datum.cc device.cc extension_type.cc + extension/bool8.cc pretty_print.cc record_batch.cc result.cc @@ -906,7 +907,6 @@ endif() if(ARROW_JSON) arrow_add_object_library(ARROW_JSON - extension/bool8.cc extension/fixed_shape_tensor.cc extension/opaque.cc json/options.cc diff --git a/cpp/src/arrow/extension/CMakeLists.txt b/cpp/src/arrow/extension/CMakeLists.txt index fcd5fa529ab56..5cb4bc77af2a4 100644 --- a/cpp/src/arrow/extension/CMakeLists.txt +++ b/cpp/src/arrow/extension/CMakeLists.txt @@ -15,22 +15,16 @@ # specific language governing permissions and limitations # under the License. -add_arrow_test(test - SOURCES - bool8_test.cc - PREFIX - "arrow-extension-bool8") +set(CANONICAL_EXTENSION_TESTS bool8_test.cc) -add_arrow_test(test - SOURCES - fixed_shape_tensor_test.cc - PREFIX - "arrow-fixed-shape-tensor") +if(ARROW_JSON) + list(APPEND CANONICAL_EXTENSION_TESTS fixed_shape_tensor_test.cc opaque_test.cc) +endif() add_arrow_test(test SOURCES - opaque_test.cc + ${CANONICAL_EXTENSION_TESTS} PREFIX - "arrow-extension-opaque") + "arrow-canonical-extensions") arrow_install_all_headers("arrow/extension") diff --git a/cpp/src/arrow/extension/bool8.h b/cpp/src/arrow/extension/bool8.h index 02e629b28a867..fbb507639e272 100644 --- a/cpp/src/arrow/extension/bool8.h +++ b/cpp/src/arrow/extension/bool8.h @@ -15,6 +15,8 @@ // specific language governing permissions and limitations // under the License. +#pragma once + #include "arrow/extension_type.h" namespace arrow::extension { diff --git a/cpp/src/arrow/extension/bool8_test.cc b/cpp/src/arrow/extension/bool8_test.cc index eabcfcf62d32c..ee77332bc3257 100644 --- a/cpp/src/arrow/extension/bool8_test.cc +++ b/cpp/src/arrow/extension/bool8_test.cc @@ -19,7 +19,6 @@ #include "arrow/io/memory.h" #include "arrow/ipc/reader.h" #include "arrow/ipc/writer.h" -#include "arrow/testing/extension_type.h" #include "arrow/testing/gtest_util.h" namespace arrow { diff --git a/cpp/src/arrow/extension/fixed_shape_tensor.h b/cpp/src/arrow/extension/fixed_shape_tensor.h index 20ec20a64c2d4..80a602021c60b 100644 --- a/cpp/src/arrow/extension/fixed_shape_tensor.h +++ b/cpp/src/arrow/extension/fixed_shape_tensor.h @@ -15,6 +15,8 @@ // specific language governing permissions and limitations // under the License. +#pragma once + #include "arrow/extension_type.h" namespace arrow { diff --git a/cpp/src/arrow/extension/opaque.h b/cpp/src/arrow/extension/opaque.h index 9814b391cbad6..5d3411798f88d 100644 --- a/cpp/src/arrow/extension/opaque.h +++ b/cpp/src/arrow/extension/opaque.h @@ -15,6 +15,8 @@ // specific language governing permissions and limitations // under the License. 
+#pragma once + #include "arrow/extension_type.h" #include "arrow/type.h" diff --git a/cpp/src/arrow/extension/opaque_test.cc b/cpp/src/arrow/extension/opaque_test.cc index 1629cdb39651c..16fcba3fa6bb0 100644 --- a/cpp/src/arrow/extension/opaque_test.cc +++ b/cpp/src/arrow/extension/opaque_test.cc @@ -25,7 +25,6 @@ #include "arrow/ipc/reader.h" #include "arrow/ipc/writer.h" #include "arrow/record_batch.h" -#include "arrow/testing/extension_type.h" #include "arrow/testing/gtest_util.h" #include "arrow/type_fwd.h" #include "arrow/util/checked_cast.h" @@ -169,7 +168,6 @@ TEST(OpaqueType, MetadataRoundTrip) { TEST(OpaqueType, BatchRoundTrip) { auto type = internal::checked_pointer_cast( extension::opaque(binary(), "geometry", "adbc.postgresql")); - ExtensionTypeGuard guard(type); auto storage = ArrayFromJSON(binary(), R"(["foobar", null])"); auto array = ExtensionType::WrapArray(type, storage); diff --git a/cpp/src/arrow/extension_type.cc b/cpp/src/arrow/extension_type.cc index 685018f7de7b8..83c7ebed4f319 100644 --- a/cpp/src/arrow/extension_type.cc +++ b/cpp/src/arrow/extension_type.cc @@ -27,9 +27,10 @@ #include "arrow/array/util.h" #include "arrow/chunked_array.h" #include "arrow/config.h" -#ifdef ARROW_JSON #include "arrow/extension/bool8.h" +#ifdef ARROW_JSON #include "arrow/extension/fixed_shape_tensor.h" +#include "arrow/extension/opaque.h" #endif #include "arrow/status.h" #include "arrow/type.h" @@ -143,17 +144,21 @@ static std::once_flag registry_initialized; namespace internal { static void CreateGlobalRegistry() { + // Register canonical extension types + g_registry = std::make_shared(); + std::vector> ext_types{extension::bool8()}; #ifdef ARROW_JSON - // Register canonical extension types - auto fst_ext_type = - checked_pointer_cast(extension::fixed_shape_tensor(int64(), {})); - ARROW_CHECK_OK(g_registry->RegisterType(fst_ext_type)); - - auto bool8_ext_type = checked_pointer_cast(extension::bool8()); - ARROW_CHECK_OK(g_registry->RegisterType(bool8_ext_type)); + ext_types.push_back(extension::fixed_shape_tensor(int64(), {})); + ext_types.push_back(extension::opaque(null(), "", "")); #endif + + // Register canonical extension types + for (const auto& ext_type : ext_types) { + ARROW_CHECK_OK( + g_registry->RegisterType(checked_pointer_cast(ext_type))); + } } } // namespace internal diff --git a/python/pyarrow/tests/test_extension_type.py b/python/pyarrow/tests/test_extension_type.py index b04ee85ec99ad..0d50c467e96bd 100644 --- a/python/pyarrow/tests/test_extension_type.py +++ b/python/pyarrow/tests/test_extension_type.py @@ -1693,9 +1693,8 @@ def test_opaque_type(pickle_module, storage_type, storage): arr = pa.ExtensionArray.from_storage(opaque_type, storage) assert isinstance(arr, opaque_arr_class) - with registered_extension_type(opaque_type): - buf = ipc_write_batch(pa.RecordBatch.from_arrays([arr], ["ext"])) - batch = ipc_read_batch(buf) + buf = ipc_write_batch(pa.RecordBatch.from_arrays([arr], ["ext"])) + batch = ipc_read_batch(buf) assert batch.column(0).type.extension_name == "arrow.opaque" assert isinstance(batch.column(0), opaque_arr_class) From 3e9384bbf4162ea060e867a753bce464b31e5e1c Mon Sep 17 00:00:00 2001 From: Lysandros Nikolaou Date: Thu, 22 Aug 2024 15:27:40 +0200 Subject: [PATCH 013/186] GH-43519: [Python] Set up wheel building for Python 3.13 (#43539) ### Rationale for this change Like #43519 mentions, now that the first `rc` is out, it's probably time to add CI coverage for Python 3.13 (and also start building wheels). ### What changes are included in this PR?
I'm fairly new to the build/CI processes of the project, but I tried to follow the same template as #37901. I'll follow up afterwards with adding CI coverage for the free-threaded build as well. * GitHub Issue: #43519 Lead-authored-by: Lysandros Nikolaou Co-authored-by: Joris Van den Bossche Signed-off-by: Joris Van den Bossche --- .env | 2 +- ci/docker/python-wheel-manylinux-test.dockerfile | 7 ++++--- ci/docker/python-wheel-manylinux.dockerfile | 2 +- .../python-wheel-windows-test-vs2019.dockerfile | 7 ++++--- ci/docker/python-wheel-windows-vs2019.dockerfile | 7 ++++--- ci/scripts/install_gcs_testbench.sh | 10 +++++++--- ci/scripts/install_python.sh | 14 +++++++++++--- ci/scripts/python_wheel_macos_build.sh | 2 -- dev/release/verify-release-candidate.sh | 6 +++--- dev/tasks/python-wheels/github.linux.yml | 5 +++++ dev/tasks/python-wheels/github.osx.yml | 2 +- dev/tasks/tasks.yml | 3 ++- docker-compose.yml | 9 ++++++--- python/pyproject.toml | 1 + python/requirements-wheel-build.txt | 5 +++++ python/requirements-wheel-test.txt | 7 +++++++ 16 files changed, 62 insertions(+), 27 deletions(-) diff --git a/.env b/.env index 1358aafe824a6..21f904c3208f6 100644 --- a/.env +++ b/.env @@ -95,7 +95,7 @@ VCPKG="943c5ef1c8f6b5e6ced092b242c8299caae2ff01" # 2024.04.26 Release # ci/docker/python-wheel-windows-vs2019.dockerfile. # This is a workaround for our CI problem that "archery docker build" doesn't # use pulled built images in dev/tasks/python-wheels/github.windows.yml. -PYTHON_WHEEL_WINDOWS_IMAGE_REVISION=2024-06-18 +PYTHON_WHEEL_WINDOWS_IMAGE_REVISION=2024-08-06 # Use conanio/${CONAN_BASE}:{CONAN_VERSION} for "docker-compose run --rm conan". # See https://github.com/conan-io/conan-docker-tools#readme and diff --git a/ci/docker/python-wheel-manylinux-test.dockerfile b/ci/docker/python-wheel-manylinux-test.dockerfile index cdd0ae3ced756..443ff9c53cbcb 100644 --- a/ci/docker/python-wheel-manylinux-test.dockerfile +++ b/ci/docker/python-wheel-manylinux-test.dockerfile @@ -16,8 +16,8 @@ # under the License. 
ARG arch -ARG python -FROM ${arch}/python:${python} +ARG python_image_tag +FROM ${arch}/python:${python_image_tag} # RUN pip install --upgrade pip @@ -27,4 +27,5 @@ COPY python/requirements-wheel-test.txt /arrow/python/ RUN pip install -r /arrow/python/requirements-wheel-test.txt COPY ci/scripts/install_gcs_testbench.sh /arrow/ci/scripts/ -RUN PYTHON=python /arrow/ci/scripts/install_gcs_testbench.sh default +ARG python +RUN PYTHON_VERSION=${python} /arrow/ci/scripts/install_gcs_testbench.sh default diff --git a/ci/docker/python-wheel-manylinux.dockerfile b/ci/docker/python-wheel-manylinux.dockerfile index cb39667af1e10..42f088fd8a22a 100644 --- a/ci/docker/python-wheel-manylinux.dockerfile +++ b/ci/docker/python-wheel-manylinux.dockerfile @@ -103,7 +103,7 @@ RUN vcpkg install \ # Configure Python for applications running in the bash shell of this Dockerfile ARG python=3.8 ENV PYTHON_VERSION=${python} -RUN PYTHON_ROOT=$(find /opt/python -name cp${PYTHON_VERSION/./}-*) && \ +RUN PYTHON_ROOT=$(find /opt/python -name cp${PYTHON_VERSION/./}-cp${PYTHON_VERSION/./}) && \ echo "export PATH=$PYTHON_ROOT/bin:\$PATH" >> /etc/profile.d/python.sh SHELL ["/bin/bash", "-i", "-c"] diff --git a/ci/docker/python-wheel-windows-test-vs2019.dockerfile b/ci/docker/python-wheel-windows-test-vs2019.dockerfile index 32bbb55e82689..5f488a4c285ff 100644 --- a/ci/docker/python-wheel-windows-test-vs2019.dockerfile +++ b/ci/docker/python-wheel-windows-test-vs2019.dockerfile @@ -40,10 +40,11 @@ ARG python=3.8 RUN (if "%python%"=="3.8" setx PYTHON_VERSION "3.8.10" && setx PATH "%PATH%;C:\Python38;C:\Python38\Scripts") & \ (if "%python%"=="3.9" setx PYTHON_VERSION "3.9.13" && setx PATH "%PATH%;C:\Python39;C:\Python39\Scripts") & \ (if "%python%"=="3.10" setx PYTHON_VERSION "3.10.11" && setx PATH "%PATH%;C:\Python310;C:\Python310\Scripts") & \ - (if "%python%"=="3.11" setx PYTHON_VERSION "3.11.5" && setx PATH "%PATH%;C:\Python311;C:\Python311\Scripts") & \ - (if "%python%"=="3.12" setx PYTHON_VERSION "3.12.0" && setx PATH "%PATH%;C:\Python312;C:\Python312\Scripts") + (if "%python%"=="3.11" setx PYTHON_VERSION "3.11.9" && setx PATH "%PATH%;C:\Python311;C:\Python311\Scripts") & \ + (if "%python%"=="3.12" setx PYTHON_VERSION "3.12.4" && setx PATH "%PATH%;C:\Python312;C:\Python312\Scripts") & \ + (if "%python%"=="3.13" setx PYTHON_VERSION "3.13.0-rc1" && setx PATH "%PATH%;C:\Python313;C:\Python313\Scripts") # Install archiver to extract xz archives -RUN choco install -r -y --no-progress python --version=%PYTHON_VERSION% & \ +RUN choco install -r -y --pre --no-progress python --version=%PYTHON_VERSION% & \ python -m pip install --no-cache-dir -U pip setuptools & \ choco install --no-progress -r -y archiver diff --git a/ci/docker/python-wheel-windows-vs2019.dockerfile b/ci/docker/python-wheel-windows-vs2019.dockerfile index ff42de939d91f..5a17e3e4c52c2 100644 --- a/ci/docker/python-wheel-windows-vs2019.dockerfile +++ b/ci/docker/python-wheel-windows-vs2019.dockerfile @@ -83,9 +83,10 @@ ARG python=3.8 RUN (if "%python%"=="3.8" setx PYTHON_VERSION "3.8.10" && setx PATH "%PATH%;C:\Python38;C:\Python38\Scripts") & \ (if "%python%"=="3.9" setx PYTHON_VERSION "3.9.13" && setx PATH "%PATH%;C:\Python39;C:\Python39\Scripts") & \ (if "%python%"=="3.10" setx PYTHON_VERSION "3.10.11" && setx PATH "%PATH%;C:\Python310;C:\Python310\Scripts") & \ - (if "%python%"=="3.11" setx PYTHON_VERSION "3.11.5" && setx PATH "%PATH%;C:\Python311;C:\Python311\Scripts") & \ - (if "%python%"=="3.12" setx PYTHON_VERSION "3.12.0" && setx PATH 
"%PATH%;C:\Python312;C:\Python312\Scripts") -RUN choco install -r -y --no-progress python --version=%PYTHON_VERSION% + (if "%python%"=="3.11" setx PYTHON_VERSION "3.11.9" && setx PATH "%PATH%;C:\Python311;C:\Python311\Scripts") & \ + (if "%python%"=="3.12" setx PYTHON_VERSION "3.12.4" && setx PATH "%PATH%;C:\Python312;C:\Python312\Scripts") & \ + (if "%python%"=="3.13" setx PYTHON_VERSION "3.13.0-rc1" && setx PATH "%PATH%;C:\Python313;C:\Python313\Scripts") +RUN choco install -r -y --pre --no-progress python --version=%PYTHON_VERSION% RUN python -m pip install -U pip setuptools COPY python/requirements-wheel-build.txt arrow/python/ diff --git a/ci/scripts/install_gcs_testbench.sh b/ci/scripts/install_gcs_testbench.sh index 2090290c99322..5471b3cc238ca 100755 --- a/ci/scripts/install_gcs_testbench.sh +++ b/ci/scripts/install_gcs_testbench.sh @@ -41,8 +41,12 @@ version=$1 if [[ "${version}" -eq "default" ]]; then version="v0.39.0" # Latests versions of Testbench require newer setuptools - ${PYTHON:-python3} -m pip install --upgrade setuptools + python3 -m pip install --upgrade setuptools fi -${PYTHON:-python3} -m pip install \ - "https://github.com/googleapis/storage-testbench/archive/${version}.tar.gz" +# This script is run with PYTHON undefined in some places, +# but those only use older pythons. +if [[ -z "${PYTHON_VERSION}" ]] || [[ "${PYTHON_VERSION}" != "3.13" ]]; then + python3 -m pip install \ + "https://github.com/googleapis/storage-testbench/archive/${version}.tar.gz" +fi diff --git a/ci/scripts/install_python.sh b/ci/scripts/install_python.sh index 5f962f02b911b..42d0e9ca179fb 100755 --- a/ci/scripts/install_python.sh +++ b/ci/scripts/install_python.sh @@ -28,8 +28,9 @@ declare -A versions versions=([3.8]=3.8.10 [3.9]=3.9.13 [3.10]=3.10.11 - [3.11]=3.11.5 - [3.12]=3.12.0) + [3.11]=3.11.9 + [3.12]=3.12.4 + [3.13]=3.13.0) if [ "$#" -ne 2 ]; then echo "Usage: $0 " @@ -46,7 +47,14 @@ full_version=${versions[$2]} if [ $platform = "macOS" ]; then echo "Downloading Python installer..." 
- if [ "$(uname -m)" = "arm64" ] || [ "$version" = "3.10" ] || [ "$version" = "3.11" ] || [ "$version" = "3.12" ]; then + if [ "$version" = "3.13" ]; + then + fname="python-${full_version}rc1-macos11.pkg" + elif [ "$(uname -m)" = "arm64" ] || \ + [ "$version" = "3.10" ] || \ + [ "$version" = "3.11" ] || \ + [ "$version" = "3.12" ]; + then fname="python-${full_version}-macos11.pkg" else fname="python-${full_version}-macosx10.9.pkg" diff --git a/ci/scripts/python_wheel_macos_build.sh b/ci/scripts/python_wheel_macos_build.sh index 3ed9d5d8dd12f..d5430f26748eb 100755 --- a/ci/scripts/python_wheel_macos_build.sh +++ b/ci/scripts/python_wheel_macos_build.sh @@ -48,13 +48,11 @@ fi echo "=== (${PYTHON_VERSION}) Install Python build dependencies ===" export PIP_SITE_PACKAGES=$(python -c 'import site; print(site.getsitepackages()[0])') -export PIP_TARGET_PLATFORM="macosx_${MACOSX_DEPLOYMENT_TARGET//./_}_${arch}" pip install \ --upgrade \ --only-binary=:all: \ --target $PIP_SITE_PACKAGES \ - --platform $PIP_TARGET_PLATFORM \ -r ${source_dir}/python/requirements-wheel-build.txt pip install "delocate>=0.10.3" diff --git a/dev/release/verify-release-candidate.sh b/dev/release/verify-release-candidate.sh index 6a36109dc2fc1..07e765a759ea0 100755 --- a/dev/release/verify-release-candidate.sh +++ b/dev/release/verify-release-candidate.sh @@ -1146,7 +1146,7 @@ test_linux_wheels() { local arch="x86_64" fi - local python_versions="${TEST_PYTHON_VERSIONS:-3.8 3.9 3.10 3.11 3.12}" + local python_versions="${TEST_PYTHON_VERSIONS:-3.8 3.9 3.10 3.11 3.12 3.13}" local platform_tags="${TEST_WHEEL_PLATFORM_TAGS:-manylinux_2_17_${arch}.manylinux2014_${arch} manylinux_2_28_${arch}}" for python in ${python_versions}; do @@ -1170,11 +1170,11 @@ test_macos_wheels() { # apple silicon processor if [ "$(uname -m)" = "arm64" ]; then - local python_versions="3.8 3.9 3.10 3.11 3.12" + local python_versions="3.8 3.9 3.10 3.11 3.12 3.13" local platform_tags="macosx_11_0_arm64" local check_flight=OFF else - local python_versions="3.8 3.9 3.10 3.11 3.12" + local python_versions="3.8 3.9 3.10 3.11 3.12 3.13" local platform_tags="macosx_10_15_x86_64" fi diff --git a/dev/tasks/python-wheels/github.linux.yml b/dev/tasks/python-wheels/github.linux.yml index 968c5da21897b..2854d4349fb7c 100644 --- a/dev/tasks/python-wheels/github.linux.yml +++ b/dev/tasks/python-wheels/github.linux.yml @@ -36,6 +36,11 @@ jobs: ARCHERY_USE_LEGACY_DOCKER_COMPOSE: 1 {% endif %} PYTHON: "{{ python_version }}" + {% if python_version == "3.13" %} + PYTHON_IMAGE_TAG: "3.13-rc" + {% else %} + PYTHON_IMAGE_TAG: "{{ python_version }}" + {% endif %} steps: {{ macros.github_checkout_arrow()|indent }} diff --git a/dev/tasks/python-wheels/github.osx.yml b/dev/tasks/python-wheels/github.osx.yml index 8ceb468af89dd..b26aeba32b79b 100644 --- a/dev/tasks/python-wheels/github.osx.yml +++ b/dev/tasks/python-wheels/github.osx.yml @@ -121,7 +121,7 @@ jobs: source test-env/bin/activate pip install --upgrade pip wheel arch -{{ arch }} pip install -r arrow/python/requirements-wheel-test.txt - PYTHON=python arch -{{ arch }} arrow/ci/scripts/install_gcs_testbench.sh default + PYTHON_VERSION={{ python_version }} arch -{{ arch }} arrow/ci/scripts/install_gcs_testbench.sh default arch -{{ arch }} arrow/ci/scripts/python_wheel_unix_test.sh $(pwd)/arrow {{ macros.github_upload_releases("arrow/python/repaired_wheels/*.whl")|indent }} diff --git a/dev/tasks/tasks.yml b/dev/tasks/tasks.yml index fe02fe9ce68b2..60114d6930878 100644 --- a/dev/tasks/tasks.yml +++ b/dev/tasks/tasks.yml @@ 
-389,7 +389,8 @@ tasks: ("3.9", "cp39", "cp39"), ("3.10", "cp310", "cp310"), ("3.11", "cp311", "cp311"), - ("3.12", "cp312", "cp312")] %} + ("3.12", "cp312", "cp312"), + ("3.13", "cp313", "cp313")] %} {############################## Wheel Linux ##################################} diff --git a/docker-compose.yml b/docker-compose.yml index 14eeeeee6e5ef..3045cf015bc26 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1096,9 +1096,10 @@ services: args: arch: ${ARCH} arch_short: ${ARCH_SHORT} - base: quay.io/pypa/manylinux2014_${ARCH_ALIAS}:2024-02-04-ea37246 + base: quay.io/pypa/manylinux2014_${ARCH_ALIAS}:2024-08-03-32dfa47 vcpkg: ${VCPKG} python: ${PYTHON} + python_image_tag: ${PYTHON_IMAGE_TAG} manylinux: 2014 context: . dockerfile: ci/docker/python-wheel-manylinux.dockerfile @@ -1119,9 +1120,10 @@ services: args: arch: ${ARCH} arch_short: ${ARCH_SHORT} - base: quay.io/pypa/manylinux_2_28_${ARCH_ALIAS}:2024-02-04-ea37246 + base: quay.io/pypa/manylinux_2_28_${ARCH_ALIAS}:2024-08-03-32dfa47 vcpkg: ${VCPKG} python: ${PYTHON} + python_image_tag: ${PYTHON_IMAGE_TAG} manylinux: 2_28 context: . dockerfile: ci/docker/python-wheel-manylinux.dockerfile @@ -1135,7 +1137,7 @@ services: command: /arrow/ci/scripts/python_wheel_manylinux_build.sh python-wheel-manylinux-test-imports: - image: ${ARCH}/python:${PYTHON} + image: ${ARCH}/python:${PYTHON_IMAGE_TAG} shm_size: 2G volumes: - .:/arrow:delegated @@ -1151,6 +1153,7 @@ services: args: arch: ${ARCH} python: ${PYTHON} + python_image_tag: ${PYTHON_IMAGE_TAG} context: . dockerfile: ci/docker/python-wheel-manylinux-test.dockerfile cache_from: diff --git a/python/pyproject.toml b/python/pyproject.toml index d863bb3e5f0ac..8ece65dd467bb 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -48,6 +48,7 @@ classifiers = [ 'Programming Language :: Python :: 3.10', 'Programming Language :: Python :: 3.11', 'Programming Language :: Python :: 3.12', + 'Programming Language :: Python :: 3.13', ] maintainers = [ {name = "Apache Arrow Developers", email = "dev@arrow.apache.org"} diff --git a/python/requirements-wheel-build.txt b/python/requirements-wheel-build.txt index faa078d3d7fe7..2d448004768ce 100644 --- a/python/requirements-wheel-build.txt +++ b/python/requirements-wheel-build.txt @@ -1,3 +1,8 @@ +# Remove pre and extra index url once there's NumPy and Cython wheels for 3.13 +# on PyPI +--pre +--extra-index-url "https://pypi.anaconda.org/scientific-python-nightly-wheels/simple" + cython>=0.29.31 oldest-supported-numpy>=0.14; python_version<'3.9' numpy>=2.0.0; python_version>='3.9' diff --git a/python/requirements-wheel-test.txt b/python/requirements-wheel-test.txt index c7ff63e339575..98ec2bd4fd4e4 100644 --- a/python/requirements-wheel-test.txt +++ b/python/requirements-wheel-test.txt @@ -1,3 +1,9 @@ +# Remove pre and extra index url once there's NumPy and Cython wheels for 3.13 +# on PyPI +--pre +--prefer-binary +--extra-index-url "https://pypi.anaconda.org/scientific-python-nightly-wheels/simple" + cffi cython hypothesis @@ -12,5 +18,6 @@ tzdata; sys_platform == 'win32' numpy~=1.21.3; python_version < "3.11" numpy~=1.23.2; python_version == "3.11" numpy~=1.26.0; python_version == "3.12" +numpy~=2.1.0; python_version >= "3.13" pandas From 88d57cf41fde20adf14adca02e02d2cb92c83443 Mon Sep 17 00:00:00 2001 From: Jonathan Keane Date: Thu, 22 Aug 2024 08:45:19 -0500 Subject: [PATCH 014/186] MINOR: [CI][R] Undo #43636 now that the action is approved (#43730) Undo the pinning in #43636 now that INFRA has approved the quarto-dev action 
Authored-by: Jonathan Keane Signed-off-by: Antoine Pitrou --- .github/workflows/r.yml | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/.github/workflows/r.yml b/.github/workflows/r.yml index bf7eb99e7e990..2820d42470bca 100644 --- a/.github/workflows/r.yml +++ b/.github/workflows/r.yml @@ -86,19 +86,18 @@ jobs: run: | sudo apt-get install devscripts - # replace the SHA with v2 once INFRA-26031 is resolved - - uses: r-lib/actions/setup-r@732fb28088814627972f1ccbacc02561178cf391 + - uses: r-lib/actions/setup-r@v2 with: use-public-rspm: true install-r: false - - uses: r-lib/actions/setup-r-dependencies@732fb28088814627972f1ccbacc02561178cf391 + - uses: r-lib/actions/setup-r-dependencies@v2 with: extra-packages: any::rcmdcheck needs: check working-directory: src/r - - uses: r-lib/actions/check-r-package@732fb28088814627972f1ccbacc02561178cf391 + - uses: r-lib/actions/check-r-package@v2 with: working-directory: src/r env: @@ -341,11 +340,11 @@ jobs: cd r/windows ls *.zip | xargs -n 1 unzip -uo rm -rf *.zip - - uses: r-lib/actions/setup-r@732fb28088814627972f1ccbacc02561178cf391 + - uses: r-lib/actions/setup-r@v2 with: r-version: ${{ matrix.config.rversion }} Ncpus: 2 - - uses: r-lib/actions/setup-r-dependencies@732fb28088814627972f1ccbacc02561178cf391 + - uses: r-lib/actions/setup-r-dependencies@v2 env: GITHUB_PAT: "${{ github.token }}" with: From 2e33e98f583035cd686455870e9cbf5fb6dc9966 Mon Sep 17 00:00:00 2001 From: Nick Crews Date: Thu, 22 Aug 2024 08:26:37 -0800 Subject: [PATCH 015/186] MINOR: [GO] fixup test case name in cast_test.go (#43780) --- go/arrow/compute/cast_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/go/arrow/compute/cast_test.go b/go/arrow/compute/cast_test.go index 2e748a2fee9c2..fa08467dd3946 100644 --- a/go/arrow/compute/cast_test.go +++ b/go/arrow/compute/cast_test.go @@ -2636,7 +2636,7 @@ func (c *CastSuite) TestStructToDifferentNullabilityStruct() { defer dest3Nullable.Release() checkCast(c.T(), srcNonNull, dest3Nullable, *compute.DefaultCastOptions(true)) }) - c.Run("non-nullable to nullable", func() { + c.Run("nullable to non-nullable", func() { fieldsSrcNullable := []arrow.Field{ {Name: "a", Type: arrow.PrimitiveTypes.Int8, Nullable: true}, {Name: "b", Type: arrow.PrimitiveTypes.Int8, Nullable: true}, From 76e0f6254b75509d83e44fe8997bd14007907c4f Mon Sep 17 00:00:00 2001 From: Matt Topol Date: Thu, 22 Aug 2024 15:37:09 -0400 Subject: [PATCH 016/186] GH-43764: [Go][FlightSQL] Add NewPreparedStatement function (#43781) ### Rationale for this change Allowing creation of the prepared statement object outside of the client allows for logging, proxying, and handing off prepared statements if necessary. ### Are these changes tested? Yes * GitHub Issue: #43764 Authored-by: Matt Topol Signed-off-by: Matt Topol --- go/arrow/flight/flightsql/client.go | 9 +++++++++ go/arrow/flight/flightsql/client_test.go | 21 +++++++++++++++++---- 2 files changed, 26 insertions(+), 4 deletions(-) diff --git a/go/arrow/flight/flightsql/client.go b/go/arrow/flight/flightsql/client.go index 4a600e5253e9b..4c9dc50135108 100644 --- a/go/arrow/flight/flightsql/client.go +++ b/go/arrow/flight/flightsql/client.go @@ -1102,6 +1102,15 @@ type PreparedStatement struct { closed bool } +// NewPreparedStatement creates a prepared statement object bound to the provided +// client using the given handle. In general, it should be sufficient to use the +// Prepare function on a client and this wouldn't be needed.
But this can be used +// to propagate a prepared statement from one client to another if needed or if +// proxying requests. +func NewPreparedStatement(client *Client, handle []byte) *PreparedStatement { + return &PreparedStatement{client: client, handle: handle} +} + // Execute executes the prepared statement on the server and returns a FlightInfo // indicating where to retrieve the response. If SetParameters has been called // then the parameter bindings will be sent before execution. diff --git a/go/arrow/flight/flightsql/client_test.go b/go/arrow/flight/flightsql/client_test.go index 7604b554cbc6c..d060161f94f0f 100644 --- a/go/arrow/flight/flightsql/client_test.go +++ b/go/arrow/flight/flightsql/client_test.go @@ -378,8 +378,10 @@ func (s *FlightSqlClientSuite) TestPreparedStatementExecute() { createRsp := &mockDoActionClient{} defer createRsp.AssertExpectations(s.T()) createRsp.On("Recv").Return(&pb.Result{Body: data}, nil).Once() - createRsp.On("Recv").Return(&pb.Result{}, io.EOF) - createRsp.On("CloseSend").Return(nil) + createRsp.On("Recv").Return(&pb.Result{}, io.EOF).Once() + createRsp.On("Recv").Return(&pb.Result{Body: data}, nil).Once() + createRsp.On("Recv").Return(&pb.Result{}, io.EOF).Once() + createRsp.On("CloseSend").Return(nil).Twice() closeRsp := &mockDoActionClient{} defer closeRsp.AssertExpectations(s.T()) @@ -387,13 +389,13 @@ func (s *FlightSqlClientSuite) TestPreparedStatementExecute() { closeRsp.On("CloseSend").Return(nil) s.mockClient.On("DoAction", flightsql.CreatePreparedStatementActionType, action.Body, s.callOpts). - Return(createRsp, nil) + Return(createRsp, nil).Twice() s.mockClient.On("DoAction", flightsql.ClosePreparedStatementActionType, closeAct.Body, s.callOpts). Return(closeRsp, nil) infoCmd := &pb.CommandPreparedStatementQuery{PreparedStatementHandle: []byte(query)} desc := getDesc(infoCmd) - s.mockClient.On("GetFlightInfo", desc.Type, desc.Cmd, s.callOpts).Return(&emptyFlightInfo, nil) + s.mockClient.On("GetFlightInfo", desc.Type, desc.Cmd, s.callOpts).Return(&emptyFlightInfo, nil).Twice() prepared, err := s.sqlClient.Prepare(context.TODO(), query, s.callOpts...) s.NoError(err) @@ -404,6 +406,17 @@ func (s *FlightSqlClientSuite) TestPreparedStatementExecute() { info, err := prepared.Execute(context.TODO(), s.callOpts...) s.NoError(err) s.Equal(&emptyFlightInfo, info) + + prepared, err = s.sqlClient.Prepare(context.TODO(), query, s.callOpts...) + s.NoError(err) + + secondPrepare := flightsql.NewPreparedStatement(&s.sqlClient, prepared.Handle()) + s.Equal(string(secondPrepare.Handle()), "query") + defer secondPrepare.Close(context.TODO(), s.callOpts...) + + info, err = secondPrepare.Execute(context.TODO(), s.callOpts...) + s.NoError(err) + s.Equal(&emptyFlightInfo, info) } func (s *FlightSqlClientSuite) TestPreparedStatementExecuteParamBinding() { From d47b305bbce037af18ce65dc968074fe1681b4d4 Mon Sep 17 00:00:00 2001 From: Joel Lubinitsky <33523178+joellubi@users.noreply.github.com> Date: Thu, 22 Aug 2024 16:04:59 -0400 Subject: [PATCH 017/186] GH-43624: [Go] Add JSON/UUID extension types, extend arrow -> parquet logical type mapping (#43679) ### Rationale for this change - Missing `JSON` extension type implementation. - Current precedent in C++ (and thereby PyArrow) is that canonical extension types do not require manual registration. - Issues like #43640 and #43624 suggest that we need to expose ways of configuring parquet types written from arrow records, but casting the underlying data presents challenges for a generalized approach. 
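To make the last point concrete, the direction taken here is an opt-in interface rather than a cast: an extension type declares which Parquet logical type its storage column should be annotated with, and the schema conversion picks that up. Below is a minimal sketch of what a user-defined type could look like. It is not part of this diff; the package name `myext`, the type names, and the extension name `myorg.jsonish` are invented for illustration, and the method set follows the `arrow.ExtensionType` implementations added later in this patch.

```go
package myext

import (
	"fmt"
	"reflect"

	"github.com/apache/arrow/go/v18/arrow"
	"github.com/apache/arrow/go/v18/arrow/array"
	"github.com/apache/arrow/go/v18/parquet/schema"
)

// JSONishType is a hypothetical user-defined extension type whose storage
// is a plain UTF-8 string column.
type JSONishType struct {
	arrow.ExtensionBase
}

// JSONishArray needs nothing beyond the embedded base.
type JSONishArray struct {
	array.ExtensionArrayBase
}

func NewJSONishType() *JSONishType {
	return &JSONishType{ExtensionBase: arrow.ExtensionBase{Storage: arrow.BinaryTypes.String}}
}

func (t *JSONishType) ExtensionName() string { return "myorg.jsonish" }
func (t *JSONishType) Serialize() string     { return "" }
func (t *JSONishType) Deserialize(storage arrow.DataType, _ string) (arrow.ExtensionType, error) {
	if !arrow.TypeEqual(storage, arrow.BinaryTypes.String) {
		return nil, fmt.Errorf("invalid storage type for myorg.jsonish: %s", storage)
	}
	return NewJSONishType(), nil
}
func (t *JSONishType) ExtensionEquals(other arrow.ExtensionType) bool {
	return t.ExtensionName() == other.ExtensionName()
}
func (t *JSONishType) ArrayType() reflect.Type { return reflect.TypeOf(JSONishArray{}) }

// ParquetLogicalType is the opt-in hook: when pqarrow maps this field to a
// Parquet primitive node, the storage column is annotated with the returned
// logical type instead of the storage type's default annotation.
func (t *JSONishType) ParquetLogicalType() schema.LogicalType {
	return schema.JSONLogicalType{}
}
```

With the hook in place no cast of the storage data is needed; writing a column of this type through pqarrow should produce a `BYTE_ARRAY` column annotated as JSON.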
### What changes are included in this PR? - Move `UUIDType` from `internal` to `arrow/extensions` - Implement `JSON` canonical extension type - Automatically register all canonical extension types at initialization - remove register/unregister from various locations these extension types are used - Add new `CustomParquetType` interface so extension types can specify their target `LogicalType` in Parquet - Refactor parquet `fieldToNode` to split up `PrimitiveNode` type mapping for leaves from `GroupNode` composition - Simplify parquet `LogicalType` to use only value receivers ### Are these changes tested? Yes ### Are there any user-facing changes? - `UUID` and `JSON` extension types are available to end users. - Canonical extension types will automatically be recognized in IPC without registration. - Users with their own extension type implementations may use the `CustomParquetType` interface to control Parquet conversion without needing to fork or upstream the change. * GitHub Issue: #43624 Authored-by: Joel Lubinitsky Signed-off-by: Joel Lubinitsky --- docs/source/status.rst | 6 + go/arrow/array/array_test.go | 4 +- go/arrow/array/diff_test.go | 4 +- go/arrow/array/extension_test.go | 10 - go/arrow/avro/reader_types.go | 4 +- go/arrow/avro/schema.go | 4 +- go/arrow/compute/exec/span_test.go | 6 +- go/arrow/csv/reader_test.go | 4 +- go/arrow/csv/writer_test.go | 6 +- go/arrow/datatype_extension_test.go | 18 +- go/arrow/extensions/bool8_test.go | 3 - go/arrow/extensions/extensions.go | 36 +++ go/arrow/extensions/json.go | 148 ++++++++++ go/arrow/extensions/json_test.go | 268 ++++++++++++++++++ go/arrow/extensions/opaque_test.go | 3 - go/arrow/extensions/uuid.go | 265 +++++++++++++++++ go/arrow/extensions/uuid_test.go | 257 +++++++++++++++++ .../internal/flight_integration/scenario.go | 4 - .../cmd/arrow-json-integration-test/main.go | 4 - go/arrow/ipc/metadata_test.go | 11 +- go/internal/types/extension_types.go | 227 +-------------- go/internal/types/extension_types_test.go | 95 ------- go/parquet/cmd/parquet_reader/main.go | 2 +- go/parquet/metadata/app_version.go | 2 +- go/parquet/pqarrow/encode_arrow_test.go | 82 ++++-- go/parquet/pqarrow/path_builder_test.go | 6 +- go/parquet/pqarrow/schema.go | 228 +++++++-------- go/parquet/pqarrow/schema_test.go | 15 +- go/parquet/schema/converted_types.go | 8 +- go/parquet/schema/logical_types.go | 30 +- go/parquet/schema/logical_types_test.go | 40 +-- go/parquet/schema/schema_element_test.go | 4 +- 32 files changed, 1221 insertions(+), 583 deletions(-) create mode 100644 go/arrow/extensions/extensions.go create mode 100644 go/arrow/extensions/json.go create mode 100644 go/arrow/extensions/json_test.go create mode 100644 go/arrow/extensions/uuid.go create mode 100644 go/arrow/extensions/uuid_test.go delete mode 100644 go/internal/types/extension_types_test.go diff --git a/docs/source/status.rst b/docs/source/status.rst index c232aa280befb..5e2c2cc19c890 100644 --- a/docs/source/status.rst +++ b/docs/source/status.rst @@ -119,6 +119,12 @@ Data Types +-----------------------+-------+-------+-------+------------+-------+-------+-------+-------+ | Variable shape tensor | | | | | | | | | +-----------------------+-------+-------+-------+------------+-------+-------+-------+-------+ +| JSON | | | ✓ | | | | | | ++-----------------------+-------+-------+-------+------------+-------+-------+-------+-------+ +| UUID | | | ✓ | | | | | | ++-----------------------+-------+-------+-------+------------+-------+-------+-------+-------+ +| 8-bit Boolean | ✓ | | ✓ | 
| | | | | ++-----------------------+-------+-------+-------+------------+-------+-------+-------+-------+ Notes: diff --git a/go/arrow/array/array_test.go b/go/arrow/array/array_test.go index 4d83766b4fa3e..4f0627c600078 100644 --- a/go/arrow/array/array_test.go +++ b/go/arrow/array/array_test.go @@ -21,9 +21,9 @@ import ( "github.com/apache/arrow/go/v18/arrow" "github.com/apache/arrow/go/v18/arrow/array" + "github.com/apache/arrow/go/v18/arrow/extensions" "github.com/apache/arrow/go/v18/arrow/internal/testing/tools" "github.com/apache/arrow/go/v18/arrow/memory" - "github.com/apache/arrow/go/v18/internal/types" "github.com/stretchr/testify/assert" ) @@ -122,7 +122,7 @@ func TestMakeFromData(t *testing.T) { {name: "dictionary", d: &arrow.DictionaryType{IndexType: arrow.PrimitiveTypes.Uint64, ValueType: &testDataType{arrow.TIMESTAMP}}, dict: array.NewData(&testDataType{arrow.TIMESTAMP}, 0 /* length */, make([]*memory.Buffer, 2 /*null bitmap, values*/), nil /* childData */, 0 /* nulls */, 0 /* offset */)}, {name: "extension", d: &testDataType{arrow.EXTENSION}, expPanic: true, expError: "arrow/array: DataType for ExtensionArray must implement arrow.ExtensionType"}, - {name: "extension", d: types.NewUUIDType()}, + {name: "extension", d: extensions.NewUUIDType()}, {name: "run end encoded", d: arrow.RunEndEncodedOf(arrow.PrimitiveTypes.Int64, arrow.PrimitiveTypes.Int64), child: []arrow.ArrayData{ array.NewData(&testDataType{arrow.INT64}, 0 /* length */, make([]*memory.Buffer, 2 /*null bitmap, values*/), nil /* childData */, 0 /* nulls */, 0 /* offset */), diff --git a/go/arrow/array/diff_test.go b/go/arrow/array/diff_test.go index 65d212be11838..9c9ce6a53aed0 100644 --- a/go/arrow/array/diff_test.go +++ b/go/arrow/array/diff_test.go @@ -25,9 +25,9 @@ import ( "github.com/apache/arrow/go/v18/arrow" "github.com/apache/arrow/go/v18/arrow/array" + "github.com/apache/arrow/go/v18/arrow/extensions" "github.com/apache/arrow/go/v18/arrow/memory" "github.com/apache/arrow/go/v18/internal/json" - "github.com/apache/arrow/go/v18/internal/types" ) type diffTestCase struct { @@ -861,7 +861,7 @@ func TestEdits_UnifiedDiff(t *testing.T) { }, { name: "extensions", - dataType: types.NewUUIDType(), + dataType: extensions.NewUUIDType(), baseJSON: `["00000000-0000-0000-0000-000000000000", "00000000-0000-0000-0000-000000000001"]`, targetJSON: `["00000000-0000-0000-0000-000000000001", "00000000-0000-0000-0000-000000000002"]`, want: `@@ -0, +0 @@ diff --git a/go/arrow/array/extension_test.go b/go/arrow/array/extension_test.go index 71ea9f105af7c..26245cf015dec 100644 --- a/go/arrow/array/extension_test.go +++ b/go/arrow/array/extension_test.go @@ -30,16 +30,6 @@ type ExtensionTypeTestSuite struct { suite.Suite } -func (e *ExtensionTypeTestSuite) SetupTest() { - e.NoError(arrow.RegisterExtensionType(types.NewUUIDType())) -} - -func (e *ExtensionTypeTestSuite) TearDownTest() { - if arrow.GetExtensionType("uuid") != nil { - e.NoError(arrow.UnregisterExtensionType("uuid")) - } -} - func (e *ExtensionTypeTestSuite) TestParametricEquals() { p1Type := types.NewParametric1Type(6) p2Type := types.NewParametric1Type(6) diff --git a/go/arrow/avro/reader_types.go b/go/arrow/avro/reader_types.go index e07cd380d511f..dab2b33dce601 100644 --- a/go/arrow/avro/reader_types.go +++ b/go/arrow/avro/reader_types.go @@ -27,8 +27,8 @@ import ( "github.com/apache/arrow/go/v18/arrow/array" "github.com/apache/arrow/go/v18/arrow/decimal128" "github.com/apache/arrow/go/v18/arrow/decimal256" + "github.com/apache/arrow/go/v18/arrow/extensions" 
"github.com/apache/arrow/go/v18/arrow/memory" - "github.com/apache/arrow/go/v18/internal/types" ) type dataLoader struct { @@ -436,7 +436,7 @@ func mapFieldBuilders(b array.Builder, field arrow.Field, parent *fieldPos) { } return nil } - case *types.UUIDBuilder: + case *extensions.UUIDBuilder: f.appendFunc = func(data interface{}) error { switch dt := data.(type) { case nil: diff --git a/go/arrow/avro/schema.go b/go/arrow/avro/schema.go index 007dad06c19cd..a6de3718d3ccf 100644 --- a/go/arrow/avro/schema.go +++ b/go/arrow/avro/schema.go @@ -24,7 +24,7 @@ import ( "github.com/apache/arrow/go/v18/arrow" "github.com/apache/arrow/go/v18/arrow/decimal128" - "github.com/apache/arrow/go/v18/internal/types" + "github.com/apache/arrow/go/v18/arrow/extensions" "github.com/apache/arrow/go/v18/internal/utils" avro "github.com/hamba/avro/v2" ) @@ -349,7 +349,7 @@ func avroLogicalToArrowField(n *schemaNode) { // The uuid logical type represents a random generated universally unique identifier (UUID). // A uuid logical type annotates an Avro string. The string has to conform with RFC-4122 case "uuid": - dt = types.NewUUIDType() + dt = extensions.NewUUIDType() // The date logical type represents a date within the calendar, with no reference to a particular // time zone or time of day. diff --git a/go/arrow/compute/exec/span_test.go b/go/arrow/compute/exec/span_test.go index f5beb45ee1494..018fbb7d623d9 100644 --- a/go/arrow/compute/exec/span_test.go +++ b/go/arrow/compute/exec/span_test.go @@ -29,6 +29,7 @@ import ( "github.com/apache/arrow/go/v18/arrow/compute/exec" "github.com/apache/arrow/go/v18/arrow/decimal128" "github.com/apache/arrow/go/v18/arrow/endian" + "github.com/apache/arrow/go/v18/arrow/extensions" "github.com/apache/arrow/go/v18/arrow/memory" "github.com/apache/arrow/go/v18/arrow/scalar" "github.com/apache/arrow/go/v18/internal/types" @@ -192,9 +193,6 @@ func TestArraySpan_NumBuffers(t *testing.T) { Children []exec.ArraySpan } - arrow.RegisterExtensionType(types.NewUUIDType()) - defer arrow.UnregisterExtensionType("uuid") - tests := []struct { name string fields fields @@ -207,7 +205,7 @@ func TestArraySpan_NumBuffers(t *testing.T) { {"large binary", fields{Type: arrow.BinaryTypes.LargeBinary}, 3}, {"string", fields{Type: arrow.BinaryTypes.String}, 3}, {"large string", fields{Type: arrow.BinaryTypes.LargeString}, 3}, - {"extension", fields{Type: types.NewUUIDType()}, 2}, + {"extension", fields{Type: extensions.NewUUIDType()}, 2}, {"int32", fields{Type: arrow.PrimitiveTypes.Int32}, 2}, } for _, tt := range tests { diff --git a/go/arrow/csv/reader_test.go b/go/arrow/csv/reader_test.go index b0775b9b11a96..6a89d49704298 100644 --- a/go/arrow/csv/reader_test.go +++ b/go/arrow/csv/reader_test.go @@ -30,8 +30,8 @@ import ( "github.com/apache/arrow/go/v18/arrow/csv" "github.com/apache/arrow/go/v18/arrow/decimal128" "github.com/apache/arrow/go/v18/arrow/decimal256" + "github.com/apache/arrow/go/v18/arrow/extensions" "github.com/apache/arrow/go/v18/arrow/memory" - "github.com/apache/arrow/go/v18/internal/types" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" ) @@ -356,7 +356,7 @@ func testCSVReader(t *testing.T, filepath string, withHeader bool, stringsCanBeN {Name: "binary", Type: arrow.BinaryTypes.Binary}, {Name: "large_binary", Type: arrow.BinaryTypes.LargeBinary}, {Name: "fixed_size_binary", Type: &arrow.FixedSizeBinaryType{ByteWidth: 3}}, - {Name: "uuid", Type: types.NewUUIDType()}, + {Name: "uuid", Type: extensions.NewUUIDType()}, {Name: "date32", Type: 
arrow.PrimitiveTypes.Date32}, {Name: "date64", Type: arrow.PrimitiveTypes.Date64}, }, diff --git a/go/arrow/csv/writer_test.go b/go/arrow/csv/writer_test.go index be9ab961c3ef7..2ae01a6d49071 100644 --- a/go/arrow/csv/writer_test.go +++ b/go/arrow/csv/writer_test.go @@ -31,9 +31,9 @@ import ( "github.com/apache/arrow/go/v18/arrow/csv" "github.com/apache/arrow/go/v18/arrow/decimal128" "github.com/apache/arrow/go/v18/arrow/decimal256" + "github.com/apache/arrow/go/v18/arrow/extensions" "github.com/apache/arrow/go/v18/arrow/float16" "github.com/apache/arrow/go/v18/arrow/memory" - "github.com/apache/arrow/go/v18/internal/types" "github.com/google/uuid" ) @@ -230,7 +230,7 @@ func testCSVWriter(t *testing.T, data [][]string, writeHeader bool, fmtr func(bo {Name: "binary", Type: arrow.BinaryTypes.Binary}, {Name: "large_binary", Type: arrow.BinaryTypes.LargeBinary}, {Name: "fixed_size_binary", Type: &arrow.FixedSizeBinaryType{ByteWidth: 3}}, - {Name: "uuid", Type: types.NewUUIDType()}, + {Name: "uuid", Type: extensions.NewUUIDType()}, {Name: "null", Type: arrow.Null}, }, nil, @@ -285,7 +285,7 @@ func testCSVWriter(t *testing.T, data [][]string, writeHeader bool, fmtr func(bo b.Field(22).(*array.BinaryBuilder).AppendValues([][]byte{{0, 1, 2}, {3, 4, 5}, {}}, nil) b.Field(23).(*array.BinaryBuilder).AppendValues([][]byte{{0, 1, 2}, {3, 4, 5}, {}}, nil) b.Field(24).(*array.FixedSizeBinaryBuilder).AppendValues([][]byte{{0, 1, 2}, {3, 4, 5}, {}}, nil) - b.Field(25).(*types.UUIDBuilder).AppendValues([]uuid.UUID{uuid.MustParse("00000000-0000-0000-0000-000000000001"), uuid.MustParse("00000000-0000-0000-0000-000000000002"), uuid.MustParse("00000000-0000-0000-0000-000000000003")}, nil) + b.Field(25).(*extensions.UUIDBuilder).AppendValues([]uuid.UUID{uuid.MustParse("00000000-0000-0000-0000-000000000001"), uuid.MustParse("00000000-0000-0000-0000-000000000002"), uuid.MustParse("00000000-0000-0000-0000-000000000003")}, nil) b.Field(26).(*array.NullBuilder).AppendEmptyValues(3) for _, field := range b.Fields() { diff --git a/go/arrow/datatype_extension_test.go b/go/arrow/datatype_extension_test.go index c3e595f523e57..7244d377bd285 100644 --- a/go/arrow/datatype_extension_test.go +++ b/go/arrow/datatype_extension_test.go @@ -21,7 +21,7 @@ import ( "testing" "github.com/apache/arrow/go/v18/arrow" - "github.com/apache/arrow/go/v18/internal/types" + "github.com/apache/arrow/go/v18/arrow/extensions" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/suite" ) @@ -50,24 +50,14 @@ type ExtensionTypeTestSuite struct { suite.Suite } -func (e *ExtensionTypeTestSuite) SetupTest() { - e.NoError(arrow.RegisterExtensionType(types.NewUUIDType())) -} - -func (e *ExtensionTypeTestSuite) TearDownTest() { - if arrow.GetExtensionType("uuid") != nil { - e.NoError(arrow.UnregisterExtensionType("uuid")) - } -} - func (e *ExtensionTypeTestSuite) TestExtensionType() { e.Nil(arrow.GetExtensionType("uuid-unknown")) - e.NotNil(arrow.GetExtensionType("uuid")) + e.NotNil(arrow.GetExtensionType("arrow.uuid")) - e.Error(arrow.RegisterExtensionType(types.NewUUIDType())) + e.Error(arrow.RegisterExtensionType(extensions.NewUUIDType())) e.Error(arrow.UnregisterExtensionType("uuid-unknown")) - typ := types.NewUUIDType() + typ := extensions.NewUUIDType() e.Implements((*arrow.ExtensionType)(nil), typ) e.Equal(arrow.EXTENSION, typ.ID()) e.Equal("extension", typ.Name()) diff --git a/go/arrow/extensions/bool8_test.go b/go/arrow/extensions/bool8_test.go index 9f7365d1555fb..ff129e24bc8f0 100644 --- a/go/arrow/extensions/bool8_test.go +++ 
b/go/arrow/extensions/bool8_test.go @@ -178,9 +178,6 @@ func TestReinterpretStorageEqualToValues(t *testing.T) { func TestBool8TypeBatchIPCRoundTrip(t *testing.T) { typ := extensions.NewBool8Type() - arrow.RegisterExtensionType(typ) - defer arrow.UnregisterExtensionType(typ.ExtensionName()) - storage, _, err := array.FromJSON(memory.DefaultAllocator, arrow.PrimitiveTypes.Int8, strings.NewReader(`[-1, 0, 1, 2, null]`)) require.NoError(t, err) diff --git a/go/arrow/extensions/extensions.go b/go/arrow/extensions/extensions.go new file mode 100644 index 0000000000000..03c6923e95f4f --- /dev/null +++ b/go/arrow/extensions/extensions.go @@ -0,0 +1,36 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package extensions + +import ( + "github.com/apache/arrow/go/v18/arrow" +) + +var canonicalExtensionTypes = []arrow.ExtensionType{ + &Bool8Type{}, + &UUIDType{}, + &OpaqueType{}, + &JSONType{}, +} + +func init() { + for _, extType := range canonicalExtensionTypes { + if err := arrow.RegisterExtensionType(extType); err != nil { + panic(err) + } + } +} diff --git a/go/arrow/extensions/json.go b/go/arrow/extensions/json.go new file mode 100644 index 0000000000000..12c49f9c0a76d --- /dev/null +++ b/go/arrow/extensions/json.go @@ -0,0 +1,148 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package extensions + +import ( + "fmt" + "reflect" + "slices" + + "github.com/apache/arrow/go/v18/arrow" + "github.com/apache/arrow/go/v18/arrow/array" + "github.com/apache/arrow/go/v18/internal/json" + "github.com/apache/arrow/go/v18/parquet/schema" +) + +var jsonSupportedStorageTypes = []arrow.DataType{ + arrow.BinaryTypes.String, + arrow.BinaryTypes.LargeString, + arrow.BinaryTypes.StringView, +} + +// JSONType represents a UTF-8 encoded JSON string as specified in RFC8259. +type JSONType struct { + arrow.ExtensionBase +} + +// ParquetLogicalType implements pqarrow.ExtensionCustomParquetType. 
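+// Implementing this interface lets pqarrow write arrow.json columns with
+// Parquet's JSON logical type rather than the plain UTF8 mapping derived
+// from the storage type (exercised by TestArrowExtensionTypeLogicalType below).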
+func (b *JSONType) ParquetLogicalType() schema.LogicalType {
+	return schema.JSONLogicalType{}
+}
+
+// NewJSONType creates a new JSONType with the specified storage type.
+// storageType must be one of String, LargeString, StringView.
+func NewJSONType(storageType arrow.DataType) (*JSONType, error) {
+	if !slices.Contains(jsonSupportedStorageTypes, storageType) {
+		return nil, fmt.Errorf("unsupported storage type for JSON extension type: %s", storageType)
+	}
+	return &JSONType{ExtensionBase: arrow.ExtensionBase{Storage: storageType}}, nil
+}
+
+func (b *JSONType) ArrayType() reflect.Type { return reflect.TypeOf(JSONArray{}) }
+
+func (b *JSONType) Deserialize(storageType arrow.DataType, data string) (arrow.ExtensionType, error) {
+	if !(data == "" || data == "{}") {
+		return nil, fmt.Errorf("serialized metadata for JSON extension type must be '' or '{}', found: %s", data)
+	}
+	return NewJSONType(storageType)
+}
+
+func (b *JSONType) ExtensionEquals(other arrow.ExtensionType) bool {
+	return b.ExtensionName() == other.ExtensionName() && arrow.TypeEqual(b.Storage, other.StorageType())
+}
+
+func (b *JSONType) ExtensionName() string { return "arrow.json" }
+
+func (b *JSONType) Serialize() string { return "" }
+
+func (b *JSONType) String() string {
+	return fmt.Sprintf("extension<%s[storage_type=%s]>", b.ExtensionName(), b.Storage)
+}
+
+// JSONArray is logically an array of UTF-8 encoded JSON strings.
+// Its values are unmarshaled to native Go values.
+type JSONArray struct {
+	array.ExtensionArrayBase
+}
+
+func (a *JSONArray) String() string {
+	b, err := a.MarshalJSON()
+	if err != nil {
+		panic(fmt.Sprintf("failed to marshal JSONArray: %s", err))
+	}
+
+	return string(b)
+}
+
+func (a *JSONArray) Value(i int) any {
+	val := a.ValueBytes(i)
+
+	var res any
+	if err := json.Unmarshal(val, &res); err != nil {
+		panic(err)
+	}
+
+	return res
+}
+
+func (a *JSONArray) ValueStr(i int) string {
+	return string(a.ValueBytes(i))
+}
+
+func (a *JSONArray) ValueBytes(i int) []byte {
+	// wrap as json.RawMessage; val is nil if the element is null.
+	val := a.ValueJSON(i)
+
+	// returns the wrapped bytes unchanged, or "null" if val is nil.
+	b, err := val.MarshalJSON()
+	if err != nil {
+		panic(err)
+	}
+
+	return b
+}
+
+// ValueJSON wraps the underlying string value as a json.RawMessage,
+// or returns nil if the array value is null.
+func (a *JSONArray) ValueJSON(i int) json.RawMessage {
+	var val json.RawMessage
+	if a.IsValid(i) {
+		val = json.RawMessage(a.Storage().(array.StringLike).Value(i))
+	}
+	return val
+}
+
+// MarshalJSON implements json.Marshaler.
+// Marshaling json.RawMessage is a no-op, except that nil values will
+// be marshaled as a JSON null.
+func (a *JSONArray) MarshalJSON() ([]byte, error) {
+	values := make([]json.RawMessage, a.Len())
+	for i := 0; i < a.Len(); i++ {
+		values[i] = a.ValueJSON(i)
+	}
+	return json.Marshal(values)
+}
+
+// GetOneForMarshal implements arrow.Array.
+func (a *JSONArray) GetOneForMarshal(i int) interface{} {
+	return a.ValueJSON(i)
+}
+
+var (
+	_ arrow.ExtensionType  = (*JSONType)(nil)
+	_ array.ExtensionArray = (*JSONArray)(nil)
+)
diff --git a/go/arrow/extensions/json_test.go b/go/arrow/extensions/json_test.go
new file mode 100644
index 0000000000000..21acc58f93949
--- /dev/null
+++ b/go/arrow/extensions/json_test.go
@@ -0,0 +1,268 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package extensions_test + +import ( + "bytes" + "strings" + "testing" + + "github.com/apache/arrow/go/v18/arrow" + "github.com/apache/arrow/go/v18/arrow/array" + "github.com/apache/arrow/go/v18/arrow/extensions" + "github.com/apache/arrow/go/v18/arrow/ipc" + "github.com/apache/arrow/go/v18/arrow/memory" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestJSONTypeBasics(t *testing.T) { + typ, err := extensions.NewJSONType(arrow.BinaryTypes.String) + require.NoError(t, err) + + typLarge, err := extensions.NewJSONType(arrow.BinaryTypes.LargeString) + require.NoError(t, err) + + typView, err := extensions.NewJSONType(arrow.BinaryTypes.StringView) + require.NoError(t, err) + + assert.Equal(t, "arrow.json", typ.ExtensionName()) + assert.Equal(t, "arrow.json", typLarge.ExtensionName()) + assert.Equal(t, "arrow.json", typView.ExtensionName()) + + assert.True(t, typ.ExtensionEquals(typ)) + assert.True(t, typLarge.ExtensionEquals(typLarge)) + assert.True(t, typView.ExtensionEquals(typView)) + + assert.False(t, arrow.TypeEqual(arrow.BinaryTypes.String, typ)) + assert.False(t, arrow.TypeEqual(typ, typLarge)) + assert.False(t, arrow.TypeEqual(typ, typView)) + assert.False(t, arrow.TypeEqual(typLarge, typView)) + + assert.True(t, arrow.TypeEqual(arrow.BinaryTypes.String, typ.StorageType())) + assert.True(t, arrow.TypeEqual(arrow.BinaryTypes.LargeString, typLarge.StorageType())) + assert.True(t, arrow.TypeEqual(arrow.BinaryTypes.StringView, typView.StorageType())) + + assert.Equal(t, "extension", typ.String()) + assert.Equal(t, "extension", typLarge.String()) + assert.Equal(t, "extension", typView.String()) +} + +var jsonTestCases = []struct { + Name string + StorageType arrow.DataType + StorageBuilder func(mem memory.Allocator) array.Builder +}{ + { + Name: "string", + StorageType: arrow.BinaryTypes.String, + StorageBuilder: func(mem memory.Allocator) array.Builder { return array.NewStringBuilder(mem) }, + }, + { + Name: "large_string", + StorageType: arrow.BinaryTypes.LargeString, + StorageBuilder: func(mem memory.Allocator) array.Builder { return array.NewLargeStringBuilder(mem) }, + }, + { + Name: "string_view", + StorageType: arrow.BinaryTypes.StringView, + StorageBuilder: func(mem memory.Allocator) array.Builder { return array.NewStringViewBuilder(mem) }, + }, +} + +func TestJSONTypeCreateFromArray(t *testing.T) { + for _, tc := range jsonTestCases { + t.Run(tc.Name, func(t *testing.T) { + typ, err := extensions.NewJSONType(tc.StorageType) + require.NoError(t, err) + + bldr := tc.StorageBuilder(memory.DefaultAllocator) + defer bldr.Release() + + bldr.AppendValueFromString(`"foobar"`) + bldr.AppendNull() + bldr.AppendValueFromString(`{"foo": "bar"}`) + bldr.AppendValueFromString(`42`) + bldr.AppendValueFromString(`true`) + bldr.AppendValueFromString(`[1, true, "3", null, {"five": 5}]`) + + storage := bldr.NewArray() + defer storage.Release() + + arr := 
array.NewExtensionArrayWithStorage(typ, storage) + defer arr.Release() + + assert.Equal(t, 6, arr.Len()) + assert.Equal(t, 1, arr.NullN()) + + jsonArr, ok := arr.(*extensions.JSONArray) + require.True(t, ok) + + require.Equal(t, "foobar", jsonArr.Value(0)) + require.Equal(t, nil, jsonArr.Value(1)) + require.Equal(t, map[string]any{"foo": "bar"}, jsonArr.Value(2)) + require.Equal(t, float64(42), jsonArr.Value(3)) + require.Equal(t, true, jsonArr.Value(4)) + require.Equal(t, []any{float64(1), true, "3", nil, map[string]any{"five": float64(5)}}, jsonArr.Value(5)) + }) + } +} + +func TestJSONTypeBatchIPCRoundTrip(t *testing.T) { + for _, tc := range jsonTestCases { + t.Run(tc.Name, func(t *testing.T) { + typ, err := extensions.NewJSONType(tc.StorageType) + require.NoError(t, err) + + bldr := tc.StorageBuilder(memory.DefaultAllocator) + defer bldr.Release() + + bldr.AppendValueFromString(`"foobar"`) + bldr.AppendNull() + bldr.AppendValueFromString(`{"foo": "bar"}`) + bldr.AppendValueFromString(`42`) + bldr.AppendValueFromString(`true`) + bldr.AppendValueFromString(`[1, true, "3", null, {"five": 5}]`) + + storage := bldr.NewArray() + defer storage.Release() + + arr := array.NewExtensionArrayWithStorage(typ, storage) + defer arr.Release() + + batch := array.NewRecord(arrow.NewSchema([]arrow.Field{{Name: "field", Type: typ, Nullable: true}}, nil), + []arrow.Array{arr}, -1) + defer batch.Release() + + var written arrow.Record + { + var buf bytes.Buffer + wr := ipc.NewWriter(&buf, ipc.WithSchema(batch.Schema())) + require.NoError(t, wr.Write(batch)) + require.NoError(t, wr.Close()) + + rdr, err := ipc.NewReader(&buf) + require.NoError(t, err) + written, err = rdr.Read() + require.NoError(t, err) + written.Retain() + defer written.Release() + rdr.Release() + } + + assert.Truef(t, batch.Schema().Equal(written.Schema()), "expected: %s, got: %s", + batch.Schema(), written.Schema()) + + assert.Truef(t, array.RecordEqual(batch, written), "expected: %s, got: %s", + batch, written) + }) + } +} + +func TestMarshallJSONArray(t *testing.T) { + for _, tc := range jsonTestCases { + t.Run(tc.Name, func(t *testing.T) { + typ, err := extensions.NewJSONType(tc.StorageType) + require.NoError(t, err) + + bldr := tc.StorageBuilder(memory.DefaultAllocator) + defer bldr.Release() + + bldr.AppendValueFromString(`"foobar"`) + bldr.AppendNull() + bldr.AppendValueFromString(`{"foo": "bar"}`) + bldr.AppendValueFromString(`42`) + bldr.AppendValueFromString(`true`) + bldr.AppendValueFromString(`[1, true, "3", null, {"five": 5}]`) + + storage := bldr.NewArray() + defer storage.Release() + + arr := array.NewExtensionArrayWithStorage(typ, storage) + defer arr.Release() + + assert.Equal(t, 6, arr.Len()) + assert.Equal(t, 1, arr.NullN()) + + jsonArr, ok := arr.(*extensions.JSONArray) + require.True(t, ok) + + b, err := jsonArr.MarshalJSON() + require.NoError(t, err) + + expectedJSON := `["foobar",null,{"foo":"bar"},42,true,[1,true,"3",null,{"five":5}]]` + require.Equal(t, expectedJSON, string(b)) + require.Equal(t, expectedJSON, jsonArr.String()) + }) + } +} + +func TestJSONRecordToJSON(t *testing.T) { + for _, tc := range jsonTestCases { + t.Run(tc.Name, func(t *testing.T) { + typ, err := extensions.NewJSONType(tc.StorageType) + require.NoError(t, err) + + bldr := tc.StorageBuilder(memory.DefaultAllocator) + defer bldr.Release() + + bldr.AppendValueFromString(`"foobar"`) + bldr.AppendNull() + bldr.AppendValueFromString(`{"foo": "bar"}`) + bldr.AppendValueFromString(`42`) + bldr.AppendValueFromString(`true`) + 
bldr.AppendValueFromString(`[1, true, "3", null, {"five": 5}]`) + + storage := bldr.NewArray() + defer storage.Release() + + arr := array.NewExtensionArrayWithStorage(typ, storage) + defer arr.Release() + + assert.Equal(t, 6, arr.Len()) + assert.Equal(t, 1, arr.NullN()) + + jsonArr, ok := arr.(*extensions.JSONArray) + require.True(t, ok) + + rec := array.NewRecord(arrow.NewSchema([]arrow.Field{{Name: "json", Type: typ, Nullable: true}}, nil), []arrow.Array{jsonArr}, 6) + defer rec.Release() + + buf := bytes.NewBuffer([]byte("\n")) // expected output has leading newline for clearer formatting + require.NoError(t, array.RecordToJSON(rec, buf)) + + expectedJSON := ` + {"json":"foobar"} + {"json":null} + {"json":{"foo":"bar"}} + {"json":42} + {"json":true} + {"json":[1,true,"3",null,{"five":5}]} + ` + + expectedJSONLines := strings.Split(expectedJSON, "\n") + actualJSONLines := strings.Split(buf.String(), "\n") + + require.Equal(t, len(expectedJSONLines), len(actualJSONLines)) + for i := range expectedJSONLines { + if strings.TrimSpace(expectedJSONLines[i]) != "" { + require.JSONEq(t, expectedJSONLines[i], actualJSONLines[i]) + } + } + }) + } +} diff --git a/go/arrow/extensions/opaque_test.go b/go/arrow/extensions/opaque_test.go index b6686e97bc027..a0fc8962ce5e4 100644 --- a/go/arrow/extensions/opaque_test.go +++ b/go/arrow/extensions/opaque_test.go @@ -161,9 +161,6 @@ func TestOpaqueTypeMetadataRoundTrip(t *testing.T) { func TestOpaqueTypeBatchRoundTrip(t *testing.T) { typ := extensions.NewOpaqueType(arrow.BinaryTypes.String, "geometry", "adbc.postgresql") - arrow.RegisterExtensionType(typ) - defer arrow.UnregisterExtensionType(typ.ExtensionName()) - storage, _, err := array.FromJSON(memory.DefaultAllocator, arrow.BinaryTypes.String, strings.NewReader(`["foobar", null]`)) require.NoError(t, err) diff --git a/go/arrow/extensions/uuid.go b/go/arrow/extensions/uuid.go new file mode 100644 index 0000000000000..422b9ea118800 --- /dev/null +++ b/go/arrow/extensions/uuid.go @@ -0,0 +1,265 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package extensions + +import ( + "bytes" + "fmt" + "reflect" + "strings" + + "github.com/apache/arrow/go/v18/arrow" + "github.com/apache/arrow/go/v18/arrow/array" + "github.com/apache/arrow/go/v18/arrow/memory" + "github.com/apache/arrow/go/v18/internal/json" + "github.com/apache/arrow/go/v18/parquet/schema" + "github.com/google/uuid" +) + +type UUIDBuilder struct { + *array.ExtensionBuilder +} + +// NewUUIDBuilder creates a new UUIDBuilder, exposing a convenient and efficient interface +// for writing uuid.UUID (or [16]byte) values to the underlying FixedSizeBinary storage array. 
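+//
+// A typical call sequence from client code (illustrative):
+//
+//	bldr := extensions.NewUUIDBuilder(memory.DefaultAllocator)
+//	defer bldr.Release()
+//	bldr.Append(uuid.New())
+//	bldr.AppendNull()
+//	arr := bldr.NewArray().(*extensions.UUIDArray)
+//	defer arr.Release()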
+func NewUUIDBuilder(mem memory.Allocator) *UUIDBuilder { + return &UUIDBuilder{ExtensionBuilder: array.NewExtensionBuilder(mem, NewUUIDType())} +} + +func (b *UUIDBuilder) Append(v uuid.UUID) { + b.AppendBytes(v) +} + +func (b *UUIDBuilder) AppendBytes(v [16]byte) { + b.ExtensionBuilder.Builder.(*array.FixedSizeBinaryBuilder).Append(v[:]) +} + +func (b *UUIDBuilder) UnsafeAppend(v uuid.UUID) { + b.ExtensionBuilder.Builder.(*array.FixedSizeBinaryBuilder).UnsafeAppend(v[:]) +} + +func (b *UUIDBuilder) AppendValueFromString(s string) error { + if s == array.NullValueStr { + b.AppendNull() + return nil + } + + uid, err := uuid.Parse(s) + if err != nil { + return err + } + + b.Append(uid) + return nil +} + +func (b *UUIDBuilder) AppendValues(v []uuid.UUID, valid []bool) { + if len(v) != len(valid) && len(valid) != 0 { + panic("len(v) != len(valid) && len(valid) != 0") + } + + data := make([][]byte, len(v)) + for i := range v { + if len(valid) > 0 && !valid[i] { + continue + } + data[i] = v[i][:] + } + b.ExtensionBuilder.Builder.(*array.FixedSizeBinaryBuilder).AppendValues(data, valid) +} + +func (b *UUIDBuilder) UnmarshalOne(dec *json.Decoder) error { + t, err := dec.Token() + if err != nil { + return err + } + + var val uuid.UUID + switch v := t.(type) { + case string: + val, err = uuid.Parse(v) + if err != nil { + return err + } + case []byte: + val, err = uuid.ParseBytes(v) + if err != nil { + return err + } + case nil: + b.AppendNull() + return nil + default: + return &json.UnmarshalTypeError{ + Value: fmt.Sprint(t), + Type: reflect.TypeOf([]byte{}), + Offset: dec.InputOffset(), + Struct: fmt.Sprintf("FixedSizeBinary[%d]", 16), + } + } + + b.Append(val) + return nil +} + +func (b *UUIDBuilder) Unmarshal(dec *json.Decoder) error { + for dec.More() { + if err := b.UnmarshalOne(dec); err != nil { + return err + } + } + return nil +} + +func (b *UUIDBuilder) UnmarshalJSON(data []byte) error { + dec := json.NewDecoder(bytes.NewReader(data)) + t, err := dec.Token() + if err != nil { + return err + } + + if delim, ok := t.(json.Delim); !ok || delim != '[' { + return fmt.Errorf("uuid builder must unpack from json array, found %s", delim) + } + + return b.Unmarshal(dec) +} + +// UUIDArray is a simple array which is a FixedSizeBinary(16) +type UUIDArray struct { + array.ExtensionArrayBase +} + +func (a *UUIDArray) String() string { + arr := a.Storage().(*array.FixedSizeBinary) + o := new(strings.Builder) + o.WriteString("[") + for i := 0; i < arr.Len(); i++ { + if i > 0 { + o.WriteString(" ") + } + switch { + case a.IsNull(i): + o.WriteString(array.NullValueStr) + default: + fmt.Fprintf(o, "%q", a.Value(i)) + } + } + o.WriteString("]") + return o.String() +} + +func (a *UUIDArray) Value(i int) uuid.UUID { + if a.IsNull(i) { + return uuid.Nil + } + return uuid.Must(uuid.FromBytes(a.Storage().(*array.FixedSizeBinary).Value(i))) +} + +func (a *UUIDArray) Values() []uuid.UUID { + values := make([]uuid.UUID, a.Len()) + for i := range values { + values[i] = a.Value(i) + } + return values +} + +func (a *UUIDArray) ValueStr(i int) string { + switch { + case a.IsNull(i): + return array.NullValueStr + default: + return a.Value(i).String() + } +} + +func (a *UUIDArray) MarshalJSON() ([]byte, error) { + vals := make([]any, a.Len()) + for i := range vals { + vals[i] = a.GetOneForMarshal(i) + } + return json.Marshal(vals) +} + +func (a *UUIDArray) GetOneForMarshal(i int) interface{} { + if a.IsValid(i) { + return a.Value(i) + } + return nil +} + +// UUIDType is a simple extension type that represents a 
FixedSizeBinary(16) +// to be used for representing UUIDs +type UUIDType struct { + arrow.ExtensionBase +} + +// ParquetLogicalType implements pqarrow.ExtensionCustomParquetType. +func (e *UUIDType) ParquetLogicalType() schema.LogicalType { + return schema.UUIDLogicalType{} +} + +// NewUUIDType is a convenience function to create an instance of UUIDType +// with the correct storage type +func NewUUIDType() *UUIDType { + return &UUIDType{ExtensionBase: arrow.ExtensionBase{Storage: &arrow.FixedSizeBinaryType{ByteWidth: 16}}} +} + +// ArrayType returns TypeOf(UUIDArray{}) for constructing UUID arrays +func (*UUIDType) ArrayType() reflect.Type { + return reflect.TypeOf(UUIDArray{}) +} + +func (*UUIDType) ExtensionName() string { + return "arrow.uuid" +} + +func (e *UUIDType) String() string { + return fmt.Sprintf("extension<%s>", e.ExtensionName()) +} + +func (e *UUIDType) MarshalJSON() ([]byte, error) { + return []byte(fmt.Sprintf(`{"name":"%s","metadata":%s}`, e.ExtensionName(), e.Serialize())), nil +} + +func (*UUIDType) Serialize() string { + return "" +} + +// Deserialize expects storageType to be FixedSizeBinaryType{ByteWidth: 16} +func (*UUIDType) Deserialize(storageType arrow.DataType, data string) (arrow.ExtensionType, error) { + if !arrow.TypeEqual(storageType, &arrow.FixedSizeBinaryType{ByteWidth: 16}) { + return nil, fmt.Errorf("invalid storage type for UUIDType: %s", storageType.Name()) + } + return NewUUIDType(), nil +} + +// ExtensionEquals returns true if both extensions have the same name +func (e *UUIDType) ExtensionEquals(other arrow.ExtensionType) bool { + return e.ExtensionName() == other.ExtensionName() +} + +func (*UUIDType) NewBuilder(mem memory.Allocator) array.Builder { + return NewUUIDBuilder(mem) +} + +var ( + _ arrow.ExtensionType = (*UUIDType)(nil) + _ array.CustomExtensionBuilder = (*UUIDType)(nil) + _ array.ExtensionArray = (*UUIDArray)(nil) + _ array.Builder = (*UUIDBuilder)(nil) +) diff --git a/go/arrow/extensions/uuid_test.go b/go/arrow/extensions/uuid_test.go new file mode 100644 index 0000000000000..80c621db2a0d5 --- /dev/null +++ b/go/arrow/extensions/uuid_test.go @@ -0,0 +1,257 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
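+
+// Importing arrow/extensions registers the canonical extension types via
+// init (see extensions.go), so arrow.GetExtensionType("arrow.uuid") is
+// non-nil here without any manual RegisterExtensionType call. A minimal
+// usage sketch for the UUID type (illustrative):
+//
+//	bldr := extensions.NewUUIDBuilder(memory.DefaultAllocator)
+//	bldr.Append(uuid.MustParse("00000000-0000-0000-0000-000000000001"))
+//	arr := bldr.NewArray().(*extensions.UUIDArray)
+//	_ = arr.ValueStr(0) // "00000000-0000-0000-0000-000000000001"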
+ +package extensions_test + +import ( + "bytes" + "fmt" + "strings" + "testing" + + "github.com/apache/arrow/go/v18/arrow" + "github.com/apache/arrow/go/v18/arrow/array" + "github.com/apache/arrow/go/v18/arrow/extensions" + "github.com/apache/arrow/go/v18/arrow/ipc" + "github.com/apache/arrow/go/v18/arrow/memory" + "github.com/apache/arrow/go/v18/internal/json" + "github.com/google/uuid" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +var testUUID = uuid.New() + +func TestUUIDExtensionBuilder(t *testing.T) { + mem := memory.NewCheckedAllocator(memory.DefaultAllocator) + defer mem.AssertSize(t, 0) + builder := extensions.NewUUIDBuilder(mem) + builder.Append(testUUID) + builder.AppendNull() + builder.AppendBytes(testUUID) + arr := builder.NewArray() + defer arr.Release() + arrStr := arr.String() + assert.Equal(t, fmt.Sprintf(`["%[1]s" (null) "%[1]s"]`, testUUID), arrStr) + jsonStr, err := json.Marshal(arr) + assert.NoError(t, err) + + arr1, _, err := array.FromJSON(mem, extensions.NewUUIDType(), bytes.NewReader(jsonStr)) + defer arr1.Release() + assert.NoError(t, err) + assert.True(t, array.Equal(arr1, arr)) + + require.NoError(t, json.Unmarshal(jsonStr, builder)) + arr2 := builder.NewArray() + defer arr2.Release() + assert.True(t, array.Equal(arr2, arr)) +} + +func TestUUIDExtensionRecordBuilder(t *testing.T) { + schema := arrow.NewSchema([]arrow.Field{ + {Name: "uuid", Type: extensions.NewUUIDType()}, + }, nil) + builder := array.NewRecordBuilder(memory.DefaultAllocator, schema) + builder.Field(0).(*extensions.UUIDBuilder).Append(testUUID) + builder.Field(0).(*extensions.UUIDBuilder).AppendNull() + builder.Field(0).(*extensions.UUIDBuilder).Append(testUUID) + record := builder.NewRecord() + b, err := record.MarshalJSON() + require.NoError(t, err) + require.Equal(t, "[{\"uuid\":\""+testUUID.String()+"\"}\n,{\"uuid\":null}\n,{\"uuid\":\""+testUUID.String()+"\"}\n]", string(b)) + record1, _, err := array.RecordFromJSON(memory.DefaultAllocator, schema, bytes.NewReader(b)) + require.NoError(t, err) + require.Equal(t, record, record1) +} + +func TestUUIDStringRoundTrip(t *testing.T) { + // 1. create array + mem := memory.NewCheckedAllocator(memory.DefaultAllocator) + defer mem.AssertSize(t, 0) + + b := extensions.NewUUIDBuilder(mem) + b.Append(uuid.Nil) + b.AppendNull() + b.Append(uuid.NameSpaceURL) + b.AppendNull() + b.Append(testUUID) + + arr := b.NewArray() + defer arr.Release() + + // 2. 
create array via AppendValueFromString + b1 := extensions.NewUUIDBuilder(mem) + defer b1.Release() + + for i := 0; i < arr.Len(); i++ { + assert.NoError(t, b1.AppendValueFromString(arr.ValueStr(i))) + } + + arr1 := b1.NewArray() + defer arr1.Release() + + assert.True(t, array.Equal(arr, arr1)) +} + +func TestUUIDTypeBasics(t *testing.T) { + typ := extensions.NewUUIDType() + + assert.Equal(t, "arrow.uuid", typ.ExtensionName()) + assert.True(t, typ.ExtensionEquals(typ)) + + assert.True(t, arrow.TypeEqual(typ, typ)) + assert.False(t, arrow.TypeEqual(&arrow.FixedSizeBinaryType{ByteWidth: 16}, typ)) + assert.True(t, arrow.TypeEqual(&arrow.FixedSizeBinaryType{ByteWidth: 16}, typ.StorageType())) + + assert.Equal(t, "extension", typ.String()) +} + +func TestUUIDTypeCreateFromArray(t *testing.T) { + typ := extensions.NewUUIDType() + + bldr := array.NewFixedSizeBinaryBuilder(memory.DefaultAllocator, &arrow.FixedSizeBinaryType{ByteWidth: 16}) + defer bldr.Release() + + bldr.Append(testUUID[:]) + bldr.AppendNull() + bldr.Append(testUUID[:]) + + storage := bldr.NewArray() + defer storage.Release() + + arr := array.NewExtensionArrayWithStorage(typ, storage) + defer arr.Release() + + assert.Equal(t, 3, arr.Len()) + assert.Equal(t, 1, arr.NullN()) + + uuidArr, ok := arr.(*extensions.UUIDArray) + require.True(t, ok) + + require.Equal(t, testUUID, uuidArr.Value(0)) + require.Equal(t, uuid.Nil, uuidArr.Value(1)) + require.Equal(t, testUUID, uuidArr.Value(2)) +} + +func TestUUIDTypeBatchIPCRoundTrip(t *testing.T) { + typ := extensions.NewUUIDType() + + bldr := extensions.NewUUIDBuilder(memory.DefaultAllocator) + defer bldr.Release() + + bldr.Append(testUUID) + bldr.AppendNull() + bldr.AppendBytes(testUUID) + + arr := bldr.NewArray() + defer arr.Release() + + batch := array.NewRecord(arrow.NewSchema([]arrow.Field{{Name: "field", Type: typ, Nullable: true}}, nil), + []arrow.Array{arr}, -1) + defer batch.Release() + + var written arrow.Record + { + var buf bytes.Buffer + wr := ipc.NewWriter(&buf, ipc.WithSchema(batch.Schema())) + require.NoError(t, wr.Write(batch)) + require.NoError(t, wr.Close()) + + rdr, err := ipc.NewReader(&buf) + require.NoError(t, err) + written, err = rdr.Read() + require.NoError(t, err) + written.Retain() + defer written.Release() + rdr.Release() + } + + assert.Truef(t, batch.Schema().Equal(written.Schema()), "expected: %s, got: %s", + batch.Schema(), written.Schema()) + + assert.Truef(t, array.RecordEqual(batch, written), "expected: %s, got: %s", + batch, written) +} + +func TestMarshallUUIDArray(t *testing.T) { + bldr := extensions.NewUUIDBuilder(memory.DefaultAllocator) + defer bldr.Release() + + bldr.Append(testUUID) + bldr.AppendNull() + bldr.AppendBytes(testUUID) + + arr := bldr.NewArray() + defer arr.Release() + + assert.Equal(t, 3, arr.Len()) + assert.Equal(t, 1, arr.NullN()) + + uuidArr, ok := arr.(*extensions.UUIDArray) + require.True(t, ok) + + b, err := uuidArr.MarshalJSON() + require.NoError(t, err) + + expectedJSON := fmt.Sprintf(`["%[1]s",null,"%[1]s"]`, testUUID) + require.Equal(t, expectedJSON, string(b)) +} + +func TestUUIDRecordToJSON(t *testing.T) { + typ := extensions.NewUUIDType() + + bldr := extensions.NewUUIDBuilder(memory.DefaultAllocator) + defer bldr.Release() + + uuid1 := uuid.MustParse("8c607ed4-07b2-4b9c-b5eb-c0387357f9ae") + + bldr.Append(uuid1) + bldr.AppendNull() + + // c5f2cbd9-7094-491a-b267-167bb62efe02 + bldr.AppendBytes([16]byte{197, 242, 203, 217, 112, 148, 73, 26, 178, 103, 22, 123, 182, 46, 254, 2}) + + arr := bldr.NewArray() + defer arr.Release() 
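+
+	// arr now holds [uuid1, null, uuid2]: the third value was appended from
+	// its raw [16]byte form via AppendBytes rather than as a uuid.UUID.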
+ + assert.Equal(t, 3, arr.Len()) + assert.Equal(t, 1, arr.NullN()) + + uuidArr, ok := arr.(*extensions.UUIDArray) + require.True(t, ok) + + rec := array.NewRecord(arrow.NewSchema([]arrow.Field{{Name: "uuid", Type: typ, Nullable: true}}, nil), []arrow.Array{uuidArr}, 3) + defer rec.Release() + + buf := bytes.NewBuffer([]byte("\n")) // expected output has leading newline for clearer formatting + require.NoError(t, array.RecordToJSON(rec, buf)) + + expectedJSON := ` + {"uuid":"8c607ed4-07b2-4b9c-b5eb-c0387357f9ae"} + {"uuid":null} + {"uuid":"c5f2cbd9-7094-491a-b267-167bb62efe02"} + ` + + expectedJSONLines := strings.Split(expectedJSON, "\n") + actualJSONLines := strings.Split(buf.String(), "\n") + + require.Equal(t, len(expectedJSONLines), len(actualJSONLines)) + for i := range expectedJSONLines { + if strings.TrimSpace(expectedJSONLines[i]) != "" { + require.JSONEq(t, expectedJSONLines[i], actualJSONLines[i]) + } + } +} diff --git a/go/arrow/internal/flight_integration/scenario.go b/go/arrow/internal/flight_integration/scenario.go index 1528bb05d9daa..b9535002a0a17 100644 --- a/go/arrow/internal/flight_integration/scenario.go +++ b/go/arrow/internal/flight_integration/scenario.go @@ -40,7 +40,6 @@ import ( "github.com/apache/arrow/go/v18/arrow/internal/arrjson" "github.com/apache/arrow/go/v18/arrow/ipc" "github.com/apache/arrow/go/v18/arrow/memory" - "github.com/apache/arrow/go/v18/internal/types" "golang.org/x/xerrors" "google.golang.org/grpc" "google.golang.org/grpc/codes" @@ -161,9 +160,6 @@ func (s *defaultIntegrationTester) RunClient(addr string, opts ...grpc.DialOptio ctx := context.Background() - arrow.RegisterExtensionType(types.NewUUIDType()) - defer arrow.UnregisterExtensionType("uuid") - descr := &flight.FlightDescriptor{ Type: flight.DescriptorPATH, Path: []string{s.path}, diff --git a/go/arrow/ipc/cmd/arrow-json-integration-test/main.go b/go/arrow/ipc/cmd/arrow-json-integration-test/main.go index b3e1dcac14119..c47a091268be9 100644 --- a/go/arrow/ipc/cmd/arrow-json-integration-test/main.go +++ b/go/arrow/ipc/cmd/arrow-json-integration-test/main.go @@ -22,12 +22,10 @@ import ( "log" "os" - "github.com/apache/arrow/go/v18/arrow" "github.com/apache/arrow/go/v18/arrow/array" "github.com/apache/arrow/go/v18/arrow/arrio" "github.com/apache/arrow/go/v18/arrow/internal/arrjson" "github.com/apache/arrow/go/v18/arrow/ipc" - "github.com/apache/arrow/go/v18/internal/types" ) func main() { @@ -50,8 +48,6 @@ func main() { } func runCommand(jsonName, arrowName, mode string, verbose bool) error { - arrow.RegisterExtensionType(types.NewUUIDType()) - if jsonName == "" { return fmt.Errorf("must specify json file name") } diff --git a/go/arrow/ipc/metadata_test.go b/go/arrow/ipc/metadata_test.go index 33bc63c2a0068..14b8da2cf7cf7 100644 --- a/go/arrow/ipc/metadata_test.go +++ b/go/arrow/ipc/metadata_test.go @@ -23,10 +23,10 @@ import ( "github.com/apache/arrow/go/v18/arrow" "github.com/apache/arrow/go/v18/arrow/array" + "github.com/apache/arrow/go/v18/arrow/extensions" "github.com/apache/arrow/go/v18/arrow/internal/dictutils" "github.com/apache/arrow/go/v18/arrow/internal/flatbuf" "github.com/apache/arrow/go/v18/arrow/memory" - "github.com/apache/arrow/go/v18/internal/types" flatbuffers "github.com/google/flatbuffers/go" "github.com/stretchr/testify/assert" ) @@ -169,7 +169,7 @@ func TestRWFooter(t *testing.T) { } func exampleUUID(mem memory.Allocator) arrow.Array { - extType := types.NewUUIDType() + extType := extensions.NewUUIDType() bldr := array.NewExtensionBuilder(mem, extType) defer 
bldr.Release() @@ -184,9 +184,6 @@ func TestUnrecognizedExtensionType(t *testing.T) { pool := memory.NewCheckedAllocator(memory.NewGoAllocator()) defer pool.AssertSize(t, 0) - // register the uuid type - assert.NoError(t, arrow.RegisterExtensionType(types.NewUUIDType())) - extArr := exampleUUID(pool) defer extArr.Release() @@ -205,7 +202,9 @@ func TestUnrecognizedExtensionType(t *testing.T) { // unregister the uuid type before we read back the buffer so it is // unrecognized when reading back the record batch. - assert.NoError(t, arrow.UnregisterExtensionType("uuid")) + assert.NoError(t, arrow.UnregisterExtensionType("arrow.uuid")) + // re-register once the test is complete + defer arrow.RegisterExtensionType(extensions.NewUUIDType()) rdr, err := NewReader(&buf, WithAllocator(pool)) defer rdr.Release() diff --git a/go/internal/types/extension_types.go b/go/internal/types/extension_types.go index 85c64d86bffcb..33ada2d488f71 100644 --- a/go/internal/types/extension_types.go +++ b/go/internal/types/extension_types.go @@ -18,238 +18,15 @@ package types import ( - "bytes" "encoding/binary" "fmt" "reflect" - "strings" "github.com/apache/arrow/go/v18/arrow" "github.com/apache/arrow/go/v18/arrow/array" - "github.com/apache/arrow/go/v18/arrow/memory" - "github.com/apache/arrow/go/v18/internal/json" - "github.com/google/uuid" "golang.org/x/xerrors" ) -var UUID = NewUUIDType() - -type UUIDBuilder struct { - *array.ExtensionBuilder -} - -func NewUUIDBuilder(mem memory.Allocator) *UUIDBuilder { - return &UUIDBuilder{ExtensionBuilder: array.NewExtensionBuilder(mem, NewUUIDType())} -} - -func (b *UUIDBuilder) Append(v uuid.UUID) { - b.ExtensionBuilder.Builder.(*array.FixedSizeBinaryBuilder).Append(v[:]) -} - -func (b *UUIDBuilder) UnsafeAppend(v uuid.UUID) { - b.ExtensionBuilder.Builder.(*array.FixedSizeBinaryBuilder).UnsafeAppend(v[:]) -} - -func (b *UUIDBuilder) AppendValueFromString(s string) error { - if s == array.NullValueStr { - b.AppendNull() - return nil - } - - uid, err := uuid.Parse(s) - if err != nil { - return err - } - - b.Append(uid) - return nil -} - -func (b *UUIDBuilder) AppendValues(v []uuid.UUID, valid []bool) { - if len(v) != len(valid) && len(valid) != 0 { - panic("len(v) != len(valid) && len(valid) != 0") - } - - data := make([][]byte, len(v)) - for i := range v { - if len(valid) > 0 && !valid[i] { - continue - } - data[i] = v[i][:] - } - b.ExtensionBuilder.Builder.(*array.FixedSizeBinaryBuilder).AppendValues(data, valid) -} - -func (b *UUIDBuilder) UnmarshalOne(dec *json.Decoder) error { - t, err := dec.Token() - if err != nil { - return err - } - - var val uuid.UUID - switch v := t.(type) { - case string: - val, err = uuid.Parse(v) - if err != nil { - return err - } - case []byte: - val, err = uuid.ParseBytes(v) - if err != nil { - return err - } - case nil: - b.AppendNull() - return nil - default: - return &json.UnmarshalTypeError{ - Value: fmt.Sprint(t), - Type: reflect.TypeOf([]byte{}), - Offset: dec.InputOffset(), - Struct: fmt.Sprintf("FixedSizeBinary[%d]", 16), - } - } - - b.Append(val) - return nil -} - -func (b *UUIDBuilder) Unmarshal(dec *json.Decoder) error { - for dec.More() { - if err := b.UnmarshalOne(dec); err != nil { - return err - } - } - return nil -} - -func (b *UUIDBuilder) UnmarshalJSON(data []byte) error { - dec := json.NewDecoder(bytes.NewReader(data)) - t, err := dec.Token() - if err != nil { - return err - } - - if delim, ok := t.(json.Delim); !ok || delim != '[' { - return fmt.Errorf("uuid builder must unpack from json array, found %s", delim) - } - - 
return b.Unmarshal(dec) -} - -// UUIDArray is a simple array which is a FixedSizeBinary(16) -type UUIDArray struct { - array.ExtensionArrayBase -} - -func (a *UUIDArray) String() string { - arr := a.Storage().(*array.FixedSizeBinary) - o := new(strings.Builder) - o.WriteString("[") - for i := 0; i < arr.Len(); i++ { - if i > 0 { - o.WriteString(" ") - } - switch { - case a.IsNull(i): - o.WriteString(array.NullValueStr) - default: - fmt.Fprintf(o, "%q", a.Value(i)) - } - } - o.WriteString("]") - return o.String() -} - -func (a *UUIDArray) Value(i int) uuid.UUID { - if a.IsNull(i) { - return uuid.Nil - } - return uuid.Must(uuid.FromBytes(a.Storage().(*array.FixedSizeBinary).Value(i))) -} - -func (a *UUIDArray) ValueStr(i int) string { - switch { - case a.IsNull(i): - return array.NullValueStr - default: - return a.Value(i).String() - } -} - -func (a *UUIDArray) MarshalJSON() ([]byte, error) { - arr := a.Storage().(*array.FixedSizeBinary) - values := make([]interface{}, a.Len()) - for i := 0; i < a.Len(); i++ { - if a.IsValid(i) { - values[i] = uuid.Must(uuid.FromBytes(arr.Value(i))).String() - } - } - return json.Marshal(values) -} - -func (a *UUIDArray) GetOneForMarshal(i int) interface{} { - if a.IsNull(i) { - return nil - } - return a.Value(i) -} - -// UUIDType is a simple extension type that represents a FixedSizeBinary(16) -// to be used for representing UUIDs -type UUIDType struct { - arrow.ExtensionBase -} - -// NewUUIDType is a convenience function to create an instance of UUIDType -// with the correct storage type -func NewUUIDType() *UUIDType { - return &UUIDType{ExtensionBase: arrow.ExtensionBase{Storage: &arrow.FixedSizeBinaryType{ByteWidth: 16}}} -} - -// ArrayType returns TypeOf(UUIDArray{}) for constructing UUID arrays -func (*UUIDType) ArrayType() reflect.Type { - return reflect.TypeOf(UUIDArray{}) -} - -func (*UUIDType) ExtensionName() string { - return "uuid" -} - -func (e *UUIDType) String() string { - return fmt.Sprintf("extension_type", e.Storage) -} - -func (e *UUIDType) MarshalJSON() ([]byte, error) { - return []byte(fmt.Sprintf(`{"name":"%s","metadata":%s}`, e.ExtensionName(), e.Serialize())), nil -} - -// Serialize returns "uuid-serialized" for testing proper metadata passing -func (*UUIDType) Serialize() string { - return "uuid-serialized" -} - -// Deserialize expects storageType to be FixedSizeBinaryType{ByteWidth: 16} and the data to be -// "uuid-serialized" in order to correctly create a UUIDType for testing deserialize. -func (*UUIDType) Deserialize(storageType arrow.DataType, data string) (arrow.ExtensionType, error) { - if data != "uuid-serialized" { - return nil, fmt.Errorf("type identifier did not match: '%s'", data) - } - if !arrow.TypeEqual(storageType, &arrow.FixedSizeBinaryType{ByteWidth: 16}) { - return nil, fmt.Errorf("invalid storage type for UUIDType: %s", storageType.Name()) - } - return NewUUIDType(), nil -} - -// ExtensionEquals returns true if both extensions have the same name -func (e *UUIDType) ExtensionEquals(other arrow.ExtensionType) bool { - return e.ExtensionName() == other.ExtensionName() -} - -func (*UUIDType) NewBuilder(mem memory.Allocator) array.Builder { - return NewUUIDBuilder(mem) -} - // Parametric1Array is a simple int32 array for use with the Parametric1Type // in testing a parameterized user-defined extension type. 
type Parametric1Array struct { @@ -518,14 +295,14 @@ func (SmallintType) ArrayType() reflect.Type { return reflect.TypeOf(SmallintArr func (SmallintType) ExtensionName() string { return "smallint" } -func (SmallintType) Serialize() string { return "smallint" } +func (SmallintType) Serialize() string { return "smallint-serialized" } func (s *SmallintType) ExtensionEquals(other arrow.ExtensionType) bool { return s.Name() == other.Name() } func (SmallintType) Deserialize(storageType arrow.DataType, data string) (arrow.ExtensionType, error) { - if data != "smallint" { + if data != "smallint-serialized" { return nil, fmt.Errorf("type identifier did not match: '%s'", data) } if !arrow.TypeEqual(storageType, arrow.PrimitiveTypes.Int16) { diff --git a/go/internal/types/extension_types_test.go b/go/internal/types/extension_types_test.go deleted file mode 100644 index 65f6353d01be1..0000000000000 --- a/go/internal/types/extension_types_test.go +++ /dev/null @@ -1,95 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package types_test - -import ( - "bytes" - "testing" - - "github.com/apache/arrow/go/v18/arrow" - "github.com/apache/arrow/go/v18/arrow/array" - "github.com/apache/arrow/go/v18/arrow/memory" - "github.com/apache/arrow/go/v18/internal/json" - "github.com/apache/arrow/go/v18/internal/types" - "github.com/google/uuid" - "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/require" -) - -var testUUID = uuid.New() - -func TestUUIDExtensionBuilder(t *testing.T) { - mem := memory.NewCheckedAllocator(memory.DefaultAllocator) - defer mem.AssertSize(t, 0) - builder := types.NewUUIDBuilder(mem) - builder.Append(testUUID) - arr := builder.NewArray() - defer arr.Release() - arrStr := arr.String() - assert.Equal(t, "[\""+testUUID.String()+"\"]", arrStr) - jsonStr, err := json.Marshal(arr) - assert.NoError(t, err) - - arr1, _, err := array.FromJSON(mem, types.NewUUIDType(), bytes.NewReader(jsonStr)) - defer arr1.Release() - assert.NoError(t, err) - assert.Equal(t, arr, arr1) -} - -func TestUUIDExtensionRecordBuilder(t *testing.T) { - schema := arrow.NewSchema([]arrow.Field{ - {Name: "uuid", Type: types.NewUUIDType()}, - }, nil) - builder := array.NewRecordBuilder(memory.DefaultAllocator, schema) - builder.Field(0).(*types.UUIDBuilder).Append(testUUID) - record := builder.NewRecord() - b, err := record.MarshalJSON() - require.NoError(t, err) - require.Equal(t, "[{\"uuid\":\""+testUUID.String()+"\"}\n]", string(b)) - record1, _, err := array.RecordFromJSON(memory.DefaultAllocator, schema, bytes.NewReader(b)) - require.NoError(t, err) - require.Equal(t, record, record1) -} - -func TestUUIDStringRoundTrip(t *testing.T) { - // 1. 
create array - mem := memory.NewCheckedAllocator(memory.DefaultAllocator) - defer mem.AssertSize(t, 0) - - b := types.NewUUIDBuilder(mem) - b.Append(uuid.Nil) - b.AppendNull() - b.Append(uuid.NameSpaceURL) - b.AppendNull() - b.Append(testUUID) - - arr := b.NewArray() - defer arr.Release() - - // 2. create array via AppendValueFromString - b1 := types.NewUUIDBuilder(mem) - defer b1.Release() - - for i := 0; i < arr.Len(); i++ { - assert.NoError(t, b1.AppendValueFromString(arr.ValueStr(i))) - } - - arr1 := b1.NewArray() - defer arr1.Release() - - assert.True(t, array.Equal(arr, arr1)) -} diff --git a/go/parquet/cmd/parquet_reader/main.go b/go/parquet/cmd/parquet_reader/main.go index 6e04f4254f9fa..4e480aeb8660b 100644 --- a/go/parquet/cmd/parquet_reader/main.go +++ b/go/parquet/cmd/parquet_reader/main.go @@ -154,7 +154,7 @@ func main() { if descr.ConvertedType() != schema.ConvertedTypes.None { fmt.Printf("/%s", descr.ConvertedType()) if descr.ConvertedType() == schema.ConvertedTypes.Decimal { - dec := descr.LogicalType().(*schema.DecimalLogicalType) + dec := descr.LogicalType().(schema.DecimalLogicalType) fmt.Printf("(%d,%d)", dec.Precision(), dec.Scale()) } } diff --git a/go/parquet/metadata/app_version.go b/go/parquet/metadata/app_version.go index 887ed79343a42..345e9d440a1ca 100644 --- a/go/parquet/metadata/app_version.go +++ b/go/parquet/metadata/app_version.go @@ -164,7 +164,7 @@ func (v AppVersion) HasCorrectStatistics(coltype parquet.Type, logicalType schem // parquet-cpp-arrow version 4.0.0 fixed Decimal comparisons for creating min/max stats // parquet-cpp also becomes parquet-cpp-arrow as of version 4.0.0 if v.App == "parquet-cpp" || (v.App == "parquet-cpp-arrow" && v.LessThan(parquet1655FixedVersion)) { - if _, ok := logicalType.(*schema.DecimalLogicalType); ok && coltype == parquet.Types.FixedLenByteArray { + if _, ok := logicalType.(schema.DecimalLogicalType); ok && coltype == parquet.Types.FixedLenByteArray { return false } } diff --git a/go/parquet/pqarrow/encode_arrow_test.go b/go/parquet/pqarrow/encode_arrow_test.go index 16282173a685c..a238a78133e55 100644 --- a/go/parquet/pqarrow/encode_arrow_test.go +++ b/go/parquet/pqarrow/encode_arrow_test.go @@ -30,6 +30,7 @@ import ( "github.com/apache/arrow/go/v18/arrow/bitutil" "github.com/apache/arrow/go/v18/arrow/decimal128" "github.com/apache/arrow/go/v18/arrow/decimal256" + "github.com/apache/arrow/go/v18/arrow/extensions" "github.com/apache/arrow/go/v18/arrow/ipc" "github.com/apache/arrow/go/v18/arrow/memory" "github.com/apache/arrow/go/v18/internal/types" @@ -715,16 +716,6 @@ type ParquetIOTestSuite struct { suite.Suite } -func (ps *ParquetIOTestSuite) SetupTest() { - ps.NoError(arrow.RegisterExtensionType(types.NewUUIDType())) -} - -func (ps *ParquetIOTestSuite) TearDownTest() { - if arrow.GetExtensionType("uuid") != nil { - ps.NoError(arrow.UnregisterExtensionType("uuid")) - } -} - func (ps *ParquetIOTestSuite) makeSimpleSchema(typ arrow.DataType, rep parquet.Repetition) *schema.GroupNode { byteWidth := int32(-1) @@ -2053,7 +2044,7 @@ func (ps *ParquetIOTestSuite) TestArrowExtensionTypeRoundTrip() { mem := memory.NewCheckedAllocator(memory.DefaultAllocator) defer mem.AssertSize(ps.T(), 0) - builder := types.NewUUIDBuilder(mem) + builder := extensions.NewUUIDBuilder(mem) builder.Append(uuid.New()) arr := builder.NewArray() defer arr.Release() @@ -2076,22 +2067,23 @@ func (ps *ParquetIOTestSuite) TestArrowUnknownExtensionTypeRoundTrip() { { // Prepare `written` table with the extension type registered. 
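+		// The test-only SmallintType stands in here because the UUID type is
+		// now canonical and registered globally by arrow/extensions, so it can
+		// no longer be unregistered to simulate an unknown extension.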
- extType := types.NewUUIDType() + extType := types.NewSmallintType() bldr := array.NewExtensionBuilder(mem, extType) defer bldr.Release() - bldr.Builder.(*array.FixedSizeBinaryBuilder).AppendValues( - [][]byte{nil, []byte("abcdefghijklmno0"), []byte("abcdefghijklmno1"), []byte("abcdefghijklmno2")}, + bldr.Builder.(*array.Int16Builder).AppendValues( + []int16{0, 0, 1, 2}, []bool{false, true, true, true}) arr := bldr.NewArray() defer arr.Release() - if arrow.GetExtensionType("uuid") != nil { - ps.NoError(arrow.UnregisterExtensionType("uuid")) + if arrow.GetExtensionType("smallint") != nil { + ps.NoError(arrow.UnregisterExtensionType("smallint")) + defer arrow.RegisterExtensionType(extType) } - fld := arrow.Field{Name: "uuid", Type: arr.DataType(), Nullable: true} + fld := arrow.Field{Name: "smallint", Type: arr.DataType(), Nullable: true} cnk := arrow.NewChunked(arr.DataType(), []arrow.Array{arr}) defer arr.Release() // NewChunked written = array.NewTable(arrow.NewSchema([]arrow.Field{fld}, nil), []arrow.Column{*arrow.NewColumn(fld, cnk)}, -1) @@ -2101,16 +2093,16 @@ func (ps *ParquetIOTestSuite) TestArrowUnknownExtensionTypeRoundTrip() { { // Prepare `expected` table with the extension type unregistered in the underlying type. - bldr := array.NewFixedSizeBinaryBuilder(mem, &arrow.FixedSizeBinaryType{ByteWidth: 16}) + bldr := array.NewInt16Builder(mem) defer bldr.Release() bldr.AppendValues( - [][]byte{nil, []byte("abcdefghijklmno0"), []byte("abcdefghijklmno1"), []byte("abcdefghijklmno2")}, + []int16{0, 0, 1, 2}, []bool{false, true, true, true}) arr := bldr.NewArray() defer arr.Release() - fld := arrow.Field{Name: "uuid", Type: arr.DataType(), Nullable: true} + fld := arrow.Field{Name: "smallint", Type: arr.DataType(), Nullable: true} cnk := arrow.NewChunked(arr.DataType(), []arrow.Array{arr}) defer arr.Release() // NewChunked expected = array.NewTable(arrow.NewSchema([]arrow.Field{fld}, nil), []arrow.Column{*arrow.NewColumn(fld, cnk)}, -1) @@ -2147,13 +2139,55 @@ func (ps *ParquetIOTestSuite) TestArrowUnknownExtensionTypeRoundTrip() { ps.Truef(array.Equal(exc, tbc), "expected: %T %s\ngot: %T %s", exc, exc, tbc, tbc) expectedMd := arrow.MetadataFrom(map[string]string{ - ipc.ExtensionTypeKeyName: "uuid", - ipc.ExtensionMetadataKeyName: "uuid-serialized", + ipc.ExtensionTypeKeyName: "smallint", + ipc.ExtensionMetadataKeyName: "smallint-serialized", "PARQUET:field_id": "-1", }) ps.Truef(expectedMd.Equal(tbl.Column(0).Field().Metadata), "expected: %v\ngot: %v", expectedMd, tbl.Column(0).Field().Metadata) } +func (ps *ParquetIOTestSuite) TestArrowExtensionTypeLogicalType() { + mem := memory.NewCheckedAllocator(memory.DefaultAllocator) + defer mem.AssertSize(ps.T(), 0) + + jsonType, err := extensions.NewJSONType(arrow.BinaryTypes.String) + ps.NoError(err) + + sch := arrow.NewSchema([]arrow.Field{ + {Name: "uuid", Type: extensions.NewUUIDType()}, + {Name: "json", Type: jsonType}, + }, + nil, + ) + bldr := array.NewRecordBuilder(mem, sch) + defer bldr.Release() + + bldr.Field(0).(*extensions.UUIDBuilder).Append(uuid.New()) + bldr.Field(1).(*array.ExtensionBuilder).AppendValueFromString(`{"hello": ["world", 2, true], "world": null}`) + rec := bldr.NewRecord() + defer rec.Release() + + var buf bytes.Buffer + wr, err := pqarrow.NewFileWriter( + sch, + &buf, + parquet.NewWriterProperties(), + pqarrow.DefaultWriterProps(), + ) + ps.Require().NoError(err) + + ps.Require().NoError(wr.Write(rec)) + ps.Require().NoError(wr.Close()) + + rdr, err := file.NewParquetReader(bytes.NewReader(buf.Bytes())) + 
ps.Require().NoError(err) + defer rdr.Close() + + pqSchema := rdr.MetaData().Schema + ps.True(pqSchema.Column(0).LogicalType().Equals(schema.UUIDLogicalType{})) + ps.True(pqSchema.Column(1).LogicalType().Equals(schema.JSONLogicalType{})) +} + func TestWriteTableMemoryAllocation(t *testing.T) { mem := memory.NewCheckedAllocator(memory.DefaultAllocator) sc := arrow.NewSchema([]arrow.Field{ @@ -2163,7 +2197,7 @@ func TestWriteTableMemoryAllocation(t *testing.T) { arrow.Field{Name: "i64", Type: arrow.PrimitiveTypes.Int64, Nullable: true}, arrow.Field{Name: "f64", Type: arrow.PrimitiveTypes.Float64, Nullable: true})}, {Name: "arr_i64", Type: arrow.ListOf(arrow.PrimitiveTypes.Int64)}, - {Name: "uuid", Type: types.NewUUIDType(), Nullable: true}, + {Name: "uuid", Type: extensions.NewUUIDType(), Nullable: true}, }, nil) bld := array.NewRecordBuilder(mem, sc) @@ -2176,7 +2210,7 @@ func TestWriteTableMemoryAllocation(t *testing.T) { abld := bld.Field(3).(*array.ListBuilder) abld.Append(true) abld.ValueBuilder().(*array.Int64Builder).Append(2) - bld.Field(4).(*types.UUIDBuilder).Append(uuid.MustParse("00000000-0000-0000-0000-000000000001")) + bld.Field(4).(*extensions.UUIDBuilder).Append(uuid.MustParse("00000000-0000-0000-0000-000000000001")) rec := bld.NewRecord() bld.Release() diff --git a/go/parquet/pqarrow/path_builder_test.go b/go/parquet/pqarrow/path_builder_test.go index 9bbae426b8a46..364f836d0bbca 100644 --- a/go/parquet/pqarrow/path_builder_test.go +++ b/go/parquet/pqarrow/path_builder_test.go @@ -22,8 +22,8 @@ import ( "github.com/apache/arrow/go/v18/arrow" "github.com/apache/arrow/go/v18/arrow/array" + "github.com/apache/arrow/go/v18/arrow/extensions" "github.com/apache/arrow/go/v18/arrow/memory" - "github.com/apache/arrow/go/v18/internal/types" "github.com/google/uuid" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" @@ -364,12 +364,12 @@ func TestNestedExtensionListsWithSomeNulls(t *testing.T) { mem := memory.NewCheckedAllocator(memory.DefaultAllocator) defer mem.AssertSize(t, 0) - listType := arrow.ListOf(types.NewUUIDType()) + listType := arrow.ListOf(extensions.NewUUIDType()) bldr := array.NewListBuilder(mem, listType) defer bldr.Release() nestedBldr := bldr.ValueBuilder().(*array.ListBuilder) - vb := nestedBldr.ValueBuilder().(*types.UUIDBuilder) + vb := nestedBldr.ValueBuilder().(*extensions.UUIDBuilder) uuid1 := uuid.New() uuid3 := uuid.New() diff --git a/go/parquet/pqarrow/schema.go b/go/parquet/pqarrow/schema.go index ce5cc6f905084..4882077671f0f 100644 --- a/go/parquet/pqarrow/schema.go +++ b/go/parquet/pqarrow/schema.go @@ -25,7 +25,6 @@ import ( "github.com/apache/arrow/go/v18/arrow" "github.com/apache/arrow/go/v18/arrow/decimal128" "github.com/apache/arrow/go/v18/arrow/flight" - "github.com/apache/arrow/go/v18/arrow/ipc" "github.com/apache/arrow/go/v18/arrow/memory" "github.com/apache/arrow/go/v18/parquet" "github.com/apache/arrow/go/v18/parquet/file" @@ -120,6 +119,15 @@ func (sm *SchemaManifest) GetFieldIndices(indices []int) ([]int, error) { return ret, nil } +// ExtensionCustomParquetType is an interface that Arrow ExtensionTypes may implement +// to specify the target LogicalType to use when converting to Parquet. +// +// The PrimitiveType is not configurable, and is determined by a fixed mapping from +// the extension's StorageType to a Parquet type (see getParquetType in pqarrow source). 
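+//
+// For example, the JSON extension type implements it as:
+//
+//	func (b *JSONType) ParquetLogicalType() schema.LogicalType {
+//		return schema.JSONLogicalType{}
+//	}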
+type ExtensionCustomParquetType interface { + ParquetLogicalType() schema.LogicalType +} + func isDictionaryReadSupported(dt arrow.DataType) bool { return arrow.IsBinaryLike(dt.ID()) } @@ -250,104 +258,14 @@ func structToNode(typ *arrow.StructType, name string, nullable bool, props *parq } func fieldToNode(name string, field arrow.Field, props *parquet.WriterProperties, arrprops ArrowWriterProperties) (schema.Node, error) { - var ( - logicalType schema.LogicalType = schema.NoLogicalType{} - typ parquet.Type - repType = repFromNullable(field.Nullable) - length = -1 - precision = -1 - scale = -1 - err error - ) + repType := repFromNullable(field.Nullable) + // Handle complex types i.e. GroupNodes switch field.Type.ID() { case arrow.NULL: - typ = parquet.Types.Int32 - logicalType = &schema.NullLogicalType{} if repType != parquet.Repetitions.Optional { return nil, xerrors.New("nulltype arrow field must be nullable") } - case arrow.BOOL: - typ = parquet.Types.Boolean - case arrow.UINT8: - typ = parquet.Types.Int32 - logicalType = schema.NewIntLogicalType(8, false) - case arrow.INT8: - typ = parquet.Types.Int32 - logicalType = schema.NewIntLogicalType(8, true) - case arrow.UINT16: - typ = parquet.Types.Int32 - logicalType = schema.NewIntLogicalType(16, false) - case arrow.INT16: - typ = parquet.Types.Int32 - logicalType = schema.NewIntLogicalType(16, true) - case arrow.UINT32: - typ = parquet.Types.Int32 - logicalType = schema.NewIntLogicalType(32, false) - case arrow.INT32: - typ = parquet.Types.Int32 - logicalType = schema.NewIntLogicalType(32, true) - case arrow.UINT64: - typ = parquet.Types.Int64 - logicalType = schema.NewIntLogicalType(64, false) - case arrow.INT64: - typ = parquet.Types.Int64 - logicalType = schema.NewIntLogicalType(64, true) - case arrow.FLOAT32: - typ = parquet.Types.Float - case arrow.FLOAT64: - typ = parquet.Types.Double - case arrow.STRING, arrow.LARGE_STRING: - logicalType = schema.StringLogicalType{} - fallthrough - case arrow.BINARY, arrow.LARGE_BINARY: - typ = parquet.Types.ByteArray - case arrow.FIXED_SIZE_BINARY: - typ = parquet.Types.FixedLenByteArray - length = field.Type.(*arrow.FixedSizeBinaryType).ByteWidth - case arrow.DECIMAL, arrow.DECIMAL256: - dectype := field.Type.(arrow.DecimalType) - precision = int(dectype.GetPrecision()) - scale = int(dectype.GetScale()) - - if props.StoreDecimalAsInteger() && 1 <= precision && precision <= 18 { - if precision <= 9 { - typ = parquet.Types.Int32 - } else { - typ = parquet.Types.Int64 - } - } else { - typ = parquet.Types.FixedLenByteArray - length = int(DecimalSize(int32(precision))) - } - - logicalType = schema.NewDecimalLogicalType(int32(precision), int32(scale)) - case arrow.DATE32: - typ = parquet.Types.Int32 - logicalType = schema.DateLogicalType{} - case arrow.DATE64: - typ = parquet.Types.Int32 - logicalType = schema.DateLogicalType{} - case arrow.TIMESTAMP: - typ, logicalType, err = getTimestampMeta(field.Type.(*arrow.TimestampType), props, arrprops) - if err != nil { - return nil, err - } - case arrow.TIME32: - typ = parquet.Types.Int32 - logicalType = schema.NewTimeLogicalType(true, schema.TimeUnitMillis) - case arrow.TIME64: - typ = parquet.Types.Int64 - timeType := field.Type.(*arrow.Time64Type) - if timeType.Unit == arrow.Nanosecond { - logicalType = schema.NewTimeLogicalType(true, schema.TimeUnitNanos) - } else { - logicalType = schema.NewTimeLogicalType(true, schema.TimeUnitMicros) - } - case arrow.FLOAT16: - typ = parquet.Types.FixedLenByteArray - length = arrow.Float16SizeBytes - logicalType = 
schema.Float16LogicalType{} case arrow.STRUCT: return structToNode(field.Type.(*arrow.StructType), field.Name, field.Nullable, props, arrprops) case arrow.FIXED_SIZE_LIST, arrow.LIST: @@ -369,16 +287,6 @@ func fieldToNode(name string, field arrow.Field, props *parquet.WriterProperties dictType := field.Type.(*arrow.DictionaryType) return fieldToNode(name, arrow.Field{Name: name, Type: dictType.ValueType, Nullable: field.Nullable, Metadata: field.Metadata}, props, arrprops) - case arrow.EXTENSION: - return fieldToNode(name, arrow.Field{ - Name: name, - Type: field.Type.(arrow.ExtensionType).StorageType(), - Nullable: field.Nullable, - Metadata: arrow.MetadataFrom(map[string]string{ - ipc.ExtensionTypeKeyName: field.Type.(arrow.ExtensionType).ExtensionName(), - ipc.ExtensionMetadataKeyName: field.Type.(arrow.ExtensionType).Serialize(), - }), - }, props, arrprops) case arrow.MAP: mapType := field.Type.(*arrow.MapType) keyNode, err := fieldToNode("key", mapType.KeyField(), props, arrprops) @@ -402,8 +310,12 @@ func fieldToNode(name string, field arrow.Field, props *parquet.WriterProperties }, -1) } return schema.MapOf(field.Name, keyNode, valueNode, repFromNullable(field.Nullable), -1) - default: - return nil, fmt.Errorf("%w: support for %s", arrow.ErrNotImplemented, field.Type.ID()) + } + + // Not a GroupNode + typ, logicalType, length, err := getParquetType(field.Type, props, arrprops) + if err != nil { + return nil, err } return schema.NewPrimitiveNodeLogical(name, repType, logicalType, typ, length, fieldIDFromMeta(field.Metadata)) @@ -472,7 +384,7 @@ func (s schemaTree) RecordLeaf(leaf *SchemaField) { s.manifest.ColIndexToField[leaf.ColIndex] = leaf } -func arrowInt(log *schema.IntLogicalType) (arrow.DataType, error) { +func arrowInt(log schema.IntLogicalType) (arrow.DataType, error) { switch log.BitWidth() { case 8: if log.IsSigned() { @@ -499,7 +411,7 @@ func arrowInt(log *schema.IntLogicalType) (arrow.DataType, error) { } } -func arrowTime32(logical *schema.TimeLogicalType) (arrow.DataType, error) { +func arrowTime32(logical schema.TimeLogicalType) (arrow.DataType, error) { if logical.TimeUnit() == schema.TimeUnitMillis { return arrow.FixedWidthTypes.Time32ms, nil } @@ -507,7 +419,7 @@ func arrowTime32(logical *schema.TimeLogicalType) (arrow.DataType, error) { return nil, xerrors.New(logical.String() + " cannot annotate a time32") } -func arrowTime64(logical *schema.TimeLogicalType) (arrow.DataType, error) { +func arrowTime64(logical schema.TimeLogicalType) (arrow.DataType, error) { switch logical.TimeUnit() { case schema.TimeUnitMicros: return arrow.FixedWidthTypes.Time64us, nil @@ -518,7 +430,7 @@ func arrowTime64(logical *schema.TimeLogicalType) (arrow.DataType, error) { } } -func arrowTimestamp(logical *schema.TimestampLogicalType) (arrow.DataType, error) { +func arrowTimestamp(logical schema.TimestampLogicalType) (arrow.DataType, error) { tz := "" // ConvertedTypes are adjusted to UTC per backward compatibility guidelines @@ -539,7 +451,7 @@ func arrowTimestamp(logical *schema.TimestampLogicalType) (arrow.DataType, error } } -func arrowDecimal(logical *schema.DecimalLogicalType) arrow.DataType { +func arrowDecimal(logical schema.DecimalLogicalType) arrow.DataType { if logical.Precision() <= decimal128.MaxPrecision { return &arrow.Decimal128Type{Precision: logical.Precision(), Scale: logical.Scale()} } @@ -550,11 +462,11 @@ func arrowFromInt32(logical schema.LogicalType) (arrow.DataType, error) { switch logtype := logical.(type) { case schema.NoLogicalType: return 
arrow.PrimitiveTypes.Int32, nil - case *schema.TimeLogicalType: + case schema.TimeLogicalType: return arrowTime32(logtype) - case *schema.DecimalLogicalType: + case schema.DecimalLogicalType: return arrowDecimal(logtype), nil - case *schema.IntLogicalType: + case schema.IntLogicalType: return arrowInt(logtype) case schema.DateLogicalType: return arrow.FixedWidthTypes.Date32, nil @@ -569,13 +481,13 @@ func arrowFromInt64(logical schema.LogicalType) (arrow.DataType, error) { } switch logtype := logical.(type) { - case *schema.IntLogicalType: + case schema.IntLogicalType: return arrowInt(logtype) - case *schema.DecimalLogicalType: + case schema.DecimalLogicalType: return arrowDecimal(logtype), nil - case *schema.TimeLogicalType: + case schema.TimeLogicalType: return arrowTime64(logtype) - case *schema.TimestampLogicalType: + case schema.TimestampLogicalType: return arrowTimestamp(logtype) default: return nil, xerrors.New(logical.String() + " cannot annotate int64") @@ -586,7 +498,7 @@ func arrowFromByteArray(logical schema.LogicalType) (arrow.DataType, error) { switch logtype := logical.(type) { case schema.StringLogicalType: return arrow.BinaryTypes.String, nil - case *schema.DecimalLogicalType: + case schema.DecimalLogicalType: return arrowDecimal(logtype), nil case schema.NoLogicalType, schema.EnumLogicalType, @@ -600,7 +512,7 @@ func arrowFromByteArray(logical schema.LogicalType) (arrow.DataType, error) { func arrowFromFLBA(logical schema.LogicalType, length int) (arrow.DataType, error) { switch logtype := logical.(type) { - case *schema.DecimalLogicalType: + case schema.DecimalLogicalType: return arrowDecimal(logtype), nil case schema.NoLogicalType, schema.IntervalLogicalType, schema.UUIDLogicalType: return &arrow.FixedSizeBinaryType{ByteWidth: int(length)}, nil @@ -611,6 +523,84 @@ func arrowFromFLBA(logical schema.LogicalType, length int) (arrow.DataType, erro } } +func getParquetType(typ arrow.DataType, props *parquet.WriterProperties, arrprops ArrowWriterProperties) (parquet.Type, schema.LogicalType, int, error) { + switch typ.ID() { + case arrow.NULL: + return parquet.Types.Int32, schema.NullLogicalType{}, -1, nil + case arrow.BOOL: + return parquet.Types.Boolean, schema.NoLogicalType{}, -1, nil + case arrow.UINT8: + return parquet.Types.Int32, schema.NewIntLogicalType(8, false), -1, nil + case arrow.INT8: + return parquet.Types.Int32, schema.NewIntLogicalType(8, true), -1, nil + case arrow.UINT16: + return parquet.Types.Int32, schema.NewIntLogicalType(16, false), -1, nil + case arrow.INT16: + return parquet.Types.Int32, schema.NewIntLogicalType(16, true), -1, nil + case arrow.UINT32: + return parquet.Types.Int32, schema.NewIntLogicalType(32, false), -1, nil + case arrow.INT32: + return parquet.Types.Int32, schema.NewIntLogicalType(32, true), -1, nil + case arrow.UINT64: + return parquet.Types.Int64, schema.NewIntLogicalType(64, false), -1, nil + case arrow.INT64: + return parquet.Types.Int64, schema.NewIntLogicalType(64, true), -1, nil + case arrow.FLOAT32: + return parquet.Types.Float, schema.NoLogicalType{}, -1, nil + case arrow.FLOAT64: + return parquet.Types.Double, schema.NoLogicalType{}, -1, nil + case arrow.STRING, arrow.LARGE_STRING: + return parquet.Types.ByteArray, schema.StringLogicalType{}, -1, nil + case arrow.BINARY, arrow.LARGE_BINARY: + return parquet.Types.ByteArray, schema.NoLogicalType{}, -1, nil + case arrow.FIXED_SIZE_BINARY: + return parquet.Types.FixedLenByteArray, schema.NoLogicalType{}, typ.(*arrow.FixedSizeBinaryType).ByteWidth, nil + case arrow.DECIMAL, 
arrow.DECIMAL256: + dectype := typ.(arrow.DecimalType) + precision := int(dectype.GetPrecision()) + scale := int(dectype.GetScale()) + + if !props.StoreDecimalAsInteger() || precision > 18 { + return parquet.Types.FixedLenByteArray, schema.NewDecimalLogicalType(int32(precision), int32(scale)), int(DecimalSize(int32(precision))), nil + } + + pqType := parquet.Types.Int32 + if precision > 9 { + pqType = parquet.Types.Int64 + } + + return pqType, schema.NoLogicalType{}, -1, nil + case arrow.DATE32: + return parquet.Types.Int32, schema.DateLogicalType{}, -1, nil + case arrow.DATE64: + return parquet.Types.Int32, schema.DateLogicalType{}, -1, nil + case arrow.TIMESTAMP: + pqType, logicalType, err := getTimestampMeta(typ.(*arrow.TimestampType), props, arrprops) + return pqType, logicalType, -1, err + case arrow.TIME32: + return parquet.Types.Int32, schema.NewTimeLogicalType(true, schema.TimeUnitMillis), -1, nil + case arrow.TIME64: + pqTimeUnit := schema.TimeUnitMicros + if typ.(*arrow.Time64Type).Unit == arrow.Nanosecond { + pqTimeUnit = schema.TimeUnitNanos + } + + return parquet.Types.Int64, schema.NewTimeLogicalType(true, pqTimeUnit), -1, nil + case arrow.FLOAT16: + return parquet.Types.FixedLenByteArray, schema.Float16LogicalType{}, arrow.Float16SizeBytes, nil + case arrow.EXTENSION: + storageType := typ.(arrow.ExtensionType).StorageType() + pqType, logicalType, length, err := getParquetType(storageType, props, arrprops) + if withCustomType, ok := typ.(ExtensionCustomParquetType); ok { + logicalType = withCustomType.ParquetLogicalType() + } + + return pqType, logicalType, length, err + default: + return parquet.Type(0), nil, 0, fmt.Errorf("%w: support for %s", arrow.ErrNotImplemented, typ.ID()) + } +} + func getArrowType(physical parquet.Type, logical schema.LogicalType, typeLen int) (arrow.DataType, error) { if !logical.IsValid() || logical.Equals(schema.NullLogicalType{}) { return arrow.Null, nil diff --git a/go/parquet/pqarrow/schema_test.go b/go/parquet/pqarrow/schema_test.go index 24b031c174bf2..528200fd0e7d9 100644 --- a/go/parquet/pqarrow/schema_test.go +++ b/go/parquet/pqarrow/schema_test.go @@ -21,10 +21,10 @@ import ( "testing" "github.com/apache/arrow/go/v18/arrow" + "github.com/apache/arrow/go/v18/arrow/extensions" "github.com/apache/arrow/go/v18/arrow/flight" "github.com/apache/arrow/go/v18/arrow/ipc" "github.com/apache/arrow/go/v18/arrow/memory" - "github.com/apache/arrow/go/v18/internal/types" "github.com/apache/arrow/go/v18/parquet" "github.com/apache/arrow/go/v18/parquet/metadata" "github.com/apache/arrow/go/v18/parquet/pqarrow" @@ -34,7 +34,7 @@ import ( ) func TestGetOriginSchemaBase64(t *testing.T) { - uuidType := types.NewUUIDType() + uuidType := extensions.NewUUIDType() md := arrow.NewMetadata([]string{"PARQUET:field_id"}, []string{"-1"}) extMd := arrow.NewMetadata([]string{ipc.ExtensionMetadataKeyName, ipc.ExtensionTypeKeyName, "PARQUET:field_id"}, []string{uuidType.Serialize(), uuidType.ExtensionName(), "-1"}) origArrSc := arrow.NewSchema([]arrow.Field{ @@ -44,10 +44,6 @@ func TestGetOriginSchemaBase64(t *testing.T) { }, nil) arrSerializedSc := flight.SerializeSchema(origArrSc, memory.DefaultAllocator) - if err := arrow.RegisterExtensionType(uuidType); err != nil { - t.Fatal(err) - } - defer arrow.UnregisterExtensionType(uuidType.ExtensionName()) pqschema, err := pqarrow.ToParquet(origArrSc, nil, pqarrow.DefaultWriterProps()) require.NoError(t, err) @@ -71,11 +67,7 @@ func TestGetOriginSchemaBase64(t *testing.T) { } func TestGetOriginSchemaUnregisteredExtension(t 
*testing.T) { - uuidType := types.NewUUIDType() - if err := arrow.RegisterExtensionType(uuidType); err != nil { - t.Fatal(err) - } - + uuidType := extensions.NewUUIDType() md := arrow.NewMetadata([]string{"PARQUET:field_id"}, []string{"-1"}) origArrSc := arrow.NewSchema([]arrow.Field{ {Name: "f1", Type: arrow.BinaryTypes.String, Metadata: md}, @@ -90,6 +82,7 @@ func TestGetOriginSchemaUnregisteredExtension(t *testing.T) { kv.Append("ARROW:schema", base64.StdEncoding.EncodeToString(arrSerializedSc)) arrow.UnregisterExtensionType(uuidType.ExtensionName()) + defer arrow.RegisterExtensionType(uuidType) arrsc, err := pqarrow.FromParquet(pqschema, nil, kv) require.NoError(t, err) diff --git a/go/parquet/schema/converted_types.go b/go/parquet/schema/converted_types.go index 5fc10f61cebc1..b2b6f50cbf682 100644 --- a/go/parquet/schema/converted_types.go +++ b/go/parquet/schema/converted_types.go @@ -113,13 +113,9 @@ func (p ConvertedType) ToLogicalType(convertedDecimal DecimalMetadata) LogicalTy case ConvertedTypes.TimeMicros: return NewTimeLogicalType(true /* adjustedToUTC */, TimeUnitMicros) case ConvertedTypes.TimestampMillis: - t := NewTimestampLogicalType(true /* adjustedToUTC */, TimeUnitMillis) - t.(*TimestampLogicalType).fromConverted = true - return t + return NewTimestampLogicalTypeWithOpts(WithTSIsAdjustedToUTC(), WithTSTimeUnitType(TimeUnitMillis), WithTSFromConverted()) case ConvertedTypes.TimestampMicros: - t := NewTimestampLogicalType(true /* adjustedToUTC */, TimeUnitMicros) - t.(*TimestampLogicalType).fromConverted = true - return t + return NewTimestampLogicalTypeWithOpts(WithTSIsAdjustedToUTC(), WithTSTimeUnitType(TimeUnitMicros), WithTSFromConverted()) case ConvertedTypes.Interval: return IntervalLogicalType{} case ConvertedTypes.Int8: diff --git a/go/parquet/schema/logical_types.go b/go/parquet/schema/logical_types.go index e8adce1ca140e..fa46ea0172f76 100644 --- a/go/parquet/schema/logical_types.go +++ b/go/parquet/schema/logical_types.go @@ -45,21 +45,21 @@ func getLogicalType(l *format.LogicalType) LogicalType { case l.IsSetENUM(): return EnumLogicalType{} case l.IsSetDECIMAL(): - return &DecimalLogicalType{typ: l.DECIMAL} + return DecimalLogicalType{typ: l.DECIMAL} case l.IsSetDATE(): return DateLogicalType{} case l.IsSetTIME(): if timeUnitFromThrift(l.TIME.Unit) == TimeUnitUnknown { panic("parquet: TimeUnit must be one of MILLIS, MICROS, or NANOS for Time logical type") } - return &TimeLogicalType{typ: l.TIME} + return TimeLogicalType{typ: l.TIME} case l.IsSetTIMESTAMP(): if timeUnitFromThrift(l.TIMESTAMP.Unit) == TimeUnitUnknown { panic("parquet: TimeUnit must be one of MILLIS, MICROS, or NANOS for Timestamp logical type") } - return &TimestampLogicalType{typ: l.TIMESTAMP} + return TimestampLogicalType{typ: l.TIMESTAMP} case l.IsSetINTEGER(): - return &IntLogicalType{typ: l.INTEGER} + return IntLogicalType{typ: l.INTEGER} case l.IsSetUNKNOWN(): return NullLogicalType{} case l.IsSetJSON(): @@ -344,7 +344,7 @@ func NewDecimalLogicalType(precision int32, scale int32) LogicalType { if scale < 0 || scale > precision { panic("parquet: scale must be a non-negative integer that does not exceed precision for decimal logical type") } - return &DecimalLogicalType{typ: &format.DecimalType{Precision: precision, Scale: scale}} + return DecimalLogicalType{typ: &format.DecimalType{Precision: precision, Scale: scale}} } // DecimalLogicalType is used to represent a decimal value of a given @@ -405,7 +405,7 @@ func (t DecimalLogicalType) toThrift() *format.LogicalType { } func (t 
DecimalLogicalType) Equals(rhs LogicalType) bool { - other, ok := rhs.(*DecimalLogicalType) + other, ok := rhs.(DecimalLogicalType) if !ok { return false } @@ -509,7 +509,7 @@ func createTimeUnit(unit TimeUnitType) *format.TimeUnit { // NewTimeLogicalType returns a time type of the given unit. func NewTimeLogicalType(isAdjustedToUTC bool, unit TimeUnitType) LogicalType { - return &TimeLogicalType{typ: &format.TimeType{ + return TimeLogicalType{typ: &format.TimeType{ IsAdjustedToUTC: isAdjustedToUTC, Unit: createTimeUnit(unit), }} @@ -584,7 +584,7 @@ func (t TimeLogicalType) toThrift() *format.LogicalType { } func (t TimeLogicalType) Equals(rhs LogicalType) bool { - other, ok := rhs.(*TimeLogicalType) + other, ok := rhs.(TimeLogicalType) if !ok { return false } @@ -595,7 +595,7 @@ func (t TimeLogicalType) Equals(rhs LogicalType) bool { // NewTimestampLogicalType returns a logical timestamp type with "forceConverted" // set to false func NewTimestampLogicalType(isAdjustedToUTC bool, unit TimeUnitType) LogicalType { - return &TimestampLogicalType{ + return TimestampLogicalType{ typ: &format.TimestampType{ IsAdjustedToUTC: isAdjustedToUTC, Unit: createTimeUnit(unit), @@ -608,7 +608,7 @@ func NewTimestampLogicalType(isAdjustedToUTC bool, unit TimeUnitType) LogicalTyp // NewTimestampLogicalTypeForce returns a timestamp logical type with // "forceConverted" set to true func NewTimestampLogicalTypeForce(isAdjustedToUTC bool, unit TimeUnitType) LogicalType { - return &TimestampLogicalType{ + return TimestampLogicalType{ typ: &format.TimestampType{ IsAdjustedToUTC: isAdjustedToUTC, Unit: createTimeUnit(unit), @@ -654,14 +654,14 @@ func WithTSFromConverted() TimestampOpt { // // TimestampType Unit defaults to milliseconds (TimeUnitMillis) func NewTimestampLogicalTypeWithOpts(opts ...TimestampOpt) LogicalType { - ts := &TimestampLogicalType{ + ts := TimestampLogicalType{ typ: &format.TimestampType{ Unit: createTimeUnit(TimeUnitMillis), // default to milliseconds }, } for _, o := range opts { - o(ts) + o(&ts) } return ts @@ -760,7 +760,7 @@ func (t TimestampLogicalType) toThrift() *format.LogicalType { } func (t TimestampLogicalType) Equals(rhs LogicalType) bool { - other, ok := rhs.(*TimestampLogicalType) + other, ok := rhs.(TimestampLogicalType) if !ok { return false } @@ -778,7 +778,7 @@ func NewIntLogicalType(bitWidth int8, signed bool) LogicalType { default: panic("parquet: bit width must be exactly 8, 16, 32, or 64 for Int logical type") } - return &IntLogicalType{ + return IntLogicalType{ typ: &format.IntType{ BitWidth: bitWidth, IsSigned: signed, @@ -864,7 +864,7 @@ func (t IntLogicalType) toThrift() *format.LogicalType { } func (t IntLogicalType) Equals(rhs LogicalType) bool { - other, ok := rhs.(*IntLogicalType) + other, ok := rhs.(IntLogicalType) if !ok { return false } diff --git a/go/parquet/schema/logical_types_test.go b/go/parquet/schema/logical_types_test.go index e33925966e178..395d1504182fe 100644 --- a/go/parquet/schema/logical_types_test.go +++ b/go/parquet/schema/logical_types_test.go @@ -38,18 +38,18 @@ func TestConvertedLogicalEquivalences(t *testing.T) { {"list", schema.ConvertedTypes.List, schema.NewListLogicalType(), schema.NewListLogicalType()}, {"enum", schema.ConvertedTypes.Enum, schema.EnumLogicalType{}, schema.EnumLogicalType{}}, {"date", schema.ConvertedTypes.Date, schema.DateLogicalType{}, schema.DateLogicalType{}}, - {"timemilli", schema.ConvertedTypes.TimeMillis, schema.NewTimeLogicalType(true /* adjustedToUTC */, schema.TimeUnitMillis), &schema.TimeLogicalType{}}, - 
{"timemicro", schema.ConvertedTypes.TimeMicros, schema.NewTimeLogicalType(true /* adjustedToUTC */, schema.TimeUnitMicros), &schema.TimeLogicalType{}}, - {"timestampmilli", schema.ConvertedTypes.TimestampMillis, schema.NewTimestampLogicalType(true /* adjustedToUTC */, schema.TimeUnitMillis), &schema.TimestampLogicalType{}}, - {"timestampmicro", schema.ConvertedTypes.TimestampMicros, schema.NewTimestampLogicalType(true /* adjustedToUTC */, schema.TimeUnitMicros), &schema.TimestampLogicalType{}}, - {"uint8", schema.ConvertedTypes.Uint8, schema.NewIntLogicalType(8 /* bitWidth */, false /* signed */), &schema.IntLogicalType{}}, - {"uint16", schema.ConvertedTypes.Uint16, schema.NewIntLogicalType(16 /* bitWidth */, false /* signed */), &schema.IntLogicalType{}}, - {"uint32", schema.ConvertedTypes.Uint32, schema.NewIntLogicalType(32 /* bitWidth */, false /* signed */), &schema.IntLogicalType{}}, - {"uint64", schema.ConvertedTypes.Uint64, schema.NewIntLogicalType(64 /* bitWidth */, false /* signed */), &schema.IntLogicalType{}}, - {"int8", schema.ConvertedTypes.Int8, schema.NewIntLogicalType(8 /* bitWidth */, true /* signed */), &schema.IntLogicalType{}}, - {"int16", schema.ConvertedTypes.Int16, schema.NewIntLogicalType(16 /* bitWidth */, true /* signed */), &schema.IntLogicalType{}}, - {"int32", schema.ConvertedTypes.Int32, schema.NewIntLogicalType(32 /* bitWidth */, true /* signed */), &schema.IntLogicalType{}}, - {"int64", schema.ConvertedTypes.Int64, schema.NewIntLogicalType(64 /* bitWidth */, true /* signed */), &schema.IntLogicalType{}}, + {"timemilli", schema.ConvertedTypes.TimeMillis, schema.NewTimeLogicalType(true /* adjustedToUTC */, schema.TimeUnitMillis), schema.TimeLogicalType{}}, + {"timemicro", schema.ConvertedTypes.TimeMicros, schema.NewTimeLogicalType(true /* adjustedToUTC */, schema.TimeUnitMicros), schema.TimeLogicalType{}}, + {"timestampmilli", schema.ConvertedTypes.TimestampMillis, schema.NewTimestampLogicalType(true /* adjustedToUTC */, schema.TimeUnitMillis), schema.TimestampLogicalType{}}, + {"timestampmicro", schema.ConvertedTypes.TimestampMicros, schema.NewTimestampLogicalType(true /* adjustedToUTC */, schema.TimeUnitMicros), schema.TimestampLogicalType{}}, + {"uint8", schema.ConvertedTypes.Uint8, schema.NewIntLogicalType(8 /* bitWidth */, false /* signed */), schema.IntLogicalType{}}, + {"uint16", schema.ConvertedTypes.Uint16, schema.NewIntLogicalType(16 /* bitWidth */, false /* signed */), schema.IntLogicalType{}}, + {"uint32", schema.ConvertedTypes.Uint32, schema.NewIntLogicalType(32 /* bitWidth */, false /* signed */), schema.IntLogicalType{}}, + {"uint64", schema.ConvertedTypes.Uint64, schema.NewIntLogicalType(64 /* bitWidth */, false /* signed */), schema.IntLogicalType{}}, + {"int8", schema.ConvertedTypes.Int8, schema.NewIntLogicalType(8 /* bitWidth */, true /* signed */), schema.IntLogicalType{}}, + {"int16", schema.ConvertedTypes.Int16, schema.NewIntLogicalType(16 /* bitWidth */, true /* signed */), schema.IntLogicalType{}}, + {"int32", schema.ConvertedTypes.Int32, schema.NewIntLogicalType(32 /* bitWidth */, true /* signed */), schema.IntLogicalType{}}, + {"int64", schema.ConvertedTypes.Int64, schema.NewIntLogicalType(64 /* bitWidth */, true /* signed */), schema.IntLogicalType{}}, {"json", schema.ConvertedTypes.JSON, schema.JSONLogicalType{}, schema.JSONLogicalType{}}, {"bson", schema.ConvertedTypes.BSON, schema.BSONLogicalType{}, schema.BSONLogicalType{}}, {"interval", schema.ConvertedTypes.Interval, schema.IntervalLogicalType{}, schema.IntervalLogicalType{}}, 
@@ -72,8 +72,8 @@ func TestConvertedLogicalEquivalences(t *testing.T) { fromMake := schema.NewDecimalLogicalType(10, 4) assert.IsType(t, fromMake, fromConverted) assert.True(t, fromConverted.Equals(fromMake)) - assert.IsType(t, &schema.DecimalLogicalType{}, fromConverted) - assert.IsType(t, &schema.DecimalLogicalType{}, fromMake) + assert.IsType(t, schema.DecimalLogicalType{}, fromConverted) + assert.IsType(t, schema.DecimalLogicalType{}, fromMake) assert.True(t, schema.NewDecimalLogicalType(16, 0).Equals(schema.NewDecimalLogicalType(16, 0))) }) } @@ -160,12 +160,12 @@ func TestNewTypeIncompatibility(t *testing.T) { {"uuid", schema.UUIDLogicalType{}, schema.UUIDLogicalType{}}, {"float16", schema.Float16LogicalType{}, schema.Float16LogicalType{}}, {"null", schema.NullLogicalType{}, schema.NullLogicalType{}}, - {"not-utc-time_milli", schema.NewTimeLogicalType(false /* adjustedToUTC */, schema.TimeUnitMillis), &schema.TimeLogicalType{}}, - {"not-utc-time-micro", schema.NewTimeLogicalType(false /* adjustedToUTC */, schema.TimeUnitMicros), &schema.TimeLogicalType{}}, - {"not-utc-time-nano", schema.NewTimeLogicalType(false /* adjustedToUTC */, schema.TimeUnitNanos), &schema.TimeLogicalType{}}, - {"utc-time-nano", schema.NewTimeLogicalType(true /* adjustedToUTC */, schema.TimeUnitNanos), &schema.TimeLogicalType{}}, - {"not-utc-timestamp-nano", schema.NewTimestampLogicalType(false /* adjustedToUTC */, schema.TimeUnitNanos), &schema.TimestampLogicalType{}}, - {"utc-timestamp-nano", schema.NewTimestampLogicalType(true /* adjustedToUTC */, schema.TimeUnitNanos), &schema.TimestampLogicalType{}}, + {"not-utc-time_milli", schema.NewTimeLogicalType(false /* adjustedToUTC */, schema.TimeUnitMillis), schema.TimeLogicalType{}}, + {"not-utc-time-micro", schema.NewTimeLogicalType(false /* adjustedToUTC */, schema.TimeUnitMicros), schema.TimeLogicalType{}}, + {"not-utc-time-nano", schema.NewTimeLogicalType(false /* adjustedToUTC */, schema.TimeUnitNanos), schema.TimeLogicalType{}}, + {"utc-time-nano", schema.NewTimeLogicalType(true /* adjustedToUTC */, schema.TimeUnitNanos), schema.TimeLogicalType{}}, + {"not-utc-timestamp-nano", schema.NewTimestampLogicalType(false /* adjustedToUTC */, schema.TimeUnitNanos), schema.TimestampLogicalType{}}, + {"utc-timestamp-nano", schema.NewTimestampLogicalType(true /* adjustedToUTC */, schema.TimeUnitNanos), schema.TimestampLogicalType{}}, } for _, tt := range tests { diff --git a/go/parquet/schema/schema_element_test.go b/go/parquet/schema/schema_element_test.go index 7da55ce93abe6..e427ba6485e64 100644 --- a/go/parquet/schema/schema_element_test.go +++ b/go/parquet/schema/schema_element_test.go @@ -192,7 +192,7 @@ func (s *SchemaElementConstructionSuite) TestSimple() { func (s *SchemaElementConstructionSuite) reconstructDecimal(c schemaElementConstructArgs) *decimalSchemaElementConstruction { ret := s.reconstruct(c) - dec := c.logical.(*DecimalLogicalType) + dec := c.logical.(DecimalLogicalType) return &decimalSchemaElementConstruction{*ret, int(dec.Precision()), int(dec.Scale())} } @@ -359,7 +359,7 @@ func (s *SchemaElementConstructionSuite) TestTemporal() { func (s *SchemaElementConstructionSuite) reconstructInteger(c schemaElementConstructArgs) *intSchemaElementConstruction { base := s.reconstruct(c) - l := c.logical.(*IntLogicalType) + l := c.logical.(IntLogicalType) return &intSchemaElementConstruction{ *base, l.BitWidth(), From 82ecf3e6ed8cb58a08d600041617ce85c9bdb7c1 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Thu, 22 Aug 2024 22:57:14 +0200 Subject: [PATCH 
018/186] MINOR: [CI][C++][Python] Fix CUDA builds on git main (#43789)

On the CUDA self-hosted runners, we need to use legacy `docker-compose` on all
Archery Docker invocations, including the "image push" step. This is because
the Docker client version on those runners is too old to accept the `--file`
option to the `compose` subcommand.

This is a follow-up to https://github.com/apache/arrow/pull/43586 . The image
push step cannot easily be verified in a PR, hence this second PR.

Authored-by: Antoine Pitrou
Signed-off-by: Sutou Kouhei
---
 dev/tasks/docker-tests/github.cuda.yml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/dev/tasks/docker-tests/github.cuda.yml b/dev/tasks/docker-tests/github.cuda.yml
index 9c7adf53a6f70..8c04da8a91a4f 100644
--- a/dev/tasks/docker-tests/github.cuda.yml
+++ b/dev/tasks/docker-tests/github.cuda.yml
@@ -26,6 +26,8 @@ jobs:
     runs-on: ['self-hosted', 'cuda']
     {{ macros.github_set_env(env) }}
     timeout-minutes: {{ timeout|default(60) }}
+    env:
+      ARCHERY_USE_LEGACY_DOCKER_COMPOSE: 1
     steps:
       {{ macros.github_checkout_arrow(fetch_depth=fetch_depth|default(1))|indent }}
       # python 3.8 is installed on the runner, no need to install
@@ -34,7 +36,6 @@
       - name: Execute Docker Build
         shell: bash
         env:
-          ARCHERY_USE_LEGACY_DOCKER_COMPOSE: 1
           {{ macros.github_set_sccache_envvars()|indent(8) }}
         run: |
           archery docker run \

From bad064f705ec9fc72efac2d13a1fc3fac6d3d137 Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani
Date: Thu, 22 Aug 2024 14:08:26 -0700
Subject: [PATCH 019/186] MINOR: [C++] Ensure setting the default CMAKE_BUILD_TYPE (#43794)

### Rationale for this change

The current logic for detecting whether `CMAKE_BUILD_TYPE` is set is
incorrect. The variable is never fully undefined: by default, when it is not
set explicitly, it is actually set to the empty string. Therefore, the
condition that must be checked is not whether the variable is defined, but
whether it evaluates to a truthy value (i.e. is a non-empty string).

I consider this a minor change, so I have not opened an associated issue.

### What changes are included in this PR?

This PR changes `if(NOT DEFINED CMAKE_BUILD_TYPE)` to
`if(NOT CMAKE_BUILD_TYPE)`.

### Are these changes tested?

Since this fixes a particular CMake build scenario, I am not sure whether a
test is merited, or where one would be added.

### Are there any user-facing changes?

No.
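To illustrate the distinction, a minimal sketch (not from the Arrow sources)
of both checks on a fresh single-config build tree, where CMake itself
initializes `CMAKE_BUILD_TYPE` as an empty cache entry:

```cmake
if(NOT DEFINED CMAKE_BUILD_TYPE)
  # Never reached in practice: the cache entry already exists, merely empty.
  set(CMAKE_BUILD_TYPE Release)
endif()

if(NOT CMAKE_BUILD_TYPE)
  # Reached when the variable is unset *or* empty, the intended default case.
  set(CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build.")
endif()
```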
Authored-by: Vyas Ramasubramani Signed-off-by: Sutou Kouhei --- cpp/CMakeLists.txt | 2 +- cpp/examples/minimal_build/CMakeLists.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index a1e3138da9e0b..5ead9e4b063cd 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -84,7 +84,7 @@ set(ARROW_VERSION "18.0.0-SNAPSHOT") string(REGEX MATCH "^[0-9]+\\.[0-9]+\\.[0-9]+" ARROW_BASE_VERSION "${ARROW_VERSION}") # if no build type is specified, default to release builds -if(NOT DEFINED CMAKE_BUILD_TYPE) +if(NOT CMAKE_BUILD_TYPE) set(CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build.") diff --git a/cpp/examples/minimal_build/CMakeLists.txt b/cpp/examples/minimal_build/CMakeLists.txt index b4a7cde938c87..95dad34221add 100644 --- a/cpp/examples/minimal_build/CMakeLists.txt +++ b/cpp/examples/minimal_build/CMakeLists.txt @@ -30,7 +30,7 @@ endif() # We require a C++17 compliant compiler set(CMAKE_CXX_STANDARD_REQUIRED ON) -if(NOT DEFINED CMAKE_BUILD_TYPE) +if(NOT CMAKE_BUILD_TYPE) set(CMAKE_BUILD_TYPE Release) endif() From 53b15b61691dde1ea86e14b7a2216fa0a26f8054 Mon Sep 17 00:00:00 2001 From: Joel Lubinitsky <33523178+joellubi@users.noreply.github.com> Date: Fri, 23 Aug 2024 16:17:29 -0400 Subject: [PATCH 020/186] MINOR: [Go] Fix Flakey TestRowsPrematureCloseDuringNextLoop Test (#43804) ### Rationale for this change Fixes a race condition in rows initialization that has been causing intermittent test failures. ### What changes are included in this PR? Split query and init context. Update test to check for failure _after_ reading rows. ### Are these changes tested? Yes. ### Are there any user-facing changes? No. Authored-by: Joel Lubinitsky Signed-off-by: Joel Lubinitsky --- go/arrow/flight/flightsql/driver/driver.go | 10 ++++++---- go/arrow/flight/flightsql/driver/driver_test.go | 2 +- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/go/arrow/flight/flightsql/driver/driver.go b/go/arrow/flight/flightsql/driver/driver.go index 0f2b02deaca7c..0513fe1ecd346 100644 --- a/go/arrow/flight/flightsql/driver/driver.go +++ b/go/arrow/flight/flightsql/driver/driver.go @@ -266,13 +266,14 @@ func (s *Stmt) QueryContext(ctx context.Context, args []driver.NamedValue) (driv return nil, err } + execCtx := ctx if _, set := ctx.Deadline(); !set && s.timeout > 0 { var cancel context.CancelFunc - ctx, cancel = context.WithTimeout(ctx, s.timeout) + execCtx, cancel = context.WithTimeout(ctx, s.timeout) defer cancel() } - info, err := s.stmt.Execute(ctx) + info, err := s.stmt.Execute(execCtx) if err != nil { return nil, err } @@ -497,13 +498,14 @@ func (c *Connection) QueryContext(ctx context.Context, query string, args []driv return nil, driver.ErrSkip } + execCtx := ctx if _, set := ctx.Deadline(); !set && c.timeout > 0 { var cancel context.CancelFunc - ctx, cancel = context.WithTimeout(ctx, c.timeout) + execCtx, cancel = context.WithTimeout(ctx, c.timeout) defer cancel() } - info, err := c.client.Execute(ctx, query) + info, err := c.client.Execute(execCtx, query) if err != nil { return nil, err } diff --git a/go/arrow/flight/flightsql/driver/driver_test.go b/go/arrow/flight/flightsql/driver/driver_test.go index e5060ccbe33d0..c00dfe3c5d9a0 100644 --- a/go/arrow/flight/flightsql/driver/driver_test.go +++ b/go/arrow/flight/flightsql/driver/driver_test.go @@ -626,7 +626,6 @@ func (s *SqlTestSuite) TestRowsPrematureCloseDuringNextLoop() { rows, err := db.QueryContext(context.TODO(), sqlSelectAll) require.NoError(t, err) 
require.NotNil(t, rows)
-	require.NoError(t, rows.Err())

	const closeAfterNRows = 10
	var (
@@ -645,6 +644,7 @@ func (s *SqlTestSuite) TestRowsPrematureCloseDuringNextLoop() {
			require.NoError(t, rows.Close())
		}
	}
+	require.NoError(t, rows.Err())

	require.Equal(t, closeAfterNRows, i)

From cb645a1b27dd66fddb88458c939e2851f9dadf35 Mon Sep 17 00:00:00 2001
From: Sutou Kouhei
Date: Sat, 24 Aug 2024 06:08:18 +0900
Subject: [PATCH 021/186] GH-43802: [GLib] Add `GAFlightRecordBatchWriter` (#43803)

### Rationale for this change

This is needed to implement `DoPut`.

### What changes are included in this PR?

We can't add tests for it because it's an abstract class.

I'm not sure whether `is_owner` is needed here, as it is for
`GAFlightRecordBatchReader`. `is_owner` may be removed later if we find that
it's unnecessary.

### Are these changes tested?

No.

### Are there any user-facing changes?

Yes.

`GAFlightRecordBatchWriter` is a new public API.

* GitHub Issue: #43802

Authored-by: Sutou Kouhei
Signed-off-by: Sutou Kouhei
---
 c_glib/arrow-flight-glib/common.cpp | 198 ++++++++++++++++++++++++++--
 c_glib/arrow-flight-glib/common.h   |  32 +++++
 c_glib/arrow-flight-glib/common.hpp |   4 +
 3 files changed, 224 insertions(+), 10 deletions(-)

diff --git a/c_glib/arrow-flight-glib/common.cpp b/c_glib/arrow-flight-glib/common.cpp
index efc544f10cf66..f7eea08c264b3 100644
--- a/c_glib/arrow-flight-glib/common.cpp
+++ b/c_glib/arrow-flight-glib/common.cpp
@@ -48,7 +48,11 @@ G_BEGIN_DECLS
 *
 * #GAFlightStreamChunk is a class for a chunk in stream.
 *
- * #GAFlightRecordBatchReader is a class for reading record batches.
+ * #GAFlightRecordBatchReader is an abstract class for reading record
+ * batches with metadata.
+ *
+ * #GAFlightRecordBatchWriter is an abstract class for
+ * writing record batches with metadata.
 *
 * Since: 5.0.0
 */
@@ -1172,13 +1176,13 @@ typedef struct GAFlightRecordBatchReaderPrivate_
 } GAFlightRecordBatchReaderPrivate;

 enum {
-  PROP_READER = 1,
-  PROP_IS_OWNER,
+  PROP_RECORD_BATCH_READER_READER = 1,
+  PROP_RECORD_BATCH_READER_IS_OWNER,
 };

-G_DEFINE_TYPE_WITH_PRIVATE(GAFlightRecordBatchReader,
-                           gaflight_record_batch_reader,
-                           G_TYPE_OBJECT)
+G_DEFINE_ABSTRACT_TYPE_WITH_PRIVATE(GAFlightRecordBatchReader,
+                                    gaflight_record_batch_reader,
+                                    G_TYPE_OBJECT)

 #define GAFLIGHT_RECORD_BATCH_READER_GET_PRIVATE(obj) \
   static_cast( \
@@ -1204,11 +1208,11 @@ gaflight_record_batch_reader_set_property(GObject *object,
   auto priv = GAFLIGHT_RECORD_BATCH_READER_GET_PRIVATE(object);

   switch (prop_id) {
-  case PROP_READER:
+  case PROP_RECORD_BATCH_READER_READER:
     priv->reader =
       static_cast(g_value_get_pointer(value));
     break;
-  case PROP_IS_OWNER:
+  case PROP_RECORD_BATCH_READER_IS_OWNER:
     priv->is_owner = g_value_get_boolean(value);
     break;
   default:
@@ -1236,7 +1240,7 @@ gaflight_record_batch_reader_class_init(GAFlightRecordBatchReaderClass *klass)
     nullptr,
     nullptr,
     static_cast(G_PARAM_WRITABLE | G_PARAM_CONSTRUCT_ONLY));
-  g_object_class_install_property(gobject_class, PROP_READER, spec);
+  g_object_class_install_property(gobject_class, PROP_RECORD_BATCH_READER_READER, spec);

   spec = g_param_spec_boolean(
     "is-owner",
@@ -1244,7 +1248,7 @@
     nullptr,
     TRUE,
     static_cast(G_PARAM_WRITABLE | G_PARAM_CONSTRUCT_ONLY));
-  g_object_class_install_property(gobject_class, PROP_IS_OWNER, spec);
+  g_object_class_install_property(gobject_class, PROP_RECORD_BATCH_READER_IS_OWNER, spec);
 }

 /**
@@ -1296,6 +1300,173 @@ gaflight_record_batch_reader_read_all(GAFlightRecordBatchReader *reader, GError
   }
 }

+typedef struct GAFlightRecordBatchWriterPrivate_
+{
+  arrow::flight::MetadataRecordBatchWriter *writer;
+  bool is_owner;
+} GAFlightRecordBatchWriterPrivate;
+
+enum {
+  PROP_RECORD_BATCH_WRITER_WRITER = 1,
+  PROP_RECORD_BATCH_WRITER_IS_OWNER,
+};
+
+G_DEFINE_ABSTRACT_TYPE_WITH_PRIVATE(GAFlightRecordBatchWriter,
+                                    gaflight_record_batch_writer,
+                                    GARROW_TYPE_RECORD_BATCH_WRITER)
+
+#define GAFLIGHT_RECORD_BATCH_WRITER_GET_PRIVATE(object) \
+  static_cast( \
+    gaflight_record_batch_writer_get_instance_private( \
+      GAFLIGHT_RECORD_BATCH_WRITER(object)))
+
+static void
+gaflight_record_batch_writer_finalize(GObject *object)
+{
+  auto priv = GAFLIGHT_RECORD_BATCH_WRITER_GET_PRIVATE(object);
+  if (priv->is_owner) {
+    delete priv->writer;
+  }
+  G_OBJECT_CLASS(gaflight_record_batch_writer_parent_class)->finalize(object);
+}
+
+static void
+gaflight_record_batch_writer_set_property(GObject *object,
+                                          guint prop_id,
+                                          const GValue *value,
+                                          GParamSpec *pspec)
+{
+  auto priv = GAFLIGHT_RECORD_BATCH_WRITER_GET_PRIVATE(object);
+
+  switch (prop_id) {
+  case PROP_RECORD_BATCH_WRITER_WRITER:
+    priv->writer =
+      static_cast(g_value_get_pointer(value));
+    break;
+  case PROP_RECORD_BATCH_WRITER_IS_OWNER:
+    priv->is_owner = g_value_get_boolean(value);
+    break;
+  default:
+    G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec);
+    break;
+  }
+}
+
+static void
+gaflight_record_batch_writer_init(GAFlightRecordBatchWriter *object)
+{
+}
+
+static void
+gaflight_record_batch_writer_class_init(GAFlightRecordBatchWriterClass *klass)
+{
+  auto gobject_class = G_OBJECT_CLASS(klass);
+
+  gobject_class->finalize = gaflight_record_batch_writer_finalize;
+  gobject_class->set_property = gaflight_record_batch_writer_set_property;
+
+  GParamSpec *spec;
+  spec = g_param_spec_pointer(
+    "writer",
nullptr, + nullptr, + static_cast(G_PARAM_WRITABLE | G_PARAM_CONSTRUCT_ONLY)); + g_object_class_install_property(gobject_class, PROP_RECORD_BATCH_WRITER_WRITER, spec); + + spec = g_param_spec_boolean( + "is-owner", + nullptr, + nullptr, + TRUE, + static_cast(G_PARAM_WRITABLE | G_PARAM_CONSTRUCT_ONLY)); + g_object_class_install_property(gobject_class, PROP_RECORD_BATCH_WRITER_IS_OWNER, spec); +} + +/** + * gaflight_record_batch_writer_begin: + * @writer: A #GAFlightRecordBatchWriter. + * @schema: A #GArrowSchema. + * @options: (nullable): A #GArrowWriteOptions. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Begins writing data with the given schema. Only used with + * `DoExchange`. + * + * Returns: %TRUE on success, %FALSE on error. + * + * Since: 18.0.0 + */ +gboolean +gaflight_record_batch_writer_begin(GAFlightRecordBatchWriter *writer, + GArrowSchema *schema, + GArrowWriteOptions *options, + GError **error) +{ + auto flight_writer = gaflight_record_batch_writer_get_raw(writer); + auto arrow_schema = garrow_schema_get_raw(schema); + arrow::ipc::IpcWriteOptions arrow_write_options; + if (options) { + arrow_write_options = *garrow_write_options_get_raw(options); + } else { + arrow_write_options = arrow::ipc::IpcWriteOptions::Defaults(); + } + return garrow::check(error, + flight_writer->Begin(arrow_schema, arrow_write_options), + "[flight-record-batch-writer][begin]"); +} + +/** + * gaflight_record_batch_writer_write_metadata: + * @writer: A #GAFlightRecordBatchWriter. + * @metadata: A #GArrowBuffer. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Write metadata. + * + * Returns: %TRUE on success, %FALSE on error. + * + * Since: 18.0.0 + */ +gboolean +gaflight_record_batch_writer_write_metadata(GAFlightRecordBatchWriter *writer, + GArrowBuffer *metadata, + GError **error) +{ + auto flight_writer = gaflight_record_batch_writer_get_raw(writer); + auto arrow_metadata = garrow_buffer_get_raw(metadata); + return garrow::check(error, + flight_writer->WriteMetadata(arrow_metadata), + "[flight-record-batch-writer][write-metadata]"); +} + +/** + * gaflight_record_batch_writer_write: + * @writer: A #GAFlightRecordBatchWriter. + * @record_batch: A #GArrowRecordBatch. + * @metadata: (nullable): A #GArrowBuffer. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Write a record batch with metadata. + * + * Returns: %TRUE on success, %FALSE on error. 
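+ *
+ * A minimal usage sketch (error handling abbreviated; the `writer`,
+ * `record_batch` and `metadata` variables are assumed to already exist):
+ *
+ * |[<!-- language="C" -->
+ * GError *error = NULL;
+ * if (!gaflight_record_batch_writer_write(writer, record_batch, metadata, &error)) {
+ *   g_printerr("write failed: %s\n", error->message);
+ *   g_clear_error(&error);
+ * }
+ * ]|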
+ * + * Since: 18.0.0 + */ +gboolean +gaflight_record_batch_writer_write(GAFlightRecordBatchWriter *writer, + GArrowRecordBatch *record_batch, + GArrowBuffer *metadata, + GError **error) +{ + auto flight_writer = gaflight_record_batch_writer_get_raw(writer); + auto arrow_record_batch = garrow_record_batch_get_raw(record_batch); + auto arrow_metadata = garrow_buffer_get_raw(metadata); + return garrow::check( + error, + flight_writer->WriteWithMetadata(*arrow_record_batch, arrow_metadata), + "[flight-record-batch-writer][write]"); +} + G_END_DECLS GAFlightCriteria * @@ -1428,3 +1599,10 @@ gaflight_record_batch_reader_get_raw(GAFlightRecordBatchReader *reader) auto priv = GAFLIGHT_RECORD_BATCH_READER_GET_PRIVATE(reader); return priv->reader; } + +arrow::flight::MetadataRecordBatchWriter * +gaflight_record_batch_writer_get_raw(GAFlightRecordBatchWriter *writer) +{ + auto priv = GAFLIGHT_RECORD_BATCH_WRITER_GET_PRIVATE(writer); + return priv->writer; +} diff --git a/c_glib/arrow-flight-glib/common.h b/c_glib/arrow-flight-glib/common.h index b1d89f79c357e..91c828caabb36 100644 --- a/c_glib/arrow-flight-glib/common.h +++ b/c_glib/arrow-flight-glib/common.h @@ -232,4 +232,36 @@ GAFLIGHT_AVAILABLE_IN_6_0 GArrowTable * gaflight_record_batch_reader_read_all(GAFlightRecordBatchReader *reader, GError **error); +#define GAFLIGHT_TYPE_RECORD_BATCH_WRITER (gaflight_record_batch_writer_get_type()) +GAFLIGHT_AVAILABLE_IN_18_0 +G_DECLARE_DERIVABLE_TYPE(GAFlightRecordBatchWriter, + gaflight_record_batch_writer, + GAFLIGHT, + RECORD_BATCH_WRITER, + GArrowRecordBatchWriter) +struct _GAFlightRecordBatchWriterClass +{ + GArrowRecordBatchWriterClass parent_class; +}; + +GAFLIGHT_AVAILABLE_IN_18_0 +gboolean +gaflight_record_batch_writer_begin(GAFlightRecordBatchWriter *writer, + GArrowSchema *schema, + GArrowWriteOptions *options, + GError **error); + +GAFLIGHT_AVAILABLE_IN_18_0 +gboolean +gaflight_record_batch_writer_write_metadata(GAFlightRecordBatchWriter *writer, + GArrowBuffer *metadata, + GError **error); + +GAFLIGHT_AVAILABLE_IN_18_0 +gboolean +gaflight_record_batch_writer_write(GAFlightRecordBatchWriter *writer, + GArrowRecordBatch *record_batch, + GArrowBuffer *metadata, + GError **error); + G_END_DECLS diff --git a/c_glib/arrow-flight-glib/common.hpp b/c_glib/arrow-flight-glib/common.hpp index db56fff579baf..ae5a7703397dd 100644 --- a/c_glib/arrow-flight-glib/common.hpp +++ b/c_glib/arrow-flight-glib/common.hpp @@ -79,3 +79,7 @@ gaflight_stream_chunk_get_raw(GAFlightStreamChunk *chunk); GAFLIGHT_EXTERN arrow::flight::MetadataRecordBatchReader * gaflight_record_batch_reader_get_raw(GAFlightRecordBatchReader *reader); + +GAFLIGHT_EXTERN +arrow::flight::MetadataRecordBatchWriter * +gaflight_record_batch_writer_get_raw(GAFlightRecordBatchWriter *writer); From 146b4e9669071984c883ec5791676638014bd655 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Sat, 24 Aug 2024 06:22:26 +0900 Subject: [PATCH 022/186] GH-43743: [CI][Docs] Ensure creating build directory (#43744) ### Rationale for this change It's used as a volume. If it doesn't exist, `docker compose` reports an error: Error response from daemon: invalid mount config for type "bind": bind source path does not exist: /home/runner/work/crossbow/crossbow/build/ ### What changes are included in this PR? * Create build directory * Move required `-v $PWD/build/:/build/` to `docs/github.linux.yml` ### Are these changes tested? Yes. ### Are there any user-facing changes? No. 
* GitHub Issue: #43743 Authored-by: Sutou Kouhei Signed-off-by: Sutou Kouhei --- dev/tasks/docs/github.linux.yml | 4 +++- dev/tasks/tasks.yml | 4 +--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/dev/tasks/docs/github.linux.yml b/dev/tasks/docs/github.linux.yml index 8ab8a593c3ef3..5863d68d2c828 100644 --- a/dev/tasks/docs/github.linux.yml +++ b/dev/tasks/docs/github.linux.yml @@ -34,8 +34,10 @@ jobs: env: ARROW_JAVA_SKIP_GIT_PLUGIN: true run: | + mkdir -p build archery docker run \ -e SETUPTOOLS_SCM_PRETEND_VERSION="{{ arrow.no_rc_version }}" \ + -v $PWD/build/:/build/ \ {{ flags|default("") }} \ {{ image }} \ {{ command|default("") }} @@ -45,7 +47,7 @@ jobs: ref: {{ default_branch|default("main") }} path: crossbow fetch-depth: 1 - {% if publish %} + {% if publish %} - name: Prepare Docs Preview run: | # build files are created by the docker user diff --git a/dev/tasks/tasks.yml b/dev/tasks/tasks.yml index 60114d6930878..cae34c3231381 100644 --- a/dev/tasks/tasks.yml +++ b/dev/tasks/tasks.yml @@ -1487,7 +1487,7 @@ tasks: image: debian-go {% endfor %} - # be sure to update binary-task.rb when upgrading ubuntu + # be sure to update binary-task.rb when upgrading Debian test-debian-12-docs: ci: github template: docs/github.linux.yml @@ -1495,7 +1495,6 @@ tasks: env: JDK: 17 pr_number: Unset - flags: "-v $PWD/build/:/build/" image: debian-docs publish: false artifacts: @@ -1621,6 +1620,5 @@ tasks: env: JDK: 17 pr_number: Unset - flags: "-v $PWD/build/:/build/" image: debian-docs publish: true From e61c105c73dfabb51d5afc972ff21cc5326b3d93 Mon Sep 17 00:00:00 2001 From: Vibhatha Lakmal Abeykoon Date: Sat, 24 Aug 2024 07:07:09 +0530 Subject: [PATCH 023/186] GH-41584: [Java] ListView Implementation for C Data Interface (#43686) ### Rationale for this change C Data Interface is missing `ListView` and `LargeListView` after recently merging core functionalities. Also closes; - [x] https://github.com/apache/arrow/issues/41585 ### What changes are included in this PR? This PR includes C Data interface related component additions to `ListView` and `LargeListView` along with the corresponding test cases. ### Are these changes tested? Yes ### Are there any user-facing changes? 
No * GitHub Issue: #41584 Authored-by: Vibhatha Abeykoon Signed-off-by: David Li --- dev/archery/archery/integration/datagen.py | 1 - .../arrow/c/BufferImportTypeVisitor.java | 14 +- .../main/java/org/apache/arrow/c/Format.java | 8 ++ .../org/apache/arrow/c/RoundtripTest.java | 42 ++++++ java/c/src/test/python/integration_tests.py | 47 ++++++ .../BaseLargeRepeatedValueViewVector.java | 29 ++-- .../complex/BaseRepeatedValueViewVector.java | 30 ++-- .../vector/complex/LargeListViewVector.java | 10 +- .../arrow/vector/complex/ListViewVector.java | 6 +- .../arrow/vector/TestLargeListViewVector.java | 134 ++++++++++++++++++ .../arrow/vector/TestListViewVector.java | 132 +++++++++++++++++ .../testing/ValueVectorDataPopulator.java | 34 +++++ 12 files changed, 451 insertions(+), 36 deletions(-) diff --git a/dev/archery/archery/integration/datagen.py b/dev/archery/archery/integration/datagen.py index 47310c905a9ff..d395d26cb71d3 100644 --- a/dev/archery/archery/integration/datagen.py +++ b/dev/archery/archery/integration/datagen.py @@ -1936,7 +1936,6 @@ def _temp_path(): generate_list_view_case() .skip_tester('C#') # Doesn't support large list views - .skip_tester('Java') .skip_tester('JS') .skip_tester('nanoarrow') .skip_tester('Rust'), diff --git a/java/c/src/main/java/org/apache/arrow/c/BufferImportTypeVisitor.java b/java/c/src/main/java/org/apache/arrow/c/BufferImportTypeVisitor.java index 633ecd43bd570..93fef6d7ca801 100644 --- a/java/c/src/main/java/org/apache/arrow/c/BufferImportTypeVisitor.java +++ b/java/c/src/main/java/org/apache/arrow/c/BufferImportTypeVisitor.java @@ -47,7 +47,9 @@ import org.apache.arrow.vector.VarCharVector; import org.apache.arrow.vector.complex.DenseUnionVector; import org.apache.arrow.vector.complex.LargeListVector; +import org.apache.arrow.vector.complex.LargeListViewVector; import org.apache.arrow.vector.complex.ListVector; +import org.apache.arrow.vector.complex.ListViewVector; import org.apache.arrow.vector.complex.MapVector; import org.apache.arrow.vector.complex.UnionVector; import org.apache.arrow.vector.ipc.message.ArrowFieldNode; @@ -400,13 +402,17 @@ public List visit(ArrowType.Duration type) { @Override public List visit(ArrowType.ListView type) { - throw new UnsupportedOperationException( - "Importing buffers for view type: " + type + " not supported"); + return Arrays.asList( + maybeImportBitmap(type), + importFixedBytes(type, 1, ListViewVector.OFFSET_WIDTH), + importFixedBytes(type, 2, ListViewVector.SIZE_WIDTH)); } @Override public List visit(ArrowType.LargeListView type) { - throw new UnsupportedOperationException( - "Importing buffers for view type: " + type + " not supported"); + return Arrays.asList( + maybeImportBitmap(type), + importFixedBytes(type, 1, LargeListViewVector.OFFSET_WIDTH), + importFixedBytes(type, 2, LargeListViewVector.SIZE_WIDTH)); } } diff --git a/java/c/src/main/java/org/apache/arrow/c/Format.java b/java/c/src/main/java/org/apache/arrow/c/Format.java index aff51e7b734ab..f77a555d18481 100644 --- a/java/c/src/main/java/org/apache/arrow/c/Format.java +++ b/java/c/src/main/java/org/apache/arrow/c/Format.java @@ -229,6 +229,10 @@ static String asString(ArrowType arrowType) { return "vu"; case BinaryView: return "vz"; + case ListView: + return "+vl"; + case LargeListView: + return "+vL"; case NONE: throw new IllegalArgumentException("Arrow type ID is NONE"); default: @@ -313,6 +317,10 @@ static ArrowType asType(String format, long flags) return new ArrowType.Utf8View(); case "vz": return new ArrowType.BinaryView(); + case "+vl": + 
return new ArrowType.ListView(); + case "+vL": + return new ArrowType.LargeListView(); default: String[] parts = format.split(":", 2); if (parts.length == 2) { diff --git a/java/c/src/test/java/org/apache/arrow/c/RoundtripTest.java b/java/c/src/test/java/org/apache/arrow/c/RoundtripTest.java index 6591d1f730990..18b2e94adde47 100644 --- a/java/c/src/test/java/org/apache/arrow/c/RoundtripTest.java +++ b/java/c/src/test/java/org/apache/arrow/c/RoundtripTest.java @@ -84,7 +84,9 @@ import org.apache.arrow.vector.compare.VectorEqualsVisitor; import org.apache.arrow.vector.complex.FixedSizeListVector; import org.apache.arrow.vector.complex.LargeListVector; +import org.apache.arrow.vector.complex.LargeListViewVector; import org.apache.arrow.vector.complex.ListVector; +import org.apache.arrow.vector.complex.ListViewVector; import org.apache.arrow.vector.complex.MapVector; import org.apache.arrow.vector.complex.StructVector; import org.apache.arrow.vector.complex.UnionVector; @@ -683,6 +685,46 @@ public void testFixedSizeListVector() { } } + @Test + public void testListViewVector() { + try (final ListViewVector vector = ListViewVector.empty("v", allocator)) { + setVector( + vector, + Arrays.stream(new int[] {1, 2}).boxed().collect(Collectors.toList()), + Arrays.stream(new int[] {3, 4}).boxed().collect(Collectors.toList()), + new ArrayList()); + assertTrue(roundtrip(vector, ListViewVector.class)); + } + } + + @Test + public void testEmptyListViewVector() { + try (final ListViewVector vector = ListViewVector.empty("v", allocator)) { + setVector(vector, new ArrayList()); + assertTrue(roundtrip(vector, ListViewVector.class)); + } + } + + @Test + public void testLargeListViewVector() { + try (final LargeListViewVector vector = LargeListViewVector.empty("v", allocator)) { + setVector( + vector, + Arrays.stream(new int[] {1, 2}).boxed().collect(Collectors.toList()), + Arrays.stream(new int[] {3, 4}).boxed().collect(Collectors.toList()), + new ArrayList()); + assertTrue(roundtrip(vector, LargeListViewVector.class)); + } + } + + @Test + public void testEmptyLargeListViewVector() { + try (final LargeListViewVector vector = LargeListViewVector.empty("v", allocator)) { + setVector(vector, new ArrayList()); + assertTrue(roundtrip(vector, LargeListViewVector.class)); + } + } + @Test public void testMapVector() { int count = 5; diff --git a/java/c/src/test/python/integration_tests.py b/java/c/src/test/python/integration_tests.py index ab2ee1742f366..b0a86e9c66e59 100644 --- a/java/c/src/test/python/integration_tests.py +++ b/java/c/src/test/python/integration_tests.py @@ -352,6 +352,53 @@ def test_reader_complex_roundtrip(self): ] self.round_trip_reader(schema, data) + def test_listview_array(self): + self.round_trip_array(lambda: pa.array( + [[], [0], [1, 2], [4, 5, 6]], pa.list_view(pa.int64()) + # disabled check_metadata since in Java API the listview + # internal field name ("item") is not preserved + # during round trips (it becomes "$data$"). 
+        ), check_metadata=False)
+
+    def test_empty_listview_array(self):
+        with pa.BufferOutputStream() as bos:
+            schema = pa.schema([pa.field("f0", pa.list_view(pa.int32()), True)])
+            with ipc.new_stream(bos, schema) as writer:
+                src = pa.RecordBatch.from_arrays(
+                    [pa.array([[]], pa.list_view(pa.int32()))], schema=schema)
+                writer.write(src)
+            data_bytes = bos.getvalue()
+
+        def recreate_batch():
+            with pa.input_stream(data_bytes) as ios:
+                with ipc.open_stream(ios) as reader:
+                    return reader.read_next_batch()
+
+        self.round_trip_record_batch(recreate_batch)
+
+    def test_largelistview_array(self):
+        self.round_trip_array(lambda: pa.array(
+            [[], [0], [1, 2], [4, 5, 6]], pa.large_list_view(pa.int64())
+            # disabled check_metadata since in Java API the listview
+            # internal field name ("item") is not preserved
+            # during round trips (it becomes "$data$").
+        ), check_metadata=False)
+
+    def test_empty_largelistview_array(self):
+        with pa.BufferOutputStream() as bos:
+            schema = pa.schema([pa.field("f0", pa.large_list_view(pa.int32()), True)])
+            with ipc.new_stream(bos, schema) as writer:
+                src = pa.RecordBatch.from_arrays(
+                    [pa.array([[]], pa.large_list_view(pa.int32()))], schema=schema)
+                writer.write(src)
+            data_bytes = bos.getvalue()
+
+        def recreate_batch():
+            with pa.input_stream(data_bytes) as ios:
+                with ipc.open_stream(ios) as reader:
+                    return reader.read_next_batch()
+
+        self.round_trip_record_batch(recreate_batch)

 if __name__ == '__main__':
     unittest.main(verbosity=2)
diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/BaseLargeRepeatedValueViewVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/BaseLargeRepeatedValueViewVector.java
index f643306cfdcff..12edd6557bd9c 100644
--- a/java/vector/src/main/java/org/apache/arrow/vector/complex/BaseLargeRepeatedValueViewVector.java
+++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/BaseLargeRepeatedValueViewVector.java
@@ -305,38 +305,43 @@ public void setValueCount(int valueCount) {
     while (valueCount > getOffsetBufferValueCapacity()) {
       reallocateBuffers();
     }
-    final int childValueCount = valueCount == 0 ? 0 : getLengthOfChildVector();
+    final int childValueCount = valueCount == 0 ? 0 : getMaxViewEndChildVector();
     vector.setValueCount(childValueCount);
   }

-  protected int getLengthOfChildVector() {
+  /**
+   * Get the end of the child vector via the maximum view length. This method deduces the length
+   * by taking the maximum over all views, i.e., max_i(offsets[i] + sizes[i]).
+   *
+   * @return the end of the child vector.
+   */
+  protected int getMaxViewEndChildVector() {
     int maxOffsetSizeSum = offsetBuffer.getInt(0) + sizeBuffer.getInt(0);
-    int minOffset = offsetBuffer.getInt(0);
     for (int i = 0; i < valueCount; i++) {
       int currentOffset = offsetBuffer.getInt((long) i * OFFSET_WIDTH);
       int currentSize = sizeBuffer.getInt((long) i * SIZE_WIDTH);
       int currentSum = currentOffset + currentSize;
-
       maxOffsetSizeSum = Math.max(maxOffsetSizeSum, currentSum);
-      minOffset = Math.min(minOffset, currentOffset);
     }
-    return maxOffsetSizeSum - minOffset;
+    return maxOffsetSizeSum;
   }

-  protected int getLengthOfChildVectorByIndex(int index) {
+  /**
+   * Get the end of the child vector via the maximum view length of the child vector by index.
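+   * For example, with offsets [0, 2] and sizes [4, 1], the maximum view end
+   * for index 2 is max(0 + 4, 2 + 1) = 4.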
+   *
+   * @return the end of the child vector by index
+   */
+  protected int getMaxViewEndChildVectorByIndex(int index) {
     int maxOffsetSizeSum = offsetBuffer.getInt(0) + sizeBuffer.getInt(0);
-    int minOffset = offsetBuffer.getInt(0);
     for (int i = 0; i < index; i++) {
       int currentOffset = offsetBuffer.getInt((long) i * OFFSET_WIDTH);
       int currentSize = sizeBuffer.getInt((long) i * SIZE_WIDTH);
       int currentSum = currentOffset + currentSize;
-
       maxOffsetSizeSum = Math.max(maxOffsetSizeSum, currentSum);
-      minOffset = Math.min(minOffset, currentOffset);
     }
-    return maxOffsetSizeSum - minOffset;
+    return maxOffsetSizeSum;
   }

   /**
@@ -390,7 +395,7 @@ public int startNewValue(int index) {
     }

     if (index > 0) {
-      final int prevOffset = getLengthOfChildVectorByIndex(index);
+      final int prevOffset = getMaxViewEndChildVectorByIndex(index);
       offsetBuffer.setInt((long) index * OFFSET_WIDTH, prevOffset);
     }

diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/BaseRepeatedValueViewVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/BaseRepeatedValueViewVector.java
index 031cc8037bb8b..e6213316b55a3 100644
--- a/java/vector/src/main/java/org/apache/arrow/vector/complex/BaseRepeatedValueViewVector.java
+++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/BaseRepeatedValueViewVector.java
@@ -304,38 +304,43 @@ public void setValueCount(int valueCount) {
     while (valueCount > getOffsetBufferValueCapacity()) {
       reallocateBuffers();
     }
-    final int childValueCount = valueCount == 0 ? 0 : getLengthOfChildVector();
+    final int childValueCount = valueCount == 0 ? 0 : getMaxViewEndChildVector();
     vector.setValueCount(childValueCount);
   }

-  protected int getLengthOfChildVector() {
+  /**
+   * Get the end of the child vector via the maximum view length. This method deduces the length
+   * by taking the maximum over all views, i.e., max_i(offsets[i] + sizes[i]).
+   *
+   * @return the end of the child vector.
+   */
+  protected int getMaxViewEndChildVector() {
     int maxOffsetSizeSum = offsetBuffer.getInt(0) + sizeBuffer.getInt(0);
-    int minOffset = offsetBuffer.getInt(0);
     for (int i = 0; i < valueCount; i++) {
       int currentOffset = offsetBuffer.getInt(i * OFFSET_WIDTH);
       int currentSize = sizeBuffer.getInt(i * SIZE_WIDTH);
       int currentSum = currentOffset + currentSize;
-
       maxOffsetSizeSum = Math.max(maxOffsetSizeSum, currentSum);
-      minOffset = Math.min(minOffset, currentOffset);
     }
-    return maxOffsetSizeSum - minOffset;
+    return maxOffsetSizeSum;
   }

-  protected int getLengthOfChildVectorByIndex(int index) {
+  /**
+   * Get the end of the child vector via the maximum view length of the child vector by index.
+   *
+   * @return the end of the child vector by index
+   */
+  protected int getMaxViewEndChildVectorByIndex(int index) {
     int maxOffsetSizeSum = offsetBuffer.getInt(0) + sizeBuffer.getInt(0);
-    int minOffset = offsetBuffer.getInt(0);
     for (int i = 0; i < index; i++) {
       int currentOffset = offsetBuffer.getInt(i * OFFSET_WIDTH);
       int currentSize = sizeBuffer.getInt(i * SIZE_WIDTH);
       int currentSum = currentOffset + currentSize;
-
       maxOffsetSizeSum = Math.max(maxOffsetSizeSum, currentSum);
-      minOffset = Math.min(minOffset, currentOffset);
     }
-    return maxOffsetSizeSum - minOffset;
+    return maxOffsetSizeSum;
   }

   /**
@@ -389,7 +395,7 @@ public int startNewValue(int index) {
     }

     if (index > 0) {
-      final int prevOffset = getLengthOfChildVectorByIndex(index);
+      final int prevOffset = getMaxViewEndChildVectorByIndex(index);
       offsetBuffer.setInt(index * OFFSET_WIDTH, prevOffset);
     }

diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/LargeListViewVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/LargeListViewVector.java
index 2c61f799a4cf9..84c6f03edb25d 100644
--- a/java/vector/src/main/java/org/apache/arrow/vector/complex/LargeListViewVector.java
+++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/LargeListViewVector.java
@@ -250,7 +250,9 @@ public List getFieldBuffers() {
    */
   @Override
   public void exportCDataBuffers(List buffers, ArrowBuf buffersPtr, long nullValue) {
-    throw new UnsupportedOperationException("exportCDataBuffers Not implemented yet");
+    exportBuffer(validityBuffer, buffers, buffersPtr, nullValue, true);
+    exportBuffer(offsetBuffer, buffers, buffersPtr, nullValue, true);
+    exportBuffer(sizeBuffer, buffers, buffersPtr, nullValue, true);
   }

   @Override
@@ -851,7 +853,7 @@ public int startNewValue(int index) {
     }

     if (index > 0) {
-      final int prevOffset = getLengthOfChildVectorByIndex(index);
+      final int prevOffset = getMaxViewEndChildVectorByIndex(index);
       offsetBuffer.setInt(index * OFFSET_WIDTH, prevOffset);
     }

@@ -943,7 +945,7 @@ public void setValueCount(int valueCount) {
       }
     }
     /* valueCount for the data vector is the current end offset */
-    final long childValueCount = (valueCount == 0) ? 0 : getLengthOfChildVector();
+    final long childValueCount = (valueCount == 0) ? 0 : getMaxViewEndChildVector();
     /* set the value count of data vector and this will take care of
      * checking whether data buffer needs to be reallocated.
diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/LargeListViewVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/LargeListViewVector.java
index 2c61f799a4cf9..84c6f03edb25d 100644
--- a/java/vector/src/main/java/org/apache/arrow/vector/complex/LargeListViewVector.java
+++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/LargeListViewVector.java
@@ -250,7 +250,9 @@ public List<ArrowBuf> getFieldBuffers() {
    */
   @Override
   public void exportCDataBuffers(List<ArrowBuf> buffers, ArrowBuf buffersPtr, long nullValue) {
-    throw new UnsupportedOperationException("exportCDataBuffers Not implemented yet");
+    exportBuffer(validityBuffer, buffers, buffersPtr, nullValue, true);
+    exportBuffer(offsetBuffer, buffers, buffersPtr, nullValue, true);
+    exportBuffer(sizeBuffer, buffers, buffersPtr, nullValue, true);
   }

   @Override
@@ -851,7 +853,7 @@ public int startNewValue(int index) {
     }

     if (index > 0) {
-      final int prevOffset = getLengthOfChildVectorByIndex(index);
+      final int prevOffset = getMaxViewEndChildVectorByIndex(index);
       offsetBuffer.setInt(index * OFFSET_WIDTH, prevOffset);
     }

@@ -943,7 +945,7 @@ public void setValueCount(int valueCount) {
       }
     }
     /* valueCount for the data vector is the current end offset */
-    final long childValueCount = (valueCount == 0) ? 0 : getLengthOfChildVector();
+    final long childValueCount = (valueCount == 0) ? 0 : getMaxViewEndChildVector();
     /* set the value count of data vector and this will take care of
      * checking whether data buffer needs to be reallocated.
      * TODO: revisit when 64-bit vectors are supported
@@ -1001,7 +1003,7 @@ public double getDensity() {
     if (valueCount == 0) {
       return 0.0D;
     }
-    final double totalListSize = getLengthOfChildVector();
+    final double totalListSize = getMaxViewEndChildVector();
     return totalListSize / valueCount;
   }

diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/ListViewVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/ListViewVector.java
index 7f6d92f3be9c8..9b4e6b4c0cd4a 100644
--- a/java/vector/src/main/java/org/apache/arrow/vector/complex/ListViewVector.java
+++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/ListViewVector.java
@@ -858,7 +858,7 @@ public int startNewValue(int index) {
     }

     if (index > 0) {
-      final int prevOffset = getLengthOfChildVectorByIndex(index);
+      final int prevOffset = getMaxViewEndChildVectorByIndex(index);
       offsetBuffer.setInt(index * OFFSET_WIDTH, prevOffset);
     }

@@ -942,7 +942,7 @@ public void setValueCount(int valueCount) {
       }
     }
     /* valueCount for the data vector is the current end offset */
-    final int childValueCount = (valueCount == 0) ? 0 : getLengthOfChildVector();
+    final int childValueCount = (valueCount == 0) ? 0 : getMaxViewEndChildVector();
     /* set the value count of data vector and this will take care of
      * checking whether data buffer needs to be reallocated.
      */
@@ -1005,7 +1005,7 @@ public double getDensity() {
     if (valueCount == 0) {
       return 0.0D;
     }
-    final double totalListSize = getLengthOfChildVector();
+    final double totalListSize = getMaxViewEndChildVector();
     return totalListSize / valueCount;
   }

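The three buffers exported above follow the Arrow C Data Interface layout for listview and largelistview arrays: validity bitmap, then offsets, then sizes, with the child array's buffers exported separately. A quick pyarrow illustration (`Array.buffers()` flattens the child's buffers into the same list):

```python
import pyarrow as pa

arr = pa.ListViewArray.from_arrays(
    pa.array([0, 2], type=pa.int32()),  # offsets
    pa.array([4, 1], type=pa.int32()),  # sizes
    pa.array([0, 1, 2, 3]))             # int64 child values

# 5 entries: validity, offsets, sizes, then the child's validity and data.
print(len(arr.buffers()))
```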
diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestLargeListViewVector.java b/java/vector/src/test/java/org/apache/arrow/vector/TestLargeListViewVector.java
index 2ed8d4d7005ea..26e7bb4a0d3b2 100644
--- a/java/vector/src/test/java/org/apache/arrow/vector/TestLargeListViewVector.java
+++ b/java/vector/src/test/java/org/apache/arrow/vector/TestLargeListViewVector.java
@@ -2095,6 +2095,140 @@ public void testOutOfOrderOffsetSplitAndTransfer() {
     }
   }

+  @Test
+  public void testRangeChildVector1() {
+    /*
+     * Overlapping ranges; the first view starts at offset 0.
+     * offsets: [0, 2]
+     * sizes: [4, 1]
+     * values: [0, 1, 2, 3]
+     *
+     * vector: [[0, 1, 2, 3], [2]]
+     * */
+    try (LargeListViewVector largeListViewVector =
+        LargeListViewVector.empty("largelistview", allocator)) {
+      // Allocate buffers in largeListViewVector by calling `allocateNew` method.
+      largeListViewVector.allocateNew();
+
+      // Initialize the child vector using `initializeChildrenFromFields` method.
+
+      FieldType fieldType = new FieldType(true, new ArrowType.Int(32, true), null, null);
+      Field field = new Field("child-vector", fieldType, null);
+      largeListViewVector.initializeChildrenFromFields(Collections.singletonList(field));
+
+      // Set values in the child vector.
+      FieldVector fieldVector = largeListViewVector.getDataVector();
+      fieldVector.clear();
+
+      IntVector childVector = (IntVector) fieldVector;
+
+      childVector.allocateNew(8);
+
+      childVector.set(0, 0);
+      childVector.set(1, 1);
+      childVector.set(2, 2);
+      childVector.set(3, 3);
+      childVector.set(4, 4);
+      childVector.set(5, 5);
+      childVector.set(6, 6);
+      childVector.set(7, 7);
+
+      childVector.setValueCount(8);
+
+      // Set validity, offset and size buffers using `setValidity`,
+      // `setOffset` and `setSize` methods.
+      largeListViewVector.setValidity(0, 1);
+      largeListViewVector.setValidity(1, 1);
+
+      largeListViewVector.setOffset(0, 0);
+      largeListViewVector.setOffset(1, 2);
+
+      largeListViewVector.setSize(0, 4);
+      largeListViewVector.setSize(1, 1);
+
+      assertEquals(8, largeListViewVector.getDataVector().getValueCount());
+
+      largeListViewVector.setValueCount(2);
+      assertEquals(4, largeListViewVector.getDataVector().getValueCount());
+
+      IntVector childVector1 = (IntVector) largeListViewVector.getDataVector();
+      final ArrowBuf dataBuffer = childVector1.getDataBuffer();
+      final ArrowBuf validityBuffer = childVector1.getValidityBuffer();
+
+      // The underlying data buffer still contains all of the original values.
+      for (int i = 0; i < validityBuffer.capacity(); i++) {
+        assertEquals(i, dataBuffer.getInt((long) i * IntVector.TYPE_WIDTH));
+      }
+    }
+  }
+
+  @Test
+  public void testRangeChildVector2() {
+    /*
+     * Overlapping ranges; the first view starts at a non-zero offset.
+     * offsets: [1, 2]
+     * sizes: [3, 1]
+     * values: [0, 1, 2, 3]
+     *
+     * vector: [[1, 2, 3], [2]]
+     * */
+    try (LargeListViewVector largeListViewVector =
+        LargeListViewVector.empty("largelistview", allocator)) {
+      // Allocate buffers in largeListViewVector by calling `allocateNew` method.
+      largeListViewVector.allocateNew();
+
+      // Initialize the child vector using `initializeChildrenFromFields` method.
+
+      FieldType fieldType = new FieldType(true, new ArrowType.Int(32, true), null, null);
+      Field field = new Field("child-vector", fieldType, null);
+      largeListViewVector.initializeChildrenFromFields(Collections.singletonList(field));
+
+      // Set values in the child vector.
+      FieldVector fieldVector = largeListViewVector.getDataVector();
+      fieldVector.clear();
+
+      IntVector childVector = (IntVector) fieldVector;
+
+      childVector.allocateNew(8);
+
+      childVector.set(0, 0);
+      childVector.set(1, 1);
+      childVector.set(2, 2);
+      childVector.set(3, 3);
+      childVector.set(4, 4);
+      childVector.set(5, 5);
+      childVector.set(6, 6);
+      childVector.set(7, 7);
+
+      childVector.setValueCount(8);
+
+      // Set validity, offset and size buffers using `setValidity`,
+      // `setOffset` and `setSize` methods.
+      largeListViewVector.setValidity(0, 1);
+      largeListViewVector.setValidity(1, 1);
+
+      largeListViewVector.setOffset(0, 1);
+      largeListViewVector.setOffset(1, 2);
+
+      largeListViewVector.setSize(0, 3);
+      largeListViewVector.setSize(1, 1);
+
+      assertEquals(8, largeListViewVector.getDataVector().getValueCount());
+
+      largeListViewVector.setValueCount(2);
+      assertEquals(4, largeListViewVector.getDataVector().getValueCount());
+
+      IntVector childVector1 = (IntVector) largeListViewVector.getDataVector();
+      final ArrowBuf dataBuffer = childVector1.getDataBuffer();
+      final ArrowBuf validityBuffer = childVector1.getValidityBuffer();
+
+      // The underlying data buffer still contains all of the original values.
+      for (int i = 0; i < validityBuffer.capacity(); i++) {
+        assertEquals(i, dataBuffer.getInt((long) i * IntVector.TYPE_WIDTH));
+      }
+    }
+  }
+
   private void writeIntValues(UnionLargeListViewWriter writer, int[] values) {
     writer.startListView();
     for (int v : values) {
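The assertions at the end of both tests make the zero-copy point explicit: `setValueCount` only truncates the child's logical length to the farthest view end; the data buffer itself is untouched. The same behavior on the 64-bit variant, sketched in Python (assuming `LargeListViewArray.from_arrays` mirrors the ListView constructor, with int64 offsets and sizes):

```python
import pyarrow as pa

# Same scenario as testRangeChildVector2, on the 64-bit offset variant.
offsets = pa.array([1, 2], type=pa.int64())
sizes = pa.array([3, 1], type=pa.int64())
values = pa.array([0, 1, 2, 3, 4, 5, 6, 7])

arr = pa.LargeListViewArray.from_arrays(offsets, sizes, values)
print(arr.to_pylist())  # [[1, 2, 3], [2]]

# Child values past max(offset + size) == 4 stay in the buffer;
# they are simply not reachable from any view.
print(arr.values.to_pylist())  # [0, 1, 2, 3, 4, 5, 6, 7]
```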
diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestListViewVector.java b/java/vector/src/test/java/org/apache/arrow/vector/TestListViewVector.java
index 4fa808c18aece..639585fc48d0a 100644
--- a/java/vector/src/test/java/org/apache/arrow/vector/TestListViewVector.java
+++ b/java/vector/src/test/java/org/apache/arrow/vector/TestListViewVector.java
@@ -2084,6 +2084,138 @@ public void testOutOfOrderOffsetSplitAndTransfer() {
     }
   }

+  @Test
+  public void testRangeChildVector1() {
+    /*
+     * Overlapping ranges; the first view starts at offset 0.
+     * offsets: [0, 2]
+     * sizes: [4, 1]
+     * values: [0, 1, 2, 3]
+     *
+     * vector: [[0, 1, 2, 3], [2]]
+     * */
+    try (ListViewVector listViewVector = ListViewVector.empty("listview", allocator)) {
+      // Allocate buffers in listViewVector by calling `allocateNew` method.
+      listViewVector.allocateNew();
+
+      // Initialize the child vector using `initializeChildrenFromFields` method.
+
+      FieldType fieldType = new FieldType(true, new ArrowType.Int(32, true), null, null);
+      Field field = new Field("child-vector", fieldType, null);
+      listViewVector.initializeChildrenFromFields(Collections.singletonList(field));
+
+      // Set values in the child vector.
+      FieldVector fieldVector = listViewVector.getDataVector();
+      fieldVector.clear();
+
+      IntVector childVector = (IntVector) fieldVector;
+
+      childVector.allocateNew(8);
+
+      childVector.set(0, 0);
+      childVector.set(1, 1);
+      childVector.set(2, 2);
+      childVector.set(3, 3);
+      childVector.set(4, 4);
+      childVector.set(5, 5);
+      childVector.set(6, 6);
+      childVector.set(7, 7);
+
+      childVector.setValueCount(8);
+
+      // Set validity, offset and size buffers using `setValidity`,
+      // `setOffset` and `setSize` methods.
+      listViewVector.setValidity(0, 1);
+      listViewVector.setValidity(1, 1);
+
+      listViewVector.setOffset(0, 0);
+      listViewVector.setOffset(1, 2);
+
+      listViewVector.setSize(0, 4);
+      listViewVector.setSize(1, 1);
+
+      assertEquals(8, listViewVector.getDataVector().getValueCount());
+
+      listViewVector.setValueCount(2);
+      assertEquals(4, listViewVector.getDataVector().getValueCount());
+
+      IntVector childVector1 = (IntVector) listViewVector.getDataVector();
+      final ArrowBuf dataBuffer = childVector1.getDataBuffer();
+      final ArrowBuf validityBuffer = childVector1.getValidityBuffer();
+
+      // The underlying data buffer still contains all of the original values.
+      for (int i = 0; i < validityBuffer.capacity(); i++) {
+        assertEquals(i, dataBuffer.getInt((long) i * IntVector.TYPE_WIDTH));
+      }
+    }
+  }
+
+  @Test
+  public void testRangeChildVector2() {
+    /*
+     * Overlapping ranges; the first view starts at a non-zero offset.
+     * offsets: [1, 2]
+     * sizes: [3, 1]
+     * values: [0, 1, 2, 3]
+     *
+     * vector: [[1, 2, 3], [2]]
+     * */
+    try (ListViewVector listViewVector = ListViewVector.empty("listview", allocator)) {
+      // Allocate buffers in listViewVector by calling `allocateNew` method.
+      listViewVector.allocateNew();
+
+      // Initialize the child vector using `initializeChildrenFromFields` method.
+
+      FieldType fieldType = new FieldType(true, new ArrowType.Int(32, true), null, null);
+      Field field = new Field("child-vector", fieldType, null);
+      listViewVector.initializeChildrenFromFields(Collections.singletonList(field));
+
+      // Set values in the child vector.
+      FieldVector fieldVector = listViewVector.getDataVector();
+      fieldVector.clear();
+
+      IntVector childVector = (IntVector) fieldVector;
+
+      childVector.allocateNew(8);
+
+      childVector.set(0, 0);
+      childVector.set(1, 1);
+      childVector.set(2, 2);
+      childVector.set(3, 3);
+      childVector.set(4, 4);
+      childVector.set(5, 5);
+      childVector.set(6, 6);
+      childVector.set(7, 7);
+
+      childVector.setValueCount(8);
+
+      // Set validity, offset and size buffers using `setValidity`,
+      // `setOffset` and `setSize` methods.
+      listViewVector.setValidity(0, 1);
+      listViewVector.setValidity(1, 1);
+
+      listViewVector.setOffset(0, 1);
+      listViewVector.setOffset(1, 2);
+
+      listViewVector.setSize(0, 3);
+      listViewVector.setSize(1, 1);
+
+      assertEquals(8, listViewVector.getDataVector().getValueCount());
+
+      listViewVector.setValueCount(2);
+      assertEquals(4, listViewVector.getDataVector().getValueCount());
+
+      IntVector childVector1 = (IntVector) listViewVector.getDataVector();
+      final ArrowBuf dataBuffer = childVector1.getDataBuffer();
+      final ArrowBuf validityBuffer = childVector1.getValidityBuffer();
+
+      // The underlying data buffer still contains all of the original values.
+      for (int i = 0; i < validityBuffer.capacity(); i++) {
+        assertEquals(i, dataBuffer.getInt((long) i * IntVector.TYPE_WIDTH));
+      }
+    }
+  }
+
   private void writeIntValues(UnionListViewWriter writer, int[] values) {
     writer.startListView();
     for (int v : values) {
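The populator added below writes the offset, size, and validity buffers by hand; note that a null slot still occupies an (offset, size) pair, which it fills with a size of 0. In pyarrow terms (this sketch assumes `ListViewArray.from_arrays` accepts a `mask` argument marking nulls, like `ListArray.from_arrays` does):

```python
import pyarrow as pa

offsets = pa.array([0, 2, 2], type=pa.int32())
sizes = pa.array([2, 0, 2], type=pa.int32())  # size 0 for the null slot
values = pa.array([1, 2, 3, 4])
mask = pa.array([False, True, False])  # True marks the null slot

arr = pa.ListViewArray.from_arrays(offsets, sizes, values, mask=mask)
print(arr.to_pylist())  # [[1, 2], None, [3, 4]]
```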
diff --git a/java/vector/src/test/java/org/apache/arrow/vector/testing/ValueVectorDataPopulator.java b/java/vector/src/test/java/org/apache/arrow/vector/testing/ValueVectorDataPopulator.java
index 69e16dc470351..afbc30f019ef6 100644
--- a/java/vector/src/test/java/org/apache/arrow/vector/testing/ValueVectorDataPopulator.java
+++ b/java/vector/src/test/java/org/apache/arrow/vector/testing/ValueVectorDataPopulator.java
@@ -60,10 +60,12 @@
 import org.apache.arrow.vector.VarBinaryVector;
 import org.apache.arrow.vector.VarCharVector;
 import org.apache.arrow.vector.VariableWidthFieldVector;
+import org.apache.arrow.vector.complex.BaseLargeRepeatedValueViewVector;
 import org.apache.arrow.vector.complex.BaseRepeatedValueVector;
 import org.apache.arrow.vector.complex.BaseRepeatedValueViewVector;
 import org.apache.arrow.vector.complex.FixedSizeListVector;
 import org.apache.arrow.vector.complex.LargeListVector;
+import org.apache.arrow.vector.complex.LargeListViewVector;
 import org.apache.arrow.vector.complex.ListVector;
 import org.apache.arrow.vector.complex.ListViewVector;
 import org.apache.arrow.vector.complex.StructVector;
@@ -760,4 +762,37 @@ public static void setVector(ListViewVector vector, List<Integer>... values) {
     dataVector.setValueCount(curPos);
     vector.setValueCount(values.length);
   }
+
+  /** Populate values for {@link LargeListViewVector}. */
+  public static void setVector(LargeListViewVector vector, List<Integer>... values) {
+    vector.allocateNewSafe();
+    Types.MinorType type = Types.MinorType.INT;
+    vector.addOrGetVector(FieldType.nullable(type.getType()));
+
+    IntVector dataVector = (IntVector) vector.getDataVector();
+    dataVector.allocateNew();
+
+    // set underlying vectors
+    int curPos = 0;
+    for (int i = 0; i < values.length; i++) {
+      vector
+          .getOffsetBuffer()
+          .setInt((long) i * BaseLargeRepeatedValueViewVector.OFFSET_WIDTH, curPos);
+      if (values[i] == null) {
+        BitVectorHelper.unsetBit(vector.getValidityBuffer(), i);
+        vector.getSizeBuffer().setInt((long) i * BaseLargeRepeatedValueViewVector.SIZE_WIDTH, 0);
+      } else {
+        BitVectorHelper.setBit(vector.getValidityBuffer(), i);
+        for (int value : values[i]) {
+          dataVector.setSafe(curPos, value);
+          curPos += 1;
+        }
+        vector
+            .getSizeBuffer()
+            .setInt((long) i * BaseLargeRepeatedValueViewVector.SIZE_WIDTH, values[i].size());
+      }
+    }
+    dataVector.setValueCount(curPos);
+    vector.setValueCount(values.length);
+  }
 }

From 83d915a3d2ac2acecbb2cb2dc0dd7f5a213dd625 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Mon, 26 Aug 2024 12:38:38 +0900
Subject: [PATCH 024/186] MINOR: [Java] Bump dep.slf4j.version from 2.0.13 to
 2.0.16 in /java (#43652)

Bumps `dep.slf4j.version` from 2.0.13 to 2.0.16.
Updates `org.slf4j:slf4j-api` from 2.0.13 to 2.0.16

Updates `org.slf4j:slf4j-jdk14` from 2.0.13 to 2.0.16

Updates `org.slf4j:jul-to-slf4j` from 2.0.13 to 2.0.16

Updates `org.slf4j:jcl-over-slf4j` from 2.0.13 to 2.0.16

Updates `org.slf4j:log4j-over-slf4j` from 2.0.13 to 2.0.16

Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@dependabot rebase`.
Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: David Li --- java/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/java/pom.xml b/java/pom.xml index a73453df68fd2..54bb7a0ae0eb9 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -94,7 +94,7 @@ under the License. ${project.build.directory}/generated-sources 1.9.0 5.10.3 - 2.0.13 + 2.0.16 33.2.1-jre 4.1.112.Final 1.66.0 From cbb5f96306972aa236750602aba4b40ceb4219c4 Mon Sep 17 00:00:00 2001 From: Bryce Mecum Date: Sun, 25 Aug 2024 21:33:51 -0700 Subject: [PATCH 025/186] MINOR: [R] Add missing PR num to news.md item (#43811) ### Rationale for this change We normally link to somewhere to give the user more context on news items. I noticed the link was missing for this one. ### What changes are included in this PR? Added PR number to news item. ### Are these changes tested? No. ### Are there any user-facing changes? No. Authored-by: Bryce Mecum Signed-off-by: Jacob Wujciak-Jens --- r/NEWS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/NEWS.md b/r/NEWS.md index 0e6e4634a0af8..b9568afe66542 100644 --- a/r/NEWS.md +++ b/r/NEWS.md @@ -32,7 +32,7 @@ functions (UDFs); for UDFs, see `register_scalar_function()`. (#41223) * `mutate()` expressions can now include aggregations, such as `x - mean(x)`. (#41350) * `summarize()` supports more complex expressions, and correctly handles cases - where column names are reused in expressions. + where column names are reused in expressions. (#41223) * The `na_matches` argument to the `dplyr::*_join()` functions is now supported. This argument controls whether `NA` values are considered equal when joining. (#41358) * R metadata, stored in the Arrow schema to support round-tripping data between From 51e9f70f94cd09a0a08196afdd2f4fc644666b5e Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 26 Aug 2024 16:20:20 +0900 Subject: [PATCH 026/186] MINOR: [Java] Bump dep.junit.jupiter.version from 5.10.3 to 5.11.0 in /java (#43751) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bumps `dep.junit.jupiter.version` from 5.10.3 to 5.11.0. Updates `org.junit.jupiter:junit-jupiter-engine` from 5.10.3 to 5.11.0
Release notes

Sourced from org.junit.jupiter:junit-jupiter-engine's releases.

JUnit 5.11.0 = Platform 1.11.0 + Jupiter 5.11.0 + Vintage 5.11.0

See Release Notes.

Full Changelog: https://github.com/junit-team/junit5/compare/r5.10.3...r5.11.0

JUnit 5.11.0-RC1 = Platform 1.11.0-RC1 + Jupiter 5.11.0-RC1 + Vintage 5.11.0-RC1

See Release Notes.

Full Changelog: https://github.com/junit-team/junit5/compare/r5.11.0-M2...r5.11.0-RC1

JUnit 5.11.0-M2 = Platform 1.11.0-M2 + Jupiter 5.11.0-M2 + Vintage 5.11.0-M2

See Release Notes.

Full Changelog: https://github.com/junit-team/junit5/compare/r5.11.0-M1...r5.11.0-M2

JUnit 5.11.0-M1 = Platform 1.11.0-M1 + Jupiter 5.11.0-M1 + Vintage 5.11.0-M1

... (truncated)

Commits
  • 6b8e42b Release 5.11
  • 9430ece Allow potentially unlimited maxCharsPerColumn in Csv{File}Source (#3924)
  • 0b10f86 Polish release notes
  • 4dbd0f9 Let @ TempDir fail fast with File annotated element and non-default file s...
  • 57f1ad4 Fix syntax
  • d78730a Prioritize tasks on critical path of task graph
  • b6719e2 Remove obsolete directory
  • d8ec757 Apply Spotless formatting to Gradle script plugins
  • dae525d Disable caching of some Spotless tasks due to negative avoidance savings
  • c63d118 Re-enable caching verifyOSGi tasks (issue was fixed in bnd 7.0.0)
  • Additional commits viewable in compare view

Updates `org.junit.jupiter:junit-jupiter-api` from 5.10.3 to 5.11.0

Updates `org.junit.jupiter:junit-jupiter-params` from 5.10.3 to 5.11.0

Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@dependabot rebase`.
Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: David Li --- java/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/java/pom.xml b/java/pom.xml index 54bb7a0ae0eb9..77feed12f3f1d 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -93,7 +93,7 @@ under the License. ${project.build.directory}/generated-sources 1.9.0 - 5.10.3 + 5.11.0 2.0.16 33.2.1-jre 4.1.112.Final From 2328b6ee39b497d9f48e6d342db9f7d0c34d9791 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Mon, 26 Aug 2024 16:34:18 +0200 Subject: [PATCH 027/186] GH-15058: [C++][Python] Native support for UUID (#37298) ### Rationale for this change See #15058. UUID datatype is common in throughout the ecosystem and Arrow as supporting it as a native type would reduce friction. ### What changes are included in this PR? This PR implements logic for Arrow canonical extension type in C++ and a Python wrapper. ### Are these changes tested? Yes. ### Are there any user-facing changes? Yes, new extension type is added. * Closes: #15058 Authored-by: Rok Mihevc Signed-off-by: Antoine Pitrou --- cpp/src/arrow/CMakeLists.txt | 3 +- cpp/src/arrow/acero/hash_join_node_test.cc | 1 + cpp/src/arrow/extension/CMakeLists.txt | 2 +- .../extension/fixed_shape_tensor_test.cc | 17 +-- cpp/src/arrow/extension/uuid.cc | 58 ++++++++++ cpp/src/arrow/extension/uuid.h | 61 ++++++++++ cpp/src/arrow/extension/uuid_test.cc | 72 ++++++++++++ cpp/src/arrow/extension_type.cc | 4 +- cpp/src/arrow/extension_type_test.cc | 19 +--- .../integration/json_integration_test.cc | 2 +- cpp/src/arrow/ipc/test_common.cc | 35 ++++-- cpp/src/arrow/ipc/test_common.h | 3 + cpp/src/arrow/scalar_test.cc | 5 +- cpp/src/arrow/testing/extension_type.h | 6 +- cpp/src/arrow/testing/gtest_util.cc | 16 ++- dev/archery/archery/integration/datagen.py | 2 +- docs/source/format/CanonicalExtensions.rst | 2 + docs/source/status.rst | 2 +- python/pyarrow/__init__.py | 18 +-- python/pyarrow/array.pxi | 6 + python/pyarrow/includes/libarrow.pxd | 10 ++ python/pyarrow/lib.pxd | 3 + python/pyarrow/public-api.pxi | 11 +- python/pyarrow/scalar.pxi | 10 ++ python/pyarrow/src/arrow/python/gdb.cc | 27 +---- python/pyarrow/tests/extensions.pyx | 2 +- python/pyarrow/tests/test_extension_type.py | 105 ++++++++++++------ python/pyarrow/tests/test_gdb.py | 8 +- python/pyarrow/types.pxi | 34 ++++++ 29 files changed, 412 insertions(+), 132 deletions(-) create mode 100644 cpp/src/arrow/extension/uuid.cc create mode 100644 cpp/src/arrow/extension/uuid.h create mode 100644 cpp/src/arrow/extension/uuid_test.cc diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index 89f28ee416ede..6b0ac8c23c75a 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -375,6 +375,7 @@ set(ARROW_SRCS device.cc extension_type.cc extension/bool8.cc + extension/uuid.cc pretty_print.cc record_batch.cc result.cc @@ -1225,6 +1226,7 @@ add_subdirectory(testing) add_subdirectory(array) add_subdirectory(c) add_subdirectory(compute) +add_subdirectory(extension) add_subdirectory(io) add_subdirectory(tensor) add_subdirectory(util) @@ -1267,7 +1269,6 @@ endif() if(ARROW_JSON) add_subdirectory(json) - add_subdirectory(extension) endif() if(ARROW_ORC) diff --git a/cpp/src/arrow/acero/hash_join_node_test.cc b/cpp/src/arrow/acero/hash_join_node_test.cc index 9065e286a2228..76ad9c7d650eb 100644 --- a/cpp/src/arrow/acero/hash_join_node_test.cc +++ b/cpp/src/arrow/acero/hash_join_node_test.cc @@ -29,6 +29,7 @@ #include "arrow/compute/kernels/test_util.h" 
#include "arrow/compute/light_array_internal.h" #include "arrow/compute/row/row_encoder_internal.h" +#include "arrow/extension/uuid.h" #include "arrow/testing/extension_type.h" #include "arrow/testing/generator.h" #include "arrow/testing/gtest_util.h" diff --git a/cpp/src/arrow/extension/CMakeLists.txt b/cpp/src/arrow/extension/CMakeLists.txt index 5cb4bc77af2a4..065ea3f1ddb16 100644 --- a/cpp/src/arrow/extension/CMakeLists.txt +++ b/cpp/src/arrow/extension/CMakeLists.txt @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. -set(CANONICAL_EXTENSION_TESTS bool8_test.cc) +set(CANONICAL_EXTENSION_TESTS bool8_test.cc uuid_test.cc) if(ARROW_JSON) list(APPEND CANONICAL_EXTENSION_TESTS fixed_shape_tensor_test.cc opaque_test.cc) diff --git a/cpp/src/arrow/extension/fixed_shape_tensor_test.cc b/cpp/src/arrow/extension/fixed_shape_tensor_test.cc index 3fd39a11ff50d..842a78e1a4f7a 100644 --- a/cpp/src/arrow/extension/fixed_shape_tensor_test.cc +++ b/cpp/src/arrow/extension/fixed_shape_tensor_test.cc @@ -23,7 +23,7 @@ #include "arrow/array/array_primitive.h" #include "arrow/io/memory.h" #include "arrow/ipc/reader.h" -#include "arrow/ipc/writer.h" +#include "arrow/ipc/test_common.h" #include "arrow/record_batch.h" #include "arrow/tensor.h" #include "arrow/testing/gtest_util.h" @@ -33,6 +33,7 @@ namespace arrow { using FixedShapeTensorType = extension::FixedShapeTensorType; +using arrow::ipc::test::RoundtripBatch; using extension::fixed_shape_tensor; using extension::FixedShapeTensorArray; @@ -71,20 +72,6 @@ class TestExtensionType : public ::testing::Test { std::string serialized_; }; -auto RoundtripBatch = [](const std::shared_ptr& batch, - std::shared_ptr* out) { - ASSERT_OK_AND_ASSIGN(auto out_stream, io::BufferOutputStream::Create()); - ASSERT_OK(ipc::WriteRecordBatchStream({batch}, ipc::IpcWriteOptions::Defaults(), - out_stream.get())); - - ASSERT_OK_AND_ASSIGN(auto complete_ipc_stream, out_stream->Finish()); - - io::BufferReader reader(complete_ipc_stream); - std::shared_ptr batch_reader; - ASSERT_OK_AND_ASSIGN(batch_reader, ipc::RecordBatchStreamReader::Open(&reader)); - ASSERT_OK(batch_reader->ReadNext(out)); -}; - TEST_F(TestExtensionType, CheckDummyRegistration) { // We need a registered dummy type at runtime to allow for IPC deserialization auto registered_type = GetExtensionType("arrow.fixed_shape_tensor"); diff --git a/cpp/src/arrow/extension/uuid.cc b/cpp/src/arrow/extension/uuid.cc new file mode 100644 index 0000000000000..43b917a17f8b2 --- /dev/null +++ b/cpp/src/arrow/extension/uuid.cc @@ -0,0 +1,58 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+
+#include <sstream>
+
+#include "arrow/extension_type.h"
+#include "arrow/util/logging.h"
+
+#include "arrow/extension/uuid.h"
+
+namespace arrow::extension {
+
+bool UuidType::ExtensionEquals(const ExtensionType& other) const {
+  return (other.extension_name() == this->extension_name());
+}
+
+std::shared_ptr<Array> UuidType::MakeArray(std::shared_ptr<ArrayData> data) const {
+  DCHECK_EQ(data->type->id(), Type::EXTENSION);
+  DCHECK_EQ("arrow.uuid",
+            static_cast<const ExtensionType&>(*data->type).extension_name());
+  return std::make_shared<UuidArray>(data);
+}
+
+Result<std::shared_ptr<DataType>> UuidType::Deserialize(
+    std::shared_ptr<DataType> storage_type, const std::string& serialized) const {
+  if (!serialized.empty()) {
+    return Status::Invalid("Unexpected serialized metadata: '", serialized, "'");
+  }
+  if (!storage_type->Equals(*fixed_size_binary(16))) {
+    return Status::Invalid("Invalid storage type for UuidType: ",
+                           storage_type->ToString());
+  }
+  return std::make_shared<UuidType>();
+}
+
+std::string UuidType::ToString(bool show_metadata) const {
+  std::stringstream ss;
+  ss << "extension<" << this->extension_name() << ">";
+  return ss.str();
+}
+
+std::shared_ptr<DataType> uuid() { return std::make_shared<UuidType>(); }
+
+}  // namespace arrow::extension
diff --git a/cpp/src/arrow/extension/uuid.h b/cpp/src/arrow/extension/uuid.h
new file mode 100644
index 0000000000000..42bb21cf0b2ed
--- /dev/null
+++ b/cpp/src/arrow/extension/uuid.h
@@ -0,0 +1,61 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "arrow/extension_type.h"
+
+namespace arrow::extension {
+
+/// \brief UuidArray stores array of UUIDs. Underlying storage type is
+/// FixedSizeBinary(16).
+class ARROW_EXPORT UuidArray : public ExtensionArray {
+ public:
+  using ExtensionArray::ExtensionArray;
+};
+
+/// \brief UuidType is a canonical arrow extension type for UUIDs.
+/// UUIDs are stored as FixedSizeBinary(16) with big-endian notation and this
+/// does not interpret the bytes in any way. Specific UUID version is not
+/// required or guaranteed.
+class ARROW_EXPORT UuidType : public ExtensionType {
+ public:
+  /// \brief Construct a UuidType.
+  UuidType() : ExtensionType(fixed_size_binary(16)) {}
+
+  std::string extension_name() const override { return "arrow.uuid"; }
+  std::string ToString(bool show_metadata = false) const override;
+
+  bool ExtensionEquals(const ExtensionType& other) const override;
+
+  /// Create a UuidArray from ArrayData
+  std::shared_ptr<Array> MakeArray(std::shared_ptr<ArrayData> data) const override;
+
+  Result<std::shared_ptr<DataType>> Deserialize(
+      std::shared_ptr<DataType> storage_type,
+      const std::string& serialized) const override;
+
+  std::string Serialize() const override { return ""; }
+
+  /// \brief Create a UuidType instance
+  static Result<std::shared_ptr<DataType>> Make() { return std::make_shared<UuidType>(); }
+};
+
+/// \brief Return a UuidType instance.
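From the Python side the new canonical type is exposed as `pa.uuid()`; wrapping a FixedSizeBinary(16) storage array yields scalars that unpack to `uuid.UUID` (this mirrors the `test_uuid_extension` test added further below):

```python
import pyarrow as pa

storage = pa.array([b"0123456789abcdef", None], pa.binary(16))
arr = pa.ExtensionArray.from_storage(pa.uuid(), storage)

print(arr.type.extension_name)  # arrow.uuid
print(arr[0].as_py())  # UUID('30313233-3435-3637-3839-616263646566')
print(arr[1].as_py())  # None
```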
+ARROW_EXPORT std::shared_ptr<DataType> uuid();
+
+}  // namespace arrow::extension
diff --git a/cpp/src/arrow/extension/uuid_test.cc b/cpp/src/arrow/extension/uuid_test.cc
new file mode 100644
index 0000000000000..3bbb6eeb4aef1
--- /dev/null
+++ b/cpp/src/arrow/extension/uuid_test.cc
@@ -0,0 +1,72 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/extension/uuid.h"
+
+#include "arrow/testing/matchers.h"
+
+#include "arrow/io/memory.h"
+#include "arrow/ipc/reader.h"
+#include "arrow/ipc/test_common.h"
+#include "arrow/testing/gtest_util.h"
+#include "arrow/util/key_value_metadata.h"
+
+#include "arrow/testing/extension_type.h"
+
+namespace arrow {
+
+using arrow::ipc::test::RoundtripBatch;
+
+TEST(TestUuidExtensionType, ExtensionTypeTest) {
+  auto type = uuid();
+  ASSERT_EQ(type->id(), Type::EXTENSION);
+
+  const auto& ext_type = static_cast<const ExtensionType&>(*type);
+  std::string serialized = ext_type.Serialize();
+
+  ASSERT_OK_AND_ASSIGN(auto deserialized,
+                       ext_type.Deserialize(fixed_size_binary(16), serialized));
+  ASSERT_TRUE(deserialized->Equals(*type));
+  ASSERT_FALSE(deserialized->Equals(*fixed_size_binary(16)));
+}
+
+TEST(TestUuidExtensionType, RoundtripBatch) {
+  auto ext_type = extension::uuid();
+  auto exact_ext_type = internal::checked_pointer_cast<extension::UuidType>(ext_type);
+  auto arr = ArrayFromJSON(fixed_size_binary(16), R"(["abcdefghijklmnop", null])");
+  auto ext_arr = ExtensionType::WrapArray(ext_type, arr);
+
+  // Pass extension array, expect getting back extension array
+  std::shared_ptr<RecordBatch> read_batch;
+  auto ext_field = field(/*name=*/"f0", /*type=*/ext_type);
+  auto batch = RecordBatch::Make(schema({ext_field}), ext_arr->length(), {ext_arr});
+  RoundtripBatch(batch, &read_batch);
+  CompareBatch(*batch, *read_batch, /*compare_metadata=*/true);
+
+  // Pass extension metadata and storage array, expect getting back extension array
+  std::shared_ptr<RecordBatch> read_batch2;
+  auto ext_metadata =
+      key_value_metadata({{"ARROW:extension:name", exact_ext_type->extension_name()},
+                          {"ARROW:extension:metadata", ""}});
+  ext_field = field(/*name=*/"f0", /*type=*/exact_ext_type->storage_type(),
+                    /*nullable=*/true, /*metadata=*/ext_metadata);
+  auto batch2 = RecordBatch::Make(schema({ext_field}), arr->length(), {arr});
+  RoundtripBatch(batch2, &read_batch2);
+  CompareBatch(*batch, *read_batch2, /*compare_metadata=*/true);
+}
+
+}  // namespace arrow
diff --git a/cpp/src/arrow/extension_type.cc b/cpp/src/arrow/extension_type.cc
index 83c7ebed4f319..fc220f73a6beb 100644
--- a/cpp/src/arrow/extension_type.cc
+++ b/cpp/src/arrow/extension_type.cc
@@ -32,6 +32,7 @@
 #include "arrow/extension/fixed_shape_tensor.h"
 #include "arrow/extension/opaque.h"
 #endif
+#include "arrow/extension/uuid.h"
 #include "arrow/status.h"
 #include "arrow/type.h"
 #include "arrow/util/checked_cast.h"
@@ -147,14 +148,13 @@ static void CreateGlobalRegistry() { // Register canonical extension types g_registry = std::make_shared(); - std::vector> ext_types{extension::bool8()}; + std::vector> ext_types{extension::bool8(), extension::uuid()}; #ifdef ARROW_JSON ext_types.push_back(extension::fixed_shape_tensor(int64(), {})); ext_types.push_back(extension::opaque(null(), "", "")); #endif - // Register canonical extension types for (const auto& ext_type : ext_types) { ARROW_CHECK_OK( g_registry->RegisterType(checked_pointer_cast(ext_type))); diff --git a/cpp/src/arrow/extension_type_test.cc b/cpp/src/arrow/extension_type_test.cc index f104c984a64b4..f49ffc5cba553 100644 --- a/cpp/src/arrow/extension_type_test.cc +++ b/cpp/src/arrow/extension_type_test.cc @@ -30,6 +30,7 @@ #include "arrow/io/memory.h" #include "arrow/ipc/options.h" #include "arrow/ipc/reader.h" +#include "arrow/ipc/test_common.h" #include "arrow/ipc/writer.h" #include "arrow/record_batch.h" #include "arrow/status.h" @@ -41,6 +42,8 @@ namespace arrow { +using arrow::ipc::test::RoundtripBatch; + class Parametric1Array : public ExtensionArray { public: using ExtensionArray::ExtensionArray; @@ -178,7 +181,7 @@ class ExtStructType : public ExtensionType { class TestExtensionType : public ::testing::Test { public: - void SetUp() { ASSERT_OK(RegisterExtensionType(std::make_shared())); } + void SetUp() { ASSERT_OK(RegisterExtensionType(std::make_shared())); } void TearDown() { if (GetExtensionType("uuid")) { @@ -211,20 +214,6 @@ TEST_F(TestExtensionType, ExtensionTypeTest) { ASSERT_EQ(deserialized->byte_width(), 16); } -auto RoundtripBatch = [](const std::shared_ptr& batch, - std::shared_ptr* out) { - ASSERT_OK_AND_ASSIGN(auto out_stream, io::BufferOutputStream::Create()); - ASSERT_OK(ipc::WriteRecordBatchStream({batch}, ipc::IpcWriteOptions::Defaults(), - out_stream.get())); - - ASSERT_OK_AND_ASSIGN(auto complete_ipc_stream, out_stream->Finish()); - - io::BufferReader reader(complete_ipc_stream); - std::shared_ptr batch_reader; - ASSERT_OK_AND_ASSIGN(batch_reader, ipc::RecordBatchStreamReader::Open(&reader)); - ASSERT_OK(batch_reader->ReadNext(out)); -}; - TEST_F(TestExtensionType, IpcRoundtrip) { auto ext_arr = ExampleUuid(); auto batch = RecordBatch::Make(schema({field("f0", uuid())}), 4, {ext_arr}); diff --git a/cpp/src/arrow/integration/json_integration_test.cc b/cpp/src/arrow/integration/json_integration_test.cc index 9b56928c68843..0e84ea6124d5d 100644 --- a/cpp/src/arrow/integration/json_integration_test.cc +++ b/cpp/src/arrow/integration/json_integration_test.cc @@ -1046,7 +1046,7 @@ TEST(TestJsonFileReadWrite, JsonExample2) { auto storage_array = ArrayFromJSON(fixed_size_binary(16), R"(["0123456789abcdef", null])"); - AssertArraysEqual(*batch->column(0), UuidArray(uuid_type, storage_array)); + AssertArraysEqual(*batch->column(0), ExampleUuidArray(uuid_type, storage_array)); AssertArraysEqual(*batch->column(1), NullArray(2)); } diff --git a/cpp/src/arrow/ipc/test_common.cc b/cpp/src/arrow/ipc/test_common.cc index 87c02e2d87a1e..fb4f6bd8eadcf 100644 --- a/cpp/src/arrow/ipc/test_common.cc +++ b/cpp/src/arrow/ipc/test_common.cc @@ -27,8 +27,10 @@ #include "arrow/array.h" #include "arrow/array/builder_binary.h" #include "arrow/array/builder_primitive.h" -#include "arrow/array/builder_time.h" +#include "arrow/io/memory.h" +#include "arrow/ipc/reader.h" #include "arrow/ipc/test_common.h" +#include "arrow/ipc/writer.h" #include "arrow/pretty_print.h" #include "arrow/record_batch.h" #include "arrow/status.h" @@ -242,11 +244,11 @@ 
Status MakeRandomBooleanArray(const int length, bool include_nulls, std::shared_ptr* out) { std::vector values(length); random_null_bytes(length, 0.5, values.data()); - ARROW_ASSIGN_OR_RAISE(auto data, internal::BytesToBits(values)); + ARROW_ASSIGN_OR_RAISE(auto data, arrow::internal::BytesToBits(values)); if (include_nulls) { std::vector valid_bytes(length); - ARROW_ASSIGN_OR_RAISE(auto null_bitmap, internal::BytesToBits(valid_bytes)); + ARROW_ASSIGN_OR_RAISE(auto null_bitmap, arrow::internal::BytesToBits(valid_bytes)); random_null_bytes(length, 0.1, valid_bytes.data()); *out = std::make_shared(length, data, null_bitmap, -1); } else { @@ -596,7 +598,7 @@ Status MakeStruct(std::shared_ptr* out) { std::shared_ptr no_nulls(new StructArray(type, list_batch->num_rows(), columns)); std::vector null_bytes(list_batch->num_rows(), 1); null_bytes[0] = 0; - ARROW_ASSIGN_OR_RAISE(auto null_bitmap, internal::BytesToBits(null_bytes)); + ARROW_ASSIGN_OR_RAISE(auto null_bitmap, arrow::internal::BytesToBits(null_bytes)); std::shared_ptr with_nulls( new StructArray(type, list_batch->num_rows(), columns, null_bitmap, 1)); @@ -1088,9 +1090,9 @@ Status MakeUuid(std::shared_ptr* out) { auto f1 = field("f1", uuid_type, /*nullable=*/false); auto schema = ::arrow::schema({f0, f1}); - auto a0 = std::make_shared( + auto a0 = std::make_shared( uuid_type, ArrayFromJSON(storage_type, R"(["0123456789abcdef", null])")); - auto a1 = std::make_shared( + auto a1 = std::make_shared( uuid_type, ArrayFromJSON(storage_type, R"(["ZYXWVUTSRQPONMLK", "JIHGFEDBA9876543"])")); @@ -1176,12 +1178,13 @@ enable_if_t::value, void> FillRandomData( Status MakeRandomTensor(const std::shared_ptr& type, const std::vector& shape, bool row_major_p, std::shared_ptr* out, uint32_t seed) { - const auto& element_type = internal::checked_cast(*type); + const auto& element_type = arrow::internal::checked_cast(*type); std::vector strides; if (row_major_p) { - RETURN_NOT_OK(internal::ComputeRowMajorStrides(element_type, shape, &strides)); + RETURN_NOT_OK(arrow::internal::ComputeRowMajorStrides(element_type, shape, &strides)); } else { - RETURN_NOT_OK(internal::ComputeColumnMajorStrides(element_type, shape, &strides)); + RETURN_NOT_OK( + arrow::internal::ComputeColumnMajorStrides(element_type, shape, &strides)); } const int64_t element_size = element_type.bit_width() / CHAR_BIT; @@ -1233,6 +1236,20 @@ Status MakeRandomTensor(const std::shared_ptr& type, return Tensor::Make(type, buf, shape, strides).Value(out); } +void RoundtripBatch(const std::shared_ptr& batch, + std::shared_ptr* out) { + ASSERT_OK_AND_ASSIGN(auto out_stream, io::BufferOutputStream::Create()); + ASSERT_OK(ipc::WriteRecordBatchStream({batch}, ipc::IpcWriteOptions::Defaults(), + out_stream.get())); + + ASSERT_OK_AND_ASSIGN(auto complete_ipc_stream, out_stream->Finish()); + + io::BufferReader reader(complete_ipc_stream); + std::shared_ptr batch_reader; + ASSERT_OK_AND_ASSIGN(batch_reader, ipc::RecordBatchStreamReader::Open(&reader)); + ASSERT_OK(batch_reader->ReadNext(out)); +} + } // namespace test } // namespace ipc } // namespace arrow diff --git a/cpp/src/arrow/ipc/test_common.h b/cpp/src/arrow/ipc/test_common.h index db8613cbb1e6a..9b7e7f13e3a8e 100644 --- a/cpp/src/arrow/ipc/test_common.h +++ b/cpp/src/arrow/ipc/test_common.h @@ -184,6 +184,9 @@ Status MakeRandomTensor(const std::shared_ptr& type, const std::vector& shape, bool row_major_p, std::shared_ptr* out, uint32_t seed = 0); +ARROW_TESTING_EXPORT void RoundtripBatch(const std::shared_ptr& batch, + std::shared_ptr* out); + } 
// namespace test } // namespace ipc } // namespace arrow diff --git a/cpp/src/arrow/scalar_test.cc b/cpp/src/arrow/scalar_test.cc index 104a5697b5727..e9ec13e98b4ee 100644 --- a/cpp/src/arrow/scalar_test.cc +++ b/cpp/src/arrow/scalar_test.cc @@ -43,7 +43,6 @@ namespace arrow { using compute::Cast; using compute::CastOptions; - using internal::checked_cast; using internal::checked_pointer_cast; @@ -2038,7 +2037,7 @@ class TestExtensionScalar : public ::testing::Test { void SetUp() { type_ = uuid(); storage_type_ = fixed_size_binary(16); - uuid_type_ = checked_cast(type_.get()); + uuid_type_ = checked_cast(type_.get()); } protected: @@ -2049,7 +2048,7 @@ class TestExtensionScalar : public ::testing::Test { } std::shared_ptr type_, storage_type_; - const UuidType* uuid_type_{nullptr}; + const ExampleUuidType* uuid_type_{nullptr}; const std::string_view uuid_string1_{UUID_STRING1}; const std::string_view uuid_string2_{UUID_STRING2}; diff --git a/cpp/src/arrow/testing/extension_type.h b/cpp/src/arrow/testing/extension_type.h index 6515631f202ae..a4526e31c2b93 100644 --- a/cpp/src/arrow/testing/extension_type.h +++ b/cpp/src/arrow/testing/extension_type.h @@ -27,14 +27,14 @@ namespace arrow { -class ARROW_TESTING_EXPORT UuidArray : public ExtensionArray { +class ARROW_TESTING_EXPORT ExampleUuidArray : public ExtensionArray { public: using ExtensionArray::ExtensionArray; }; -class ARROW_TESTING_EXPORT UuidType : public ExtensionType { +class ARROW_TESTING_EXPORT ExampleUuidType : public ExtensionType { public: - UuidType() : ExtensionType(fixed_size_binary(16)) {} + ExampleUuidType() : ExtensionType(fixed_size_binary(16)) {} std::string extension_name() const override { return "uuid"; } diff --git a/cpp/src/arrow/testing/gtest_util.cc b/cpp/src/arrow/testing/gtest_util.cc index 95de16c715f19..ae2e53b30a3ee 100644 --- a/cpp/src/arrow/testing/gtest_util.cc +++ b/cpp/src/arrow/testing/gtest_util.cc @@ -49,9 +49,13 @@ #include "arrow/buffer.h" #include "arrow/compute/api_vector.h" #include "arrow/datum.h" +#include "arrow/io/memory.h" #include "arrow/ipc/json_simple.h" +#include "arrow/ipc/reader.h" +#include "arrow/ipc/writer.h" #include "arrow/json/rapidjson_defs.h" // IWYU pragma: keep #include "arrow/pretty_print.h" +#include "arrow/record_batch.h" #include "arrow/status.h" #include "arrow/table.h" #include "arrow/tensor.h" @@ -847,17 +851,17 @@ Future<> SleepABitAsync() { /////////////////////////////////////////////////////////////////////////// // Extension types -bool UuidType::ExtensionEquals(const ExtensionType& other) const { +bool ExampleUuidType::ExtensionEquals(const ExtensionType& other) const { return (other.extension_name() == this->extension_name()); } -std::shared_ptr UuidType::MakeArray(std::shared_ptr data) const { +std::shared_ptr ExampleUuidType::MakeArray(std::shared_ptr data) const { DCHECK_EQ(data->type->id(), Type::EXTENSION); DCHECK_EQ("uuid", static_cast(*data->type).extension_name()); - return std::make_shared(data); + return std::make_shared(data); } -Result> UuidType::Deserialize( +Result> ExampleUuidType::Deserialize( std::shared_ptr storage_type, const std::string& serialized) const { if (serialized != "uuid-serialized") { return Status::Invalid("Type identifier did not match: '", serialized, "'"); @@ -866,7 +870,7 @@ Result> UuidType::Deserialize( return Status::Invalid("Invalid storage type for UuidType: ", storage_type->ToString()); } - return std::make_shared(); + return std::make_shared(); } bool SmallintType::ExtensionEquals(const ExtensionType& other) const { 
@@ -982,7 +986,7 @@ Result> Complex128Type::Deserialize( return std::make_shared(); } -std::shared_ptr uuid() { return std::make_shared(); } +std::shared_ptr uuid() { return std::make_shared(); } std::shared_ptr smallint() { return std::make_shared(); } diff --git a/dev/archery/archery/integration/datagen.py b/dev/archery/archery/integration/datagen.py index d395d26cb71d3..f63aa0d95a484 100644 --- a/dev/archery/archery/integration/datagen.py +++ b/dev/archery/archery/integration/datagen.py @@ -1845,7 +1845,7 @@ def generate_nested_dictionary_case(): def generate_extension_case(): dict0 = Dictionary(0, StringField('dictionary0'), size=5, name='DICT0') - uuid_type = ExtensionType('uuid', 'uuid-serialized', + uuid_type = ExtensionType('arrow.uuid', '', FixedSizeBinaryField('', 16)) dict_ext_type = ExtensionType( 'dict-extension', 'dict-extension-serialized', diff --git a/docs/source/format/CanonicalExtensions.rst b/docs/source/format/CanonicalExtensions.rst index 5658f949ceeaa..1106f8aaffdd3 100644 --- a/docs/source/format/CanonicalExtensions.rst +++ b/docs/source/format/CanonicalExtensions.rst @@ -272,6 +272,8 @@ JSON In the future, additional fields may be added, but they are not required to interpret the array. +.. _uuid_extension: + UUID ==== diff --git a/docs/source/status.rst b/docs/source/status.rst index 5e2c2cc19c890..b685d4bbf8add 100644 --- a/docs/source/status.rst +++ b/docs/source/status.rst @@ -121,7 +121,7 @@ Data Types +-----------------------+-------+-------+-------+------------+-------+-------+-------+-------+ | JSON | | | ✓ | | | | | | +-----------------------+-------+-------+-------+------------+-------+-------+-------+-------+ -| UUID | | | ✓ | | | | | | +| UUID | ✓ | | ✓ | | | | | | +-----------------------+-------+-------+-------+------------+-------+-------+-------+-------+ | 8-bit Boolean | ✓ | | ✓ | | | | | | +-----------------------+-------+-------+-------+------------+-------+-------+-------+-------+ diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py index 807bcdc315036..d31c93119b73a 100644 --- a/python/pyarrow/__init__.py +++ b/python/pyarrow/__init__.py @@ -172,9 +172,7 @@ def print_entry(label, value): union, sparse_union, dense_union, dictionary, run_end_encoded, - fixed_shape_tensor, - opaque, - bool8, + bool8, fixed_shape_tensor, opaque, uuid, field, type_for_alias, DataType, DictionaryType, StructType, @@ -184,8 +182,9 @@ def print_entry(label, value): TimestampType, Time32Type, Time64Type, DurationType, FixedSizeBinaryType, Decimal128Type, Decimal256Type, BaseExtensionType, ExtensionType, - RunEndEncodedType, FixedShapeTensorType, OpaqueType, - Bool8Type, PyExtensionType, UnknownExtensionType, + RunEndEncodedType, Bool8Type, FixedShapeTensorType, + OpaqueType, UuidType, + PyExtensionType, UnknownExtensionType, register_extension_type, unregister_extension_type, DictionaryMemo, KeyValueMetadata, @@ -218,8 +217,9 @@ def print_entry(label, value): Time32Array, Time64Array, DurationArray, MonthDayNanoIntervalArray, Decimal128Array, Decimal256Array, StructArray, ExtensionArray, - RunEndEncodedArray, FixedShapeTensorArray, OpaqueArray, - Bool8Array, scalar, NA, _NULL as NULL, Scalar, + RunEndEncodedArray, Bool8Array, FixedShapeTensorArray, + OpaqueArray, UuidArray, + scalar, NA, _NULL as NULL, Scalar, NullScalar, BooleanScalar, Int8Scalar, Int16Scalar, Int32Scalar, Int64Scalar, UInt8Scalar, UInt16Scalar, UInt32Scalar, UInt64Scalar, @@ -235,8 +235,8 @@ def print_entry(label, value): StringScalar, LargeStringScalar, StringViewScalar, 
FixedSizeBinaryScalar, DictionaryScalar, MapScalar, StructScalar, UnionScalar, - RunEndEncodedScalar, ExtensionScalar, - FixedShapeTensorScalar, OpaqueScalar, Bool8Scalar) + RunEndEncodedScalar, Bool8Scalar, ExtensionScalar, + FixedShapeTensorScalar, OpaqueScalar, UuidScalar) # Buffers, allocation from pyarrow.lib import (DeviceAllocationType, Device, MemoryManager, diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 77d6c9c06d2de..1587de0e6b744 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -4338,6 +4338,12 @@ cdef class ExtensionArray(Array): return result +class UuidArray(ExtensionArray): + """ + Concrete class for Arrow arrays of UUID data type. + """ + + cdef class FixedShapeTensorArray(ExtensionArray): """ Concrete class for fixed shape tensor extension arrays. diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 6f510cfc0c06c..c2346750a196f 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -2865,6 +2865,16 @@ cdef extern from "arrow/extension_type.h" namespace "arrow": shared_ptr[CArray] storage() +cdef extern from "arrow/extension/uuid.h" namespace "arrow::extension" nogil: + cdef cppclass CUuidType" arrow::extension::UuidType"(CExtensionType): + + @staticmethod + CResult[shared_ptr[CDataType]] Make() + + cdef cppclass CUuidArray" arrow::extension::UuidArray"(CExtensionArray): + pass + + cdef extern from "arrow/extension/fixed_shape_tensor.h" namespace "arrow::extension" nogil: cdef cppclass CFixedShapeTensorType \ " arrow::extension::FixedShapeTensorType"(CExtensionType): diff --git a/python/pyarrow/lib.pxd b/python/pyarrow/lib.pxd index a7c3b496a0045..5c3d981c3adc7 100644 --- a/python/pyarrow/lib.pxd +++ b/python/pyarrow/lib.pxd @@ -222,6 +222,9 @@ cdef class OpaqueType(BaseExtensionType): cdef: const COpaqueType* opaque_ext_type +cdef class UuidType(BaseExtensionType): + cdef: + const CUuidType* uuid_ext_type cdef class PyExtensionType(ExtensionType): pass diff --git a/python/pyarrow/public-api.pxi b/python/pyarrow/public-api.pxi index 19a26bd6c683d..d3e2ff2e99d91 100644 --- a/python/pyarrow/public-api.pxi +++ b/python/pyarrow/public-api.pxi @@ -120,14 +120,17 @@ cdef api object pyarrow_wrap_data_type( elif type.get().id() == _Type_EXTENSION: ext_type = type.get() cpy_ext_type = dynamic_cast[_CPyExtensionTypePtr](ext_type) + extension_name = ext_type.extension_name() if cpy_ext_type != nullptr: return cpy_ext_type.GetInstance() - elif ext_type.extension_name() == b"arrow.fixed_shape_tensor": + elif extension_name == b"arrow.bool8": + out = Bool8Type.__new__(Bool8Type) + elif extension_name == b"arrow.fixed_shape_tensor": out = FixedShapeTensorType.__new__(FixedShapeTensorType) - elif ext_type.extension_name() == b"arrow.opaque": + elif extension_name == b"arrow.opaque": out = OpaqueType.__new__(OpaqueType) - elif ext_type.extension_name() == b"arrow.bool8": - out = Bool8Type.__new__(Bool8Type) + elif extension_name == b"arrow.uuid": + out = UuidType.__new__(UuidType) else: out = BaseExtensionType.__new__(BaseExtensionType) else: diff --git a/python/pyarrow/scalar.pxi b/python/pyarrow/scalar.pxi index 72ae2aee5f8b3..68f77832c4342 100644 --- a/python/pyarrow/scalar.pxi +++ b/python/pyarrow/scalar.pxi @@ -17,6 +17,7 @@ import collections from cython cimport binding +from uuid import UUID cdef class Scalar(_Weakrefable): @@ -1043,6 +1044,15 @@ cdef class ExtensionScalar(Scalar): return pyarrow_wrap_scalar( sp_scalar) +class UuidScalar(ExtensionScalar): 
+ """ + Concrete class for Uuid extension scalar. + """ + + def as_py(self): + return None if self.value is None else UUID(bytes=self.value.as_py()) + + cdef class FixedShapeTensorScalar(ExtensionScalar): """ Concrete class for fixed shape tensor extension scalar. diff --git a/python/pyarrow/src/arrow/python/gdb.cc b/python/pyarrow/src/arrow/python/gdb.cc index 6941769e4efe8..7c58bae3342c2 100644 --- a/python/pyarrow/src/arrow/python/gdb.cc +++ b/python/pyarrow/src/arrow/python/gdb.cc @@ -22,7 +22,7 @@ #include "arrow/array.h" #include "arrow/chunked_array.h" #include "arrow/datum.h" -#include "arrow/extension_type.h" +#include "arrow/extension/uuid.h" #include "arrow/ipc/json_simple.h" #include "arrow/python/gdb.h" #include "arrow/record_batch.h" @@ -37,6 +37,8 @@ namespace arrow { +using extension::uuid; +using extension::UuidType; using ipc::internal::json::ArrayFromJSON; using ipc::internal::json::ChunkedArrayFromJSON; using ipc::internal::json::ScalarFromJSON; @@ -56,29 +58,6 @@ class CustomStatusDetail : public StatusDetail { std::string ToString() const override { return "This is a detail"; } }; -class UuidType : public ExtensionType { - public: - UuidType() : ExtensionType(fixed_size_binary(16)) {} - - std::string extension_name() const override { return "uuid"; } - - bool ExtensionEquals(const ExtensionType& other) const override { - return (other.extension_name() == this->extension_name()); - } - - std::shared_ptr MakeArray(std::shared_ptr data) const override { - return std::make_shared(data); - } - - Result> Deserialize( - std::shared_ptr storage_type, - const std::string& serialized) const override { - return Status::NotImplemented(""); - } - - std::string Serialize() const override { return "uuid-serialized"; } -}; - std::shared_ptr SliceArrayFromJSON(const std::shared_ptr& ty, std::string_view json, int64_t offset = 0, int64_t length = -1) { diff --git a/python/pyarrow/tests/extensions.pyx b/python/pyarrow/tests/extensions.pyx index c1bf9aae1ec03..309b574dc0264 100644 --- a/python/pyarrow/tests/extensions.pyx +++ b/python/pyarrow/tests/extensions.pyx @@ -37,7 +37,7 @@ cdef extern from * namespace "arrow::py" nogil: class UuidType : public ExtensionType { public: UuidType() : ExtensionType(fixed_size_binary(16)) {} - std::string extension_name() const override { return "uuid"; } + std::string extension_name() const override { return "example-uuid"; } bool ExtensionEquals(const ExtensionType& other) const override { return other.extension_name() == this->extension_name(); diff --git a/python/pyarrow/tests/test_extension_type.py b/python/pyarrow/tests/test_extension_type.py index 0d50c467e96bd..aacbd2cb6e756 100644 --- a/python/pyarrow/tests/test_extension_type.py +++ b/python/pyarrow/tests/test_extension_type.py @@ -95,18 +95,21 @@ def __arrow_ext_deserialize__(cls, storage_type, serialized): return cls() -class UuidScalarType(pa.ExtensionScalar): +class ExampleUuidScalarType(pa.ExtensionScalar): def as_py(self): return None if self.value is None else UUID(bytes=self.value.as_py()) -class UuidType(pa.ExtensionType): +class ExampleUuidType(pa.ExtensionType): def __init__(self): - super().__init__(pa.binary(16), 'pyarrow.tests.UuidType') + super().__init__(pa.binary(16), 'pyarrow.tests.ExampleUuidType') + + def __reduce__(self): + return ExampleUuidType, () def __arrow_ext_scalar_class__(self): - return UuidScalarType + return ExampleUuidScalarType def __arrow_ext_serialize__(self): return b'' @@ -116,10 +119,10 @@ def __arrow_ext_deserialize__(cls, storage_type, serialized): 
return cls() -class UuidType2(pa.ExtensionType): +class ExampleUuidType2(pa.ExtensionType): def __init__(self): - super().__init__(pa.binary(16), 'pyarrow.tests.UuidType2') + super().__init__(pa.binary(16), 'pyarrow.tests.ExampleUuidType2') def __arrow_ext_serialize__(self): return b'' @@ -250,8 +253,8 @@ def ipc_read_batch(buf): def test_ext_type_basics(): - ty = UuidType() - assert ty.extension_name == "pyarrow.tests.UuidType" + ty = ExampleUuidType() + assert ty.extension_name == "pyarrow.tests.ExampleUuidType" def test_ext_type_str(): @@ -267,16 +270,16 @@ def test_ext_type_repr(): def test_ext_type_lifetime(): - ty = UuidType() + ty = ExampleUuidType() wr = weakref.ref(ty) del ty assert wr() is None def test_ext_type_storage_type(): - ty = UuidType() + ty = ExampleUuidType() assert ty.storage_type == pa.binary(16) - assert ty.__class__ is UuidType + assert ty.__class__ is ExampleUuidType ty = ParamExtType(5) assert ty.storage_type == pa.binary(5) assert ty.__class__ is ParamExtType @@ -284,7 +287,7 @@ def test_ext_type_storage_type(): def test_ext_type_byte_width(): # Test for fixed-size binary types - ty = UuidType() + ty = pa.uuid() assert ty.byte_width == 16 ty = ParamExtType(5) assert ty.byte_width == 5 @@ -297,7 +300,7 @@ def test_ext_type_byte_width(): def test_ext_type_bit_width(): # Test for fixed-size binary types - ty = UuidType() + ty = pa.uuid() assert ty.bit_width == 128 ty = ParamExtType(5) assert ty.bit_width == 40 @@ -309,7 +312,7 @@ def test_ext_type_bit_width(): def test_ext_type_as_py(): - ty = UuidType() + ty = ExampleUuidType() expected = uuid4() scalar = pa.ExtensionScalar.from_storage(ty, expected.bytes) assert scalar.as_py() == expected @@ -342,12 +345,22 @@ def test_ext_type_as_py(): def test_uuid_type_pickle(pickle_module): for proto in range(0, pickle_module.HIGHEST_PROTOCOL + 1): - ty = UuidType() + ty = ExampleUuidType() ser = pickle_module.dumps(ty, protocol=proto) del ty ty = pickle_module.loads(ser) wr = weakref.ref(ty) - assert ty.extension_name == "pyarrow.tests.UuidType" + assert ty.extension_name == "pyarrow.tests.ExampleUuidType" + del ty + assert wr() is None + + for proto in range(0, pickle_module.HIGHEST_PROTOCOL + 1): + ty = pa.uuid() + ser = pickle_module.dumps(ty, protocol=proto) + del ty + ty = pickle_module.loads(ser) + wr = weakref.ref(ty) + assert ty.extension_name == "arrow.uuid" del ty assert wr() is None @@ -358,8 +371,8 @@ def test_ext_type_equality(): c = ParamExtType(6) assert a != b assert b == c - d = UuidType() - e = UuidType() + d = ExampleUuidType() + e = ExampleUuidType() assert a != d assert d == e @@ -403,7 +416,7 @@ def test_ext_array_equality(): storage1 = pa.array([b"0123456789abcdef"], type=pa.binary(16)) storage2 = pa.array([b"0123456789abcdef"], type=pa.binary(16)) storage3 = pa.array([], type=pa.binary(16)) - ty1 = UuidType() + ty1 = ExampleUuidType() ty2 = ParamExtType(16) a = pa.ExtensionArray.from_storage(ty1, storage1) @@ -451,9 +464,9 @@ def test_ext_scalar_from_array(): data = [b"0123456789abcdef", b"0123456789abcdef", b"zyxwvutsrqponmlk", None] storage = pa.array(data, type=pa.binary(16)) - ty1 = UuidType() + ty1 = ExampleUuidType() ty2 = ParamExtType(16) - ty3 = UuidType2() + ty3 = ExampleUuidType2() a = pa.ExtensionArray.from_storage(ty1, storage) b = pa.ExtensionArray.from_storage(ty2, storage) @@ -462,9 +475,9 @@ def test_ext_scalar_from_array(): scalars_a = list(a) assert len(scalars_a) == 4 - assert ty1.__arrow_ext_scalar_class__() == UuidScalarType - assert isinstance(a[0], UuidScalarType) - assert 
isinstance(scalars_a[0], UuidScalarType) + assert ty1.__arrow_ext_scalar_class__() == ExampleUuidScalarType + assert isinstance(a[0], ExampleUuidScalarType) + assert isinstance(scalars_a[0], ExampleUuidScalarType) for s, val in zip(scalars_a, data): assert isinstance(s, pa.ExtensionScalar) @@ -505,7 +518,7 @@ def test_ext_scalar_from_array(): def test_ext_scalar_from_storage(): - ty = UuidType() + ty = ExampleUuidType() s = pa.ExtensionScalar.from_storage(ty, None) assert isinstance(s, pa.ExtensionScalar) @@ -706,14 +719,14 @@ def test_cast_between_extension_types(): tiny_int_arr.cast(pa.int64()).cast(IntegerType()) # Between the same extension types is okay - array = pa.array([b'1' * 16, b'2' * 16], pa.binary(16)).cast(UuidType()) - out = array.cast(UuidType()) - assert out.type == UuidType() + array = pa.array([b'1' * 16, b'2' * 16], pa.binary(16)).cast(ExampleUuidType()) + out = array.cast(ExampleUuidType()) + assert out.type == ExampleUuidType() # Will still fail casting between extensions who share storage type, # can only cast between exactly the same extension types. with pytest.raises(TypeError, match='Casting from *'): - array.cast(UuidType2()) + array.cast(ExampleUuidType2()) def test_cast_to_extension_with_extension_storage(): @@ -744,10 +757,10 @@ def test_cast_nested_extension_types(data, type_factory): def test_casting_dict_array_to_extension_type(): storage = pa.array([b"0123456789abcdef"], type=pa.binary(16)) - arr = pa.ExtensionArray.from_storage(UuidType(), storage) + arr = pa.ExtensionArray.from_storage(ExampleUuidType(), storage) dict_arr = pa.DictionaryArray.from_arrays(pa.array([0, 0], pa.int32()), arr) - out = dict_arr.cast(UuidType()) + out = dict_arr.cast(ExampleUuidType()) assert isinstance(out, pa.ExtensionArray) assert out.to_pylist() == [UUID('30313233-3435-3637-3839-616263646566'), UUID('30313233-3435-3637-3839-616263646566')] @@ -1347,7 +1360,7 @@ def test_cpp_extension_in_python(tmpdir): mod = __import__('extensions') uuid_type = mod._make_uuid_type() - assert uuid_type.extension_name == "uuid" + assert uuid_type.extension_name == "example-uuid" assert uuid_type.storage_type == pa.binary(16) array = mod._make_uuid_array() @@ -1356,6 +1369,31 @@ def test_cpp_extension_in_python(tmpdir): assert array[0].as_py() == b'abcdefghijklmno0' assert array[1].as_py() == b'0onmlkjihgfedcba' + buf = ipc_write_batch(pa.RecordBatch.from_arrays([array], ["example-uuid"])) + + batch = ipc_read_batch(buf) + reconstructed_array = batch.column(0) + assert reconstructed_array.type == uuid_type + assert reconstructed_array == array + + +def test_uuid_extension(): + data = [b"0123456789abcdef", b"0123456789abcdef", + b"zyxwvutsrqponmlk", None] + + uuid_type = pa.uuid() + assert uuid_type.extension_name == "arrow.uuid" + assert uuid_type.storage_type == pa.binary(16) + assert uuid_type.__class__ is pa.UuidType + + storage = pa.array(data, pa.binary(16)) + array = pa.ExtensionArray.from_storage(uuid_type, storage) + assert array.type == uuid_type + + assert array.to_pylist() == [x if x is None else UUID(bytes=x) for x in data] + assert array[0].as_py() == UUID(bytes=data[0]) + assert array[3].as_py() is None + buf = ipc_write_batch(pa.RecordBatch.from_arrays([array], ["uuid"])) batch = ipc_read_batch(buf) @@ -1363,6 +1401,9 @@ def test_cpp_extension_in_python(tmpdir): assert reconstructed_array.type == uuid_type assert reconstructed_array == array + assert uuid_type.__arrow_ext_scalar_class__() == pa.UuidScalar + assert isinstance(array[0], pa.UuidScalar) + def test_tensor_type(): 
tensor_type = pa.fixed_shape_tensor(pa.int8(), [2, 3]) diff --git a/python/pyarrow/tests/test_gdb.py b/python/pyarrow/tests/test_gdb.py index 0d12d710dcf64..2ac2f55754fe5 100644 --- a/python/pyarrow/tests/test_gdb.py +++ b/python/pyarrow/tests/test_gdb.py @@ -409,7 +409,7 @@ def test_types_stack(gdb_arrow): check_stack_repr( gdb_arrow, "uuid_type", - ('arrow::ExtensionType "extension" ' + ('arrow::ExtensionType "extension" ' 'with storage type arrow::fixed_size_binary(16)')) @@ -447,7 +447,7 @@ def test_types_heap(gdb_arrow): check_heap_repr( gdb_arrow, "heap_uuid_type", - ('arrow::ExtensionType "extension" ' + ('arrow::ExtensionType "extension" ' 'with storage type arrow::fixed_size_binary(16)')) @@ -716,12 +716,12 @@ def test_scalars_stack(gdb_arrow): check_stack_repr( gdb_arrow, "extension_scalar", - ('arrow::ExtensionScalar of type "extension", ' + ('arrow::ExtensionScalar of type "extension", ' 'value arrow::FixedSizeBinaryScalar of size 16, ' 'value "0123456789abcdef"')) check_stack_repr( gdb_arrow, "extension_scalar_null", - 'arrow::ExtensionScalar of type "extension", null value') + 'arrow::ExtensionScalar of type "extension", null value') def test_scalars_heap(gdb_arrow): diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi index 563782f0c2643..f83ecc3aa4326 100644 --- a/python/pyarrow/types.pxi +++ b/python/pyarrow/types.pxi @@ -1765,6 +1765,25 @@ cdef class ExtensionType(BaseExtensionType): return ExtensionScalar +cdef class UuidType(BaseExtensionType): + """ + Concrete class for UUID extension type. + """ + + cdef void init(self, const shared_ptr[CDataType]& type) except *: + BaseExtensionType.init(self, type) + self.uuid_ext_type = type.get() + + def __arrow_ext_class__(self): + return UuidArray + + def __reduce__(self): + return uuid, () + + def __arrow_ext_scalar_class__(self): + return UuidScalar + + cdef class FixedShapeTensorType(BaseExtensionType): """ Concrete class for fixed shape tensor extension type. @@ -5208,6 +5227,21 @@ def run_end_encoded(run_end_type, value_type): return pyarrow_wrap_data_type(ree_type) +def uuid(): + """ + Create UuidType instance. + + Returns + ------- + type : UuidType + """ + + cdef UuidType out = UuidType.__new__(UuidType) + c_uuid_ext_type = GetResultValue(CUuidType.Make()) + out.init(c_uuid_ext_type) + return out + + def fixed_shape_tensor(DataType value_type, shape, dim_names=None, permutation=None): """ Create instance of fixed shape tensor extension type with shape and optional From 8eb7bd4115da0027aad6362f0fe0901ec44b0616 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 27 Aug 2024 09:12:57 +0900 Subject: [PATCH 028/186] MINOR: [Go] Bump github.com/hamba/avro/v2 from 2.24.1 to 2.25.0 in /go (#43829) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bumps [github.com/hamba/avro/v2](https://github.com/hamba/avro) from 2.24.1 to 2.25.0.
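(A usage sketch for the `arrow.uuid` extension type introduced in the UUID patch above. This is minimal and illustrative, assuming a pyarrow build that includes those changes; it only exercises APIs that appear in the diff: `pa.uuid()`, `pa.ExtensionArray.from_storage`, and `UuidScalar.as_py()`.)

```python
import uuid
import pyarrow as pa

# The canonical UUID extension type; storage is fixed_size_binary(16).
ty = pa.uuid()
assert ty.extension_name == "arrow.uuid"
assert ty.storage_type == pa.binary(16)

# Wrap 16-byte storage values in the extension type.
storage = pa.array([uuid.uuid4().bytes, None], type=pa.binary(16))
arr = pa.ExtensionArray.from_storage(ty, storage)

# Scalars convert to Python's uuid.UUID; nulls stay None.
print(arr[0].as_py())  # -> a uuid.UUID instance
print(arr[1].as_py())  # -> None
```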
Release notes: sourced from github.com/hamba/avro/v2's releases.

v2.25.0
Full Changelog: https://github.com/hamba/avro/compare/v2.24.1...v2.24.2
[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=github.com/hamba/avro/v2&package-manager=go_modules&previous-version=2.24.1&new-version=2.25.0)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@ dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Dependabot commands and options
You can trigger Dependabot actions by commenting on this PR: - `@ dependabot rebase` will rebase this PR - `@ dependabot recreate` will recreate this PR, overwriting any edits that have been made to it - `@ dependabot merge` will merge this PR after your CI passes on it - `@ dependabot squash and merge` will squash and merge this PR after your CI passes on it - `@ dependabot cancel merge` will cancel a previously requested merge and block automerging - `@ dependabot reopen` will reopen this PR if it is closed - `@ dependabot close` will close this PR and stop Dependabot recreating it. You can achieve the same result by closing it manually - `@ dependabot show ignore conditions` will show all of the ignore conditions of the specified dependency - `@ dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself) - `@ dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself) - `@ dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself)
Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: Sutou Kouhei --- go/go.mod | 2 +- go/go.sum | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/go/go.mod b/go/go.mod index 9f4222a541bb6..97ac05685970c 100644 --- a/go/go.mod +++ b/go/go.mod @@ -47,7 +47,7 @@ require ( require ( github.com/google/uuid v1.6.0 - github.com/hamba/avro/v2 v2.24.1 + github.com/hamba/avro/v2 v2.25.0 github.com/huandu/xstrings v1.4.0 github.com/substrait-io/substrait-go v0.6.0 github.com/tidwall/sjson v1.2.5 diff --git a/go/go.sum b/go/go.sum index c7eb3a66deeec..bd761e1589453 100644 --- a/go/go.sum +++ b/go/go.sum @@ -43,8 +43,8 @@ github.com/google/pprof v0.0.0-20221118152302-e6195bd50e26 h1:Xim43kblpZXfIBQsbu github.com/google/pprof v0.0.0-20221118152302-e6195bd50e26/go.mod h1:dDKJzRmX4S37WGHujM7tX//fmj1uioxKzKxz3lo4HJo= github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= -github.com/hamba/avro/v2 v2.24.1 h1:Xi+7AnhaAc41aA/jmmYpxMsdEDOf1rdup6NJ85P7q2I= -github.com/hamba/avro/v2 v2.24.1/go.mod h1:7vDfy/2+kYCE8WUHoj2et59GTv0ap7ptktMXu0QHePI= +github.com/hamba/avro/v2 v2.25.0 h1:9qig/K4VP5tMq6DuKGfI6YdXncTkPJT1IJDMSv82EeI= +github.com/hamba/avro/v2 v2.25.0/go.mod h1:I8glyswHnpED3Nlx2ZdUe+4LJnCOOyiCzLMno9i/Uu0= github.com/hashicorp/golang-lru/v2 v2.0.7 h1:a+bsQ5rvGLjzHuww6tVxozPZFVghXaHOwFs4luLUK2k= github.com/hashicorp/golang-lru/v2 v2.0.7/go.mod h1:QeFd9opnmA6QUJc5vARoKUSoFhyfM2/ZepoAG6RGpeM= github.com/hexops/gotextdiff v1.0.3 h1:gitA9+qJrrTCsiCl7+kh75nPqQt1cx4ZkudSTLoUqJM= From 93c5ddb957bb93421a8f84dbd7c5a5b7be2d6d45 Mon Sep 17 00:00:00 2001 From: PANKAJ9768 <48675737+PANKAJ9768@users.noreply.github.com> Date: Tue, 27 Aug 2024 05:59:09 +0530 Subject: [PATCH 029/186] GH-43667: [Java] Keeping Flight default header size consistent between server and client (#43697) ### Rationale for this change ### What changes are included in this PR? Flight client can send header size larger than server can accept. This PR is to keep default values consistent across server and client. ### Are these changes tested? ### Are there any user-facing changes? 
* GitHub Issue: #43667 Authored-by: pankaj kesari Signed-off-by: David Li --- .../org/apache/arrow/flight/FlightServer.java | 7 ++ .../arrow/flight/TestFlightService.java | 73 +++++++++++++++++++ 2 files changed, 80 insertions(+) diff --git a/java/flight/flight-core/src/main/java/org/apache/arrow/flight/FlightServer.java b/java/flight/flight-core/src/main/java/org/apache/arrow/flight/FlightServer.java index 05dbe42c49172..ac761457f57fd 100644 --- a/java/flight/flight-core/src/main/java/org/apache/arrow/flight/FlightServer.java +++ b/java/flight/flight-core/src/main/java/org/apache/arrow/flight/FlightServer.java @@ -188,6 +188,7 @@ public static final class Builder { private CallHeaderAuthenticator headerAuthenticator = CallHeaderAuthenticator.NO_OP; private ExecutorService executor = null; private int maxInboundMessageSize = MAX_GRPC_MESSAGE_SIZE; + private int maxHeaderListSize = MAX_GRPC_MESSAGE_SIZE; private int backpressureThreshold = DEFAULT_BACKPRESSURE_THRESHOLD; private InputStream certChain; private InputStream key; @@ -324,6 +325,7 @@ public FlightServer build() { builder .executor(exec) .maxInboundMessageSize(maxInboundMessageSize) + .maxInboundMetadataSize(maxHeaderListSize) .addService( ServerInterceptors.intercept( flightService, @@ -366,6 +368,11 @@ public FlightServer build() { return new FlightServer(location, builder.build(), grpcExecutor); } + public Builder setMaxHeaderListSize(int maxHeaderListSize) { + this.maxHeaderListSize = maxHeaderListSize; + return this; + } + /** * Set the maximum size of a message. Defaults to "unlimited", depending on the underlying * transport. diff --git a/java/flight/flight-core/src/test/java/org/apache/arrow/flight/TestFlightService.java b/java/flight/flight-core/src/test/java/org/apache/arrow/flight/TestFlightService.java index 5ebeb44c1d36e..fc3f83e4eafd3 100644 --- a/java/flight/flight-core/src/test/java/org/apache/arrow/flight/TestFlightService.java +++ b/java/flight/flight-core/src/test/java/org/apache/arrow/flight/TestFlightService.java @@ -27,6 +27,7 @@ import java.nio.charset.StandardCharsets; import java.util.Collections; import java.util.Optional; +import java.util.Random; import org.apache.arrow.flight.impl.Flight; import org.apache.arrow.memory.BufferAllocator; import org.apache.arrow.memory.RootAllocator; @@ -152,4 +153,76 @@ public FlightInfo getFlightInfo(CallContext context, FlightDescriptor descriptor assertEquals("No schema is present in FlightInfo", e.getMessage()); } } + + /** + * Test for GH-41584 where flight defaults for header size was not in sync b\w client and server. 
+ */ + @Test + public void testHeaderSizeExchangeInService() throws Exception { + final FlightProducer producer = + new NoOpFlightProducer() { + @Override + public FlightInfo getFlightInfo(CallContext context, FlightDescriptor descriptor) { + String longHeader = + context.getMiddleware(FlightConstants.HEADER_KEY).headers().get("long-header"); + return new FlightInfo( + null, + descriptor, + Collections.emptyList(), + 0, + 0, + false, + IpcOption.DEFAULT, + longHeader.getBytes(StandardCharsets.UTF_8)); + } + }; + + String headerVal = generateRandom(1024 * 10); + FlightCallHeaders callHeaders = new FlightCallHeaders(); + callHeaders.insert("long-header", headerVal); + // sever with default header limit same as client + try (final FlightServer s = + FlightServer.builder(allocator, forGrpcInsecure(LOCALHOST, 0), producer) + .build() + .start(); + final FlightClient client = FlightClient.builder(allocator, s.getLocation()).build()) { + FlightInfo flightInfo = + client.getInfo(FlightDescriptor.path("test"), new HeaderCallOption(callHeaders)); + assertEquals(Optional.empty(), flightInfo.getSchemaOptional()); + assertEquals(new Schema(Collections.emptyList()), flightInfo.getSchema()); + assertArrayEquals(flightInfo.getAppMetadata(), headerVal.getBytes(StandardCharsets.UTF_8)); + } + // server with 15kb header limit + try (final FlightServer s = + FlightServer.builder(allocator, forGrpcInsecure(LOCALHOST, 0), producer) + .setMaxHeaderListSize(1024 * 15) + .build() + .start(); + final FlightClient client = FlightClient.builder(allocator, s.getLocation()).build()) { + FlightInfo flightInfo = + client.getInfo(FlightDescriptor.path("test"), new HeaderCallOption(callHeaders)); + assertEquals(Optional.empty(), flightInfo.getSchemaOptional()); + assertEquals(new Schema(Collections.emptyList()), flightInfo.getSchema()); + assertArrayEquals(flightInfo.getAppMetadata(), headerVal.getBytes(StandardCharsets.UTF_8)); + + callHeaders.insert("another-header", headerVal + headerVal); + FlightRuntimeException e = + assertThrows( + FlightRuntimeException.class, + () -> + client.getInfo(FlightDescriptor.path("test"), new HeaderCallOption(callHeaders))); + assertEquals("http2 exception", e.getMessage()); + } + } + + private static String generateRandom(int size) { + String aToZ = "ABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890"; + Random random = new Random(); + StringBuilder res = new StringBuilder(); + for (int i = 0; i < size; i++) { + int randIndex = random.nextInt(aToZ.length()); + res.append(aToZ.charAt(randIndex)); + } + return res.toString(); + } } From 11f92491b1d2ecf700e6e023a1e413ec4c4345ae Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 27 Aug 2024 11:06:13 +0900 Subject: [PATCH 030/186] MINOR: [Go] Bump github.com/substrait-io/substrait-go from 0.6.0 to 0.7.0 in /go (#43830) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bumps [github.com/substrait-io/substrait-go](https://github.com/substrait-io/substrait-go) from 0.6.0 to 0.7.0.
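(Returning to the Flight header-size change above: a minimal server-side sketch. It assumes the same `allocator`, `producer`, `LOCALHOST`, and `forGrpcInsecure` setup as the test in that patch; with no `setMaxHeaderListSize` call, the server default now matches the client default.)

```java
// Raise the server's gRPC metadata (header) limit to 15 KiB so that clients
// sending large headers are not rejected with an "http2 exception".
try (FlightServer server =
        FlightServer.builder(allocator, forGrpcInsecure(LOCALHOST, 0), producer)
            .setMaxHeaderListSize(1024 * 15)
            .build()
            .start();
    FlightClient client =
        FlightClient.builder(allocator, server.getLocation()).build()) {
  // Calls carrying headers up to the configured limit now succeed.
}
```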
Release notes: sourced from github.com/substrait-io/substrait-go's releases.

v0.7.0 (2024-08-25)

Features
- Add convenience literal APIs (#47) (597afdb): introduce literal package

Changes to the build process or auxiliary tools and libraries such as documentation generation
- extensions: Minor refactoring in extension_mgr.go (#45) (cbd28cb)
- Move typeName maps to types package (#46) (5556c23)

Commits
- 597afdb feat: Add convenience literal APIs (#47)
- e77df67 feat(types): Make time precision value explicit (#49)
- a3e8ee0 feat(substrait): Update to substrait v0.55.0 (#48)
- 2229c12 ci(build-test): golangci should use the go.mod version of golang (#51)
- cbd28cb chore(extensions): Minor refactoring in extension_mgr.go (#45)
- 5556c23 chore: Move typeName maps to types package (#46)
- dd790cb Add a function registry for a given BFT dialect (#32)
- 828636c ci(build-test): Add golangci-lint to do import checking and other linting (#42)
- See full diff in compare view
[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=github.com/substrait-io/substrait-go&package-manager=go_modules&previous-version=0.6.0&new-version=0.7.0)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@ dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: Sutou Kouhei --- go/go.mod | 2 +- go/go.sum | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/go/go.mod b/go/go.mod index 97ac05685970c..a995eee24d563 100644 --- a/go/go.mod +++ b/go/go.mod @@ -49,7 +49,7 @@ require ( github.com/google/uuid v1.6.0 github.com/hamba/avro/v2 v2.25.0 github.com/huandu/xstrings v1.4.0 - github.com/substrait-io/substrait-go v0.6.0 + github.com/substrait-io/substrait-go v0.7.0 github.com/tidwall/sjson v1.2.5 ) diff --git a/go/go.sum b/go/go.sum index bd761e1589453..6f22e11aef03a 100644 --- a/go/go.sum +++ b/go/go.sum @@ -99,8 +99,8 @@ github.com/stretchr/objx v0.5.2/go.mod h1:FRsXN1f5AsAjCGJKqEizvkpNtU+EGNCLh3NxZ/ github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg= github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= -github.com/substrait-io/substrait-go v0.6.0 h1:n2G/SGmrn7U5Q39VA8WeM2UfVL5Y/6HX8WAP9uJLNk4= -github.com/substrait-io/substrait-go v0.6.0/go.mod h1:cl8Wsc7aBPDfcHp9+OrUqGpjkgrYlhcDsH/lMP6KUZA= +github.com/substrait-io/substrait-go v0.7.0 h1:53yi73t4wW383+RD1YuhXhbjhP1KzF9GCxPC7SsRlqc= +github.com/substrait-io/substrait-go v0.7.0/go.mod h1:7mjSvIaxk94bOF+YZn/vBOpHK4DWTpBv7nC/btjXCmc= github.com/tidwall/gjson v1.14.2 h1:6BBkirS0rAHjumnjHF6qgy5d2YAJ1TLIaFE2lzfOLqo= github.com/tidwall/gjson v1.14.2/go.mod h1:/wbyibRr2FHMks5tjHJ5F8dMZh3AcwJEMf5vlfC0lxk= github.com/tidwall/match v1.1.1 h1:+Ho715JplO36QYgwN9PGYNhgZvoUSc9X2c80KVTi+GA= From a49493d96bc3021af1a126ce33f859bfb7a2ec80 Mon Sep 17 00:00:00 2001 From: David Li Date: Tue, 27 Aug 2024 11:44:19 +0900 Subject: [PATCH 031/186] MINOR: [Java] Downgrade gRPC to 1.65 (#43839) ### Rationale for this change Newer versions don't run in all CI pipelines due to protoc using a newer glibc. ### What changes are included in this PR? This reverts commit 4af1e491df7ac22217656668b65c3e8d55f5b5ab. ### Are these changes tested? N/A ### Are there any user-facing changes? No Authored-by: David Li Signed-off-by: David Li --- java/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/java/pom.xml b/java/pom.xml index 77feed12f3f1d..f78d02c0c650f 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -97,7 +97,7 @@ under the License. 2.0.16 33.2.1-jre 4.1.112.Final - 1.66.0 + 1.65.0 3.25.4 2.17.2 3.4.0 From 23fe1ce3361b9a6825fea77deb20d0bd7f247fe2 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 27 Aug 2024 11:56:45 +0900 Subject: [PATCH 032/186] MINOR: [Java] Bump org.apache.commons:commons-compress from 1.27.0 to 1.27.1 in /java (#43826) Bumps org.apache.commons:commons-compress from 1.27.0 to 1.27.1. [![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=org.apache.commons:commons-compress&package-manager=maven&previous-version=1.27.0&new-version=1.27.1)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@ dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: David Li --- java/compression/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/java/compression/pom.xml b/java/compression/pom.xml index a1f2bc861da1f..46ed8796423eb 100644 --- a/java/compression/pom.xml +++ b/java/compression/pom.xml @@ -50,7 +50,7 @@ under the License. org.apache.commons commons-compress - 1.27.0 + 1.27.1 com.github.luben From fa5d158282b316819e4e23e0903b696467a61d38 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 26 Aug 2024 21:01:45 -0700 Subject: [PATCH 033/186] MINOR: [C#] Bump Microsoft.NET.Test.Sdk from 17.10.0 to 17.11.0 in /csharp (#43822) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bumps [Microsoft.NET.Test.Sdk](https://github.com/microsoft/vstest) from 17.10.0 to 17.11.0.
Release notes: sourced from Microsoft.NET.Test.Sdk's releases.

v17.11.0
Full Changelog: https://github.com/microsoft/vstest/compare/v17.10.0...v17.11.0-release-24352-06

v17.11.0-release-24373-02
... (truncated)
[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=Microsoft.NET.Test.Sdk&package-manager=nuget&previous-version=17.10.0&new-version=17.11.0)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@ dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: Curt Hagenlocher --- .../Apache.Arrow.Compression.Tests.csproj | 2 +- .../Apache.Arrow.Flight.Sql.Tests.csproj | 2 +- .../Apache.Arrow.Flight.Tests/Apache.Arrow.Flight.Tests.csproj | 2 +- csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/csharp/test/Apache.Arrow.Compression.Tests/Apache.Arrow.Compression.Tests.csproj b/csharp/test/Apache.Arrow.Compression.Tests/Apache.Arrow.Compression.Tests.csproj index 047cdb94b963e..4ea02e0ed21c0 100644 --- a/csharp/test/Apache.Arrow.Compression.Tests/Apache.Arrow.Compression.Tests.csproj +++ b/csharp/test/Apache.Arrow.Compression.Tests/Apache.Arrow.Compression.Tests.csproj @@ -7,7 +7,7 @@ - + diff --git a/csharp/test/Apache.Arrow.Flight.Sql.Tests/Apache.Arrow.Flight.Sql.Tests.csproj b/csharp/test/Apache.Arrow.Flight.Sql.Tests/Apache.Arrow.Flight.Sql.Tests.csproj index dc95f9edf9f7f..fd8274230ec64 100644 --- a/csharp/test/Apache.Arrow.Flight.Sql.Tests/Apache.Arrow.Flight.Sql.Tests.csproj +++ b/csharp/test/Apache.Arrow.Flight.Sql.Tests/Apache.Arrow.Flight.Sql.Tests.csproj @@ -6,7 +6,7 @@ - + diff --git a/csharp/test/Apache.Arrow.Flight.Tests/Apache.Arrow.Flight.Tests.csproj b/csharp/test/Apache.Arrow.Flight.Tests/Apache.Arrow.Flight.Tests.csproj index e68a97670cc7e..eae9ab746f283 100644 --- a/csharp/test/Apache.Arrow.Flight.Tests/Apache.Arrow.Flight.Tests.csproj +++ b/csharp/test/Apache.Arrow.Flight.Tests/Apache.Arrow.Flight.Tests.csproj @@ -6,7 +6,7 @@ - + diff --git a/csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj b/csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj index f05338313063c..ee71b203218f8 100644 --- a/csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj +++ b/csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj @@ -16,7 +16,7 @@ - + all From c30bb6a84536d66bc1179e2a051915d5c34b2616 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Tue, 27 Aug 2024 14:49:45 +0900 Subject: [PATCH 034/186] GH-41056: [GLib][FlightRPC] Add gaflight_client_do_put() and related APIs (#43813) ### Rationale for this change DoPut is needed to upload data. ### What changes are included in this PR? * Add `gaflight_client_do_put()` * Add `GAFlightStreamWriter` * Add `GAFlightMetadataReader` * Add `GAFlightDoPutResult` * Fix `GAFlightRecordBatchWriter` API ### Are these changes tested? No. They aren't tested yet. We will add tests when we implement server side DoPut. ### Are there any user-facing changes? Yes. * GitHub Issue: #41056 Authored-by: Sutou Kouhei Signed-off-by: Sutou Kouhei --- c_glib/arrow-flight-glib/client.cpp | 337 +++++++++++++++++++++++++++- c_glib/arrow-flight-glib/client.h | 46 ++++ c_glib/arrow-flight-glib/client.hpp | 16 ++ c_glib/arrow-flight-glib/common.cpp | 102 ++------- c_glib/arrow-flight-glib/common.h | 8 +- c_glib/arrow-glib/writer.hpp | 4 + 6 files changed, 421 insertions(+), 92 deletions(-) diff --git a/c_glib/arrow-flight-glib/client.cpp b/c_glib/arrow-flight-glib/client.cpp index 80c47e336f872..23f59c9da69ad 100644 --- a/c_glib/arrow-flight-glib/client.cpp +++ b/c_glib/arrow-flight-glib/client.cpp @@ -33,10 +33,19 @@ G_BEGIN_DECLS * #GAFlightStreamReader is a class for reading record batches from a * server. * + * #GAFlightStreamWriter is a class for writing record batches to a + * server. + * + * #GAFlightMetadataReader is a class for reading metadata from a + * server. + * * #GAFlightCallOptions is a class for options of each call. 
* * #GAFlightClientOptions is a class for options of each client. * + * #GAFlightDoPutResult is a class that has gaflight_client_do_put() + * result. + * * #GAFlightClient is a class for Apache Arrow Flight client. * * Since: 5.0.0 @@ -56,6 +65,128 @@ gaflight_stream_reader_class_init(GAFlightStreamReaderClass *klass) { } +G_DEFINE_TYPE(GAFlightStreamWriter, + gaflight_stream_writer, + GAFLIGHT_TYPE_RECORD_BATCH_WRITER) + +static void +gaflight_stream_writer_init(GAFlightStreamWriter *object) +{ +} + +static void +gaflight_stream_writer_class_init(GAFlightStreamWriterClass *klass) +{ +} + +/** + * gaflight_stream_writer_done_writing: + * @writer: A #GAFlightStreamWriter. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: %TRUE on success, %FALSE on error. + * + * Since: 18.0.0 + */ +gboolean +gaflight_stream_writer_done_writing(GAFlightStreamWriter *writer, GError **error) +{ + auto flight_writer = std::static_pointer_cast( + garrow_record_batch_writer_get_raw(GARROW_RECORD_BATCH_WRITER(writer))); + return garrow::check(error, + flight_writer->DoneWriting(), + "[flight-stream-writer][done-writing]"); +} + +struct GAFlightMetadataReaderPrivate +{ + arrow::flight::FlightMetadataReader *reader; +}; + +enum { + PROP_METADATA_READER_READER = 1, +}; + +G_DEFINE_TYPE_WITH_PRIVATE(GAFlightMetadataReader, + gaflight_metadata_reader, + G_TYPE_OBJECT) + +#define GAFLIGHT_METADATA_READER_GET_PRIVATE(object) \ + static_cast( \ + gaflight_metadata_reader_get_instance_private(GAFLIGHT_METADATA_READER(object))) + +static void +gaflight_metadata_reader_finalize(GObject *object) +{ + auto priv = GAFLIGHT_METADATA_READER_GET_PRIVATE(object); + delete priv->reader; + G_OBJECT_CLASS(gaflight_metadata_reader_parent_class)->finalize(object); +} + +static void +gaflight_metadata_reader_set_property(GObject *object, + guint prop_id, + const GValue *value, + GParamSpec *pspec) +{ + auto priv = GAFLIGHT_METADATA_READER_GET_PRIVATE(object); + + switch (prop_id) { + case PROP_METADATA_READER_READER: + priv->reader = + static_cast(g_value_get_pointer(value)); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void +gaflight_metadata_reader_init(GAFlightMetadataReader *object) +{ +} + +static void +gaflight_metadata_reader_class_init(GAFlightMetadataReaderClass *klass) +{ + auto gobject_class = G_OBJECT_CLASS(klass); + + gobject_class->finalize = gaflight_metadata_reader_finalize; + gobject_class->set_property = gaflight_metadata_reader_set_property; + + GParamSpec *spec; + spec = g_param_spec_pointer( + "reader", + nullptr, + nullptr, + static_cast(G_PARAM_WRITABLE | G_PARAM_CONSTRUCT_ONLY)); + g_object_class_install_property(gobject_class, PROP_METADATA_READER_READER, spec); +} + +/** + * gaflight_metadata_reader_read: + * @reader: A #GAFlightMetadataReader. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: (transfer full): The metadata on success, %NULL on error. 
+ * + * Since: 18.0.0 + */ +GArrowBuffer * +gaflight_metadata_reader_read(GAFlightMetadataReader *reader, GError **error) +{ + auto flight_reader = gaflight_metadata_reader_get_raw(reader); + std::shared_ptr metadata; + if (garrow::check(error, + flight_reader->ReadMetadata(&metadata), + "[flight-metadata-reader][read]")) { + return garrow_buffer_new_raw(&metadata); + } else { + return nullptr; + } +} + typedef struct GAFlightCallOptionsPrivate_ { arrow::flight::FlightCallOptions options; @@ -385,6 +516,137 @@ gaflight_client_options_new(void) g_object_new(GAFLIGHT_TYPE_CLIENT_OPTIONS, NULL)); } +struct GAFlightDoPutResultPrivate +{ + GAFlightStreamWriter *writer; + GAFlightMetadataReader *reader; +}; + +enum { + PROP_DO_PUT_RESULT_RESULT = 1, + PROP_DO_PUT_RESULT_WRITER, + PROP_DO_PUT_RESULT_READER, +}; + +G_DEFINE_TYPE_WITH_PRIVATE(GAFlightDoPutResult, gaflight_do_put_result, G_TYPE_OBJECT) + +#define GAFLIGHT_DO_PUT_RESULT_GET_PRIVATE(object) \ + static_cast( \ + gaflight_do_put_result_get_instance_private(GAFLIGHT_DO_PUT_RESULT(object))) + +static void +gaflight_do_put_result_dispose(GObject *object) +{ + auto priv = GAFLIGHT_DO_PUT_RESULT_GET_PRIVATE(object); + + if (priv->writer) { + g_object_unref(priv->writer); + priv->writer = nullptr; + } + + if (priv->reader) { + g_object_unref(priv->reader); + priv->reader = nullptr; + } + + G_OBJECT_CLASS(gaflight_do_put_result_parent_class)->dispose(object); +} + +static void +gaflight_do_put_result_init(GAFlightDoPutResult *object) +{ +} + +static void +gaflight_do_put_result_set_property(GObject *object, + guint prop_id, + const GValue *value, + GParamSpec *pspec) +{ + auto priv = GAFLIGHT_DO_PUT_RESULT_GET_PRIVATE(object); + + switch (prop_id) { + case PROP_DO_PUT_RESULT_RESULT: + { + auto result = static_cast( + g_value_get_pointer(value)); + priv->writer = gaflight_stream_writer_new_raw(result->writer.release()); + priv->reader = gaflight_metadata_reader_new_raw(result->reader.release()); + break; + } + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void +gaflight_do_put_result_get_property(GObject *object, + guint prop_id, + GValue *value, + GParamSpec *pspec) +{ + auto priv = GAFLIGHT_DO_PUT_RESULT_GET_PRIVATE(object); + + switch (prop_id) { + case PROP_DO_PUT_RESULT_WRITER: + g_value_set_object(value, priv->writer); + break; + case PROP_DO_PUT_RESULT_READER: + g_value_set_object(value, priv->reader); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void +gaflight_do_put_result_class_init(GAFlightDoPutResultClass *klass) +{ + auto gobject_class = G_OBJECT_CLASS(klass); + + gobject_class->dispose = gaflight_do_put_result_dispose; + gobject_class->set_property = gaflight_do_put_result_set_property; + gobject_class->get_property = gaflight_do_put_result_get_property; + + GParamSpec *spec; + spec = g_param_spec_pointer( + "result", + nullptr, + nullptr, + static_cast(G_PARAM_WRITABLE | G_PARAM_CONSTRUCT_ONLY)); + g_object_class_install_property(gobject_class, PROP_DO_PUT_RESULT_RESULT, spec); + + /** + * GAFlightDoPutResult:writer: + * + * A writer to write record batches to. + * + * Since: 18.0.0 + */ + spec = g_param_spec_object("writer", + nullptr, + nullptr, + GAFLIGHT_TYPE_STREAM_WRITER, + static_cast(G_PARAM_READABLE)); + g_object_class_install_property(gobject_class, PROP_DO_PUT_RESULT_WRITER, spec); + + /** + * GAFlightDoPutResult:reader: + * + * A reader for application metadata from the server. 
+ * + * Since: 18.0.0 + */ + spec = g_param_spec_object("reader", + nullptr, + nullptr, + GAFLIGHT_TYPE_METADATA_READER, + static_cast(G_PARAM_READABLE)); + g_object_class_install_property(gobject_class, PROP_DO_PUT_RESULT_READER, spec); +} + struct GAFlightClientPrivate { std::shared_ptr client; @@ -661,6 +923,51 @@ gaflight_client_do_get(GAFlightClient *client, return gaflight_stream_reader_new_raw(flight_reader.release(), TRUE); } +/** + * gaflight_client_do_put: + * @client: A #GAFlightClient. + * @descriptor: A #GAFlightDescriptor. + * @schema: A #GArrowSchema. + * @options: (nullable): A #GAFlightCallOptions. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Upload data to a Flight described by the given descriptor. The + * caller must call garrow_record_batch_writer_close() on the + * returned stream once they are done writing. + * + * The reader and writer are linked; closing the writer will also + * close the reader. Use garrow_flight_stream_writer_done_writing() to + * only close the write side of the channel. + * + * Returns: (nullable) (transfer full): + * The #GAFlighDoPutResult holding a reader and a writer on success, + * %NULL on error. + * + * Since: 18.0.0 + */ +GAFlightDoPutResult * +gaflight_client_do_put(GAFlightClient *client, + GAFlightDescriptor *descriptor, + GArrowSchema *schema, + GAFlightCallOptions *options, + GError **error) +{ + auto flight_client = gaflight_client_get_raw(client); + auto flight_descriptor = gaflight_descriptor_get_raw(descriptor); + auto arrow_schema = garrow_schema_get_raw(schema); + arrow::flight::FlightCallOptions flight_default_options; + auto flight_options = &flight_default_options; + if (options) { + flight_options = gaflight_call_options_get_raw(options); + } + auto result = flight_client->DoPut(*flight_options, *flight_descriptor, arrow_schema); + if (!garrow::check(error, result, "[flight-client][do-put]")) { + return nullptr; + } + auto flight_result = std::move(*result); + return gaflight_do_put_result_new_raw(&flight_result); +} + G_END_DECLS GAFlightStreamReader * @@ -672,7 +979,28 @@ gaflight_stream_reader_new_raw(arrow::flight::FlightStreamReader *flight_reader, flight_reader, "is-owner", is_owner, - NULL)); + nullptr)); +} + +GAFlightStreamWriter * +gaflight_stream_writer_new_raw(arrow::flight::FlightStreamWriter *flight_writer) +{ + return GAFLIGHT_STREAM_WRITER( + g_object_new(GAFLIGHT_TYPE_STREAM_WRITER, "writer", flight_writer, nullptr)); +} + +GAFlightMetadataReader * +gaflight_metadata_reader_new_raw(arrow::flight::FlightMetadataReader *flight_reader) +{ + return GAFLIGHT_METADATA_READER( + g_object_new(GAFLIGHT_TYPE_METADATA_READER, "reader", flight_reader, nullptr)); +} + +arrow::flight::FlightMetadataReader * +gaflight_metadata_reader_get_raw(GAFlightMetadataReader *reader) +{ + auto priv = GAFLIGHT_METADATA_READER_GET_PRIVATE(reader); + return priv->reader; } arrow::flight::FlightCallOptions * @@ -689,6 +1017,13 @@ gaflight_client_options_get_raw(GAFlightClientOptions *options) return &(priv->options); } +GAFlightDoPutResult * +gaflight_do_put_result_new_raw(arrow::flight::FlightClient::DoPutResult *flight_result) +{ + return GAFLIGHT_DO_PUT_RESULT( + g_object_new(GAFLIGHT_TYPE_DO_PUT_RESULT, "result", flight_result, nullptr)); +} + std::shared_ptr gaflight_client_get_raw(GAFlightClient *client) { diff --git a/c_glib/arrow-flight-glib/client.h b/c_glib/arrow-flight-glib/client.h index a91bbe55e3c04..12c5a06b810e1 100644 --- a/c_glib/arrow-flight-glib/client.h +++ 
b/c_glib/arrow-flight-glib/client.h @@ -35,6 +35,35 @@ struct _GAFlightStreamReaderClass GAFlightRecordBatchReaderClass parent_class; }; +#define GAFLIGHT_TYPE_STREAM_WRITER (gaflight_stream_writer_get_type()) +GAFLIGHT_AVAILABLE_IN_18_0 +G_DECLARE_DERIVABLE_TYPE(GAFlightStreamWriter, + gaflight_stream_writer, + GAFLIGHT, + STREAM_WRITER, + GAFlightRecordBatchWriter) +struct _GAFlightStreamWriterClass +{ + GAFlightRecordBatchWriterClass parent_class; +}; + +GAFLIGHT_AVAILABLE_IN_18_0 +gboolean +gaflight_stream_writer_done_writing(GAFlightStreamWriter *writer, GError **error); + +#define GAFLIGHT_TYPE_METADATA_READER (gaflight_metadata_reader_get_type()) +GAFLIGHT_AVAILABLE_IN_18_0 +G_DECLARE_DERIVABLE_TYPE( + GAFlightMetadataReader, gaflight_metadata_reader, GAFLIGHT, METADATA_READER, GObject) +struct _GAFlightMetadataReaderClass +{ + GObjectClass parent_class; +}; + +GAFLIGHT_AVAILABLE_IN_18_0 +GArrowBuffer * +gaflight_metadata_reader_read(GAFlightMetadataReader *reader, GError **error); + #define GAFLIGHT_TYPE_CALL_OPTIONS (gaflight_call_options_get_type()) GAFLIGHT_AVAILABLE_IN_5_0 G_DECLARE_DERIVABLE_TYPE( @@ -75,6 +104,15 @@ GAFLIGHT_AVAILABLE_IN_5_0 GAFlightClientOptions * gaflight_client_options_new(void); +#define GAFLIGHT_TYPE_DO_PUT_RESULT (gaflight_do_put_result_get_type()) +GAFLIGHT_AVAILABLE_IN_18_0 +G_DECLARE_DERIVABLE_TYPE( + GAFlightDoPutResult, gaflight_do_put_result, GAFLIGHT, DO_PUT_RESULT, GObject) +struct _GAFlightDoPutResultClass +{ + GObjectClass parent_class; +}; + #define GAFLIGHT_TYPE_CLIENT (gaflight_client_get_type()) GAFLIGHT_AVAILABLE_IN_5_0 G_DECLARE_DERIVABLE_TYPE(GAFlightClient, gaflight_client, GAFLIGHT, CLIENT, GObject) @@ -124,4 +162,12 @@ gaflight_client_do_get(GAFlightClient *client, GAFlightCallOptions *options, GError **error); +GAFLIGHT_AVAILABLE_IN_18_0 +GAFlightDoPutResult * +gaflight_client_do_put(GAFlightClient *client, + GAFlightDescriptor *descriptor, + GArrowSchema *schema, + GAFlightCallOptions *options, + GError **error); + G_END_DECLS diff --git a/c_glib/arrow-flight-glib/client.hpp b/c_glib/arrow-flight-glib/client.hpp index 185a28e6dc4bd..888f87ecb5732 100644 --- a/c_glib/arrow-flight-glib/client.hpp +++ b/c_glib/arrow-flight-glib/client.hpp @@ -28,6 +28,18 @@ GAFlightStreamReader * gaflight_stream_reader_new_raw(arrow::flight::FlightStreamReader *flight_reader, gboolean is_owner); +GAFLIGHT_EXTERN +GAFlightStreamWriter * +gaflight_stream_writer_new_raw(arrow::flight::FlightStreamWriter *flight_writer); + +GAFLIGHT_EXTERN +GAFlightMetadataReader * +gaflight_metadata_reader_new_raw(arrow::flight::FlightMetadataReader *flight_reader); + +GAFLIGHT_EXTERN +arrow::flight::FlightMetadataReader * +gaflight_metadata_reader_get_raw(GAFlightMetadataReader *reader); + GAFLIGHT_EXTERN arrow::flight::FlightCallOptions * gaflight_call_options_get_raw(GAFlightCallOptions *options); @@ -36,6 +48,10 @@ GAFLIGHT_EXTERN arrow::flight::FlightClientOptions * gaflight_client_options_get_raw(GAFlightClientOptions *options); +GAFLIGHT_EXTERN +GAFlightDoPutResult * +gaflight_do_put_result_new_raw(arrow::flight::FlightClient::DoPutResult *flight_result); + GAFLIGHT_EXTERN std::shared_ptr gaflight_client_get_raw(GAFlightClient *client); diff --git a/c_glib/arrow-flight-glib/common.cpp b/c_glib/arrow-flight-glib/common.cpp index f7eea08c264b3..3deaf67cc14e8 100644 --- a/c_glib/arrow-flight-glib/common.cpp +++ b/c_glib/arrow-flight-glib/common.cpp @@ -1196,7 +1196,7 @@ gaflight_record_batch_reader_finalize(GObject *object) if (priv->is_owner) { delete priv->reader; 
} - G_OBJECT_CLASS(gaflight_info_parent_class)->finalize(object); + G_OBJECT_CLASS(gaflight_record_batch_reader_parent_class)->finalize(object); } static void @@ -1300,57 +1300,9 @@ gaflight_record_batch_reader_read_all(GAFlightRecordBatchReader *reader, GError } } -typedef struct GAFlightRecordBatchWriterPrivate_ -{ - arrow::flight::MetadataRecordBatchWriter *writer; - bool is_owner; -} GAFlightRecordBatchWriterPrivate; - -enum { - PROP_RECORD_BATCH_WRITER_WRITER = 1, - PROP_RECORD_BATCH_WRITER_IS_OWNER, -}; - -G_DEFINE_ABSTRACT_TYPE_WITH_PRIVATE(GAFlightRecordBatchWriter, - gaflight_record_batch_writer, - GARROW_TYPE_RECORD_BATCH_WRITER) - -#define GAFLIGHT_RECORD_BATCH_WRITER_GET_PRIVATE(object) \ - static_cast( \ - gaflight_record_batch_writer_get_instance_private( \ - GAFLIGHT_RECORD_BATCH_WRITER(object))) - -static void -gaflight_record_batch_writer_finalize(GObject *object) -{ - auto priv = GAFLIGHT_RECORD_BATCH_WRITER_GET_PRIVATE(object); - if (priv->is_owner) { - delete priv->writer; - } - G_OBJECT_CLASS(gaflight_info_parent_class)->finalize(object); -} - -static void -gaflight_record_batch_writer_set_property(GObject *object, - guint prop_id, - const GValue *value, - GParamSpec *pspec) -{ - auto priv = GAFLIGHT_RECORD_BATCH_WRITER_GET_PRIVATE(object); - - switch (prop_id) { - case PROP_RECORD_BATCH_WRITER_WRITER: - priv->writer = - static_cast(g_value_get_pointer(value)); - break; - case PROP_RECORD_BATCH_WRITER_IS_OWNER: - priv->is_owner = g_value_get_boolean(value); - break; - default: - G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); - break; - } -} +G_DEFINE_ABSTRACT_TYPE(GAFlightRecordBatchWriter, + gaflight_record_batch_writer, + GARROW_TYPE_RECORD_BATCH_WRITER) static void gaflight_record_batch_writer_init(GAFlightRecordBatchWriter *object) @@ -1360,26 +1312,6 @@ gaflight_record_batch_writer_init(GAFlightRecordBatchWriter *object) static void gaflight_record_batch_writer_class_init(GAFlightRecordBatchWriterClass *klass) { - auto gobject_class = G_OBJECT_CLASS(klass); - - gobject_class->finalize = gaflight_record_batch_writer_finalize; - gobject_class->set_property = gaflight_record_batch_writer_set_property; - - GParamSpec *spec; - spec = g_param_spec_pointer( - "writer", - nullptr, - nullptr, - static_cast(G_PARAM_WRITABLE | G_PARAM_CONSTRUCT_ONLY)); - g_object_class_install_property(gobject_class, PROP_RECORD_BATCH_WRITER_WRITER, spec); - - spec = g_param_spec_boolean( - "is-owner", - nullptr, - nullptr, - TRUE, - static_cast(G_PARAM_WRITABLE | G_PARAM_CONSTRUCT_ONLY)); - g_object_class_install_property(gobject_class, PROP_RECORD_BATCH_WRITER_IS_OWNER, spec); } /** @@ -1402,7 +1334,8 @@ gaflight_record_batch_writer_begin(GAFlightRecordBatchWriter *writer, GArrowWriteOptions *options, GError **error) { - auto flight_writer = gaflight_record_batch_writer_get_raw(writer); + auto flight_writer = std::static_pointer_cast( + garrow_record_batch_writer_get_raw(GARROW_RECORD_BATCH_WRITER(writer))); auto arrow_schema = garrow_schema_get_raw(schema); arrow::ipc::IpcWriteOptions arrow_write_options; if (options) { @@ -1432,7 +1365,8 @@ gaflight_record_batch_writer_write_metadata(GAFlightRecordBatchWriter *writer, GArrowBuffer *metadata, GError **error) { - auto flight_writer = gaflight_record_batch_writer_get_raw(writer); + auto flight_writer = std::static_pointer_cast( + garrow_record_batch_writer_get_raw(GARROW_RECORD_BATCH_WRITER(writer))); auto arrow_metadata = garrow_buffer_get_raw(metadata); return garrow::check(error, flight_writer->WriteMetadata(arrow_metadata), 
@@ -1440,7 +1374,7 @@ gaflight_record_batch_writer_write_metadata(GAFlightRecordBatchWriter *writer, } /** - * gaflight_record_batch_writer_write: + * gaflight_record_batch_writer_write_record_batch: * @writer: A #GAFlightRecordBatchWriter. * @record_batch: A #GArrowRecordBatch. * @metadata: (nullable): A #GArrowBuffer. @@ -1453,12 +1387,13 @@ gaflight_record_batch_writer_write_metadata(GAFlightRecordBatchWriter *writer, * Since: 18.0.0 */ gboolean -gaflight_record_batch_writer_write(GAFlightRecordBatchWriter *writer, - GArrowRecordBatch *record_batch, - GArrowBuffer *metadata, - GError **error) +gaflight_record_batch_writer_write_record_batch(GAFlightRecordBatchWriter *writer, + GArrowRecordBatch *record_batch, + GArrowBuffer *metadata, + GError **error) { - auto flight_writer = gaflight_record_batch_writer_get_raw(writer); + auto flight_writer = std::static_pointer_cast( + garrow_record_batch_writer_get_raw(GARROW_RECORD_BATCH_WRITER(writer))); auto arrow_record_batch = garrow_record_batch_get_raw(record_batch); auto arrow_metadata = garrow_buffer_get_raw(metadata); return garrow::check( @@ -1599,10 +1534,3 @@ gaflight_record_batch_reader_get_raw(GAFlightRecordBatchReader *reader) auto priv = GAFLIGHT_RECORD_BATCH_READER_GET_PRIVATE(reader); return priv->reader; } - -arrow::flight::MetadataRecordBatchWriter * -gaflight_record_batch_writer_get_raw(GAFlightRecordBatchWriter *writer) -{ - auto priv = GAFLIGHT_RECORD_BATCH_WRITER_GET_PRIVATE(writer); - return priv->writer; -} diff --git a/c_glib/arrow-flight-glib/common.h b/c_glib/arrow-flight-glib/common.h index 91c828caabb36..726132fe4921b 100644 --- a/c_glib/arrow-flight-glib/common.h +++ b/c_glib/arrow-flight-glib/common.h @@ -259,9 +259,9 @@ gaflight_record_batch_writer_write_metadata(GAFlightRecordBatchWriter *writer, GAFLIGHT_AVAILABLE_IN_18_0 gboolean -gaflight_record_batch_writer_write(GAFlightRecordBatchWriter *writer, - GArrowRecordBatch *record_batch, - GArrowBuffer *metadata, - GError **error); +gaflight_record_batch_writer_write_record_batch(GAFlightRecordBatchWriter *writer, + GArrowRecordBatch *record_batch, + GArrowBuffer *metadata, + GError **error); G_END_DECLS diff --git a/c_glib/arrow-glib/writer.hpp b/c_glib/arrow-glib/writer.hpp index aa87ffe77d79b..1d85ac52f88d1 100644 --- a/c_glib/arrow-glib/writer.hpp +++ b/c_glib/arrow-glib/writer.hpp @@ -25,16 +25,20 @@ #include +GARROW_AVAILABLE_IN_ALL GArrowRecordBatchWriter * garrow_record_batch_writer_new_raw( std::shared_ptr *arrow_writer); +GARROW_AVAILABLE_IN_ALL std::shared_ptr garrow_record_batch_writer_get_raw(GArrowRecordBatchWriter *writer); +GARROW_AVAILABLE_IN_ALL GArrowRecordBatchStreamWriter * garrow_record_batch_stream_writer_new_raw( std::shared_ptr *arrow_writer); +GARROW_AVAILABLE_IN_ALL GArrowRecordBatchFileWriter * garrow_record_batch_file_writer_new_raw( std::shared_ptr *arrow_writer); From b83666234c05d34c23993708160033c259b9ec26 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Cumplido?= Date: Tue, 27 Aug 2024 10:30:23 +0200 Subject: [PATCH 035/186] GH-43815: [CI][Packaging][Python] Avoid uploading wheel to gemfury if version already exists (#43816) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### What changes are included in this PR? Check whether version exists on gemfury before trying upload ### Are these changes tested? Will be tested via archery ### Are there any user-facing changes? 
No * GitHub Issue: #43815 Lead-authored-by: Raúl Cumplido Co-authored-by: Sutou Kouhei Signed-off-by: Raúl Cumplido --- dev/tasks/macros.jinja | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/dev/tasks/macros.jinja b/dev/tasks/macros.jinja index 6423ca0e9efda..df55f32222e91 100644 --- a/dev/tasks/macros.jinja +++ b/dev/tasks/macros.jinja @@ -169,10 +169,14 @@ env: - name: Upload package to Gemfury shell: bash run: | - fury push \ - --api-token=${CROSSBOW_GEMFURY_TOKEN} \ - --as=${CROSSBOW_GEMFURY_ORG} \ - {{ pattern }} + if $(fury versions --as=${CROSSBOW_GEMFURY_ORG} --api-token=${CROSSBOW_GEMFURY_TOKEN} pyarrow | grep --fixed-strings -q "{{ arrow.no_rc_version }}"); then + echo "Version {{ arrow.no_rc_version }} already exists. Avoid pushing version." + else + fury push \ + --api-token=${CROSSBOW_GEMFURY_TOKEN} \ + --as=${CROSSBOW_GEMFURY_ORG} \ + {{ pattern }} + fi env: CROSSBOW_GEMFURY_TOKEN: {{ '${{ secrets.CROSSBOW_GEMFURY_TOKEN }}' }} CROSSBOW_GEMFURY_ORG: {{ '${{ secrets.CROSSBOW_GEMFURY_ORG }}' }} From 6502f0e3ad046d361aba44385ab3379ed7af5b7f Mon Sep 17 00:00:00 2001 From: Joel Lubinitsky <33523178+joellubi@users.noreply.github.com> Date: Tue, 27 Aug 2024 13:17:39 -0400 Subject: [PATCH 036/186] GH-43790: [Go][Parquet] Add support for LZ4_RAW compression codec (#43835) ### Rationale for this change Fixes: #43790 The LZ4 compression codec for Parquet is no longer ambiguous, as it has been superceded by the [LZ4_RAW](https://github.com/apache/parquet-format/blob/master/Compression.md#lz4_raw) spec. ### What changes are included in this PR? - Add `LZ4Raw` compression codec - Split out `StreamingCodec` methods from core `Codec` interface - Various conformance/roundtrip tests - Set of benchmarks for reading/writing an Arrow table to/from Parquet, using each compression codec ### Are these changes tested? Yes ### Are there any user-facing changes? 
- New codec `LZ4Raw` is available - `Codec` interface no longer provides the following methods, which are now part of `StreamingCodec`: - `NewReader` - `NewWriter` - `NewWriterLevel` * GitHub Issue: #43790 Authored-by: Joel Lubinitsky Signed-off-by: Joel Lubinitsky --- go/parquet/compress/compress.go | 22 ++-- go/parquet/compress/compress_test.go | 8 +- go/parquet/compress/lz4_raw.go | 66 ++++++++++ go/parquet/file/file_reader_test.go | 127 +++++++++++++++++++++ go/parquet/file/file_writer_test.go | 58 ++++++++++- go/parquet/pqarrow/reader_writer_test.go | 111 ++++++++++++++++++++ 6 files changed, 380 insertions(+), 12 deletions(-) create mode 100644 go/parquet/compress/lz4_raw.go diff --git a/go/parquet/compress/compress.go b/go/parquet/compress/compress.go index b6a1349133e84..92f2ae99bb13f 100644 --- a/go/parquet/compress/compress.go +++ b/go/parquet/compress/compress.go @@ -49,8 +49,9 @@ var Codecs = struct { Brotli Compression // LZ4 unsupported in this library due to problematic issues between the Hadoop LZ4 spec vs regular lz4 // see: http://mail-archives.apache.org/mod_mbox/arrow-dev/202007.mbox/%3CCAAri41v24xuA8MGHLDvgSnE+7AAgOhiEukemW_oPNHMvfMmrWw@mail.gmail.com%3E - Lz4 Compression - Zstd Compression + Lz4 Compression + Zstd Compression + Lz4Raw Compression }{ Uncompressed: Compression(parquet.CompressionCodec_UNCOMPRESSED), Snappy: Compression(parquet.CompressionCodec_SNAPPY), @@ -59,17 +60,12 @@ var Codecs = struct { Brotli: Compression(parquet.CompressionCodec_BROTLI), Lz4: Compression(parquet.CompressionCodec_LZ4), Zstd: Compression(parquet.CompressionCodec_ZSTD), + Lz4Raw: Compression(parquet.CompressionCodec_LZ4_RAW), } // Codec is an interface which is implemented for each compression type in order to make the interactions easy to // implement. Most consumers won't be calling GetCodec directly. type Codec interface { - // NewReader provides a reader that wraps a stream with compressed data to stream the uncompressed data - NewReader(io.Reader) io.ReadCloser - // NewWriter provides a wrapper around a write stream to compress data before writing it. - NewWriter(io.Writer) io.WriteCloser - // NewWriterLevel is like NewWriter but allows specifying the compression level - NewWriterLevel(io.Writer, int) (io.WriteCloser, error) // Encode encodes a block of data given by src and returns the compressed block. dst should be either nil // or sized large enough to fit the compressed block (use CompressBound to allocate). dst and src should not // overlap since some of the compression types don't allow it. @@ -90,6 +86,16 @@ type Codec interface { Decode(dst, src []byte) []byte } +// StreamingCodec is an interface that may be implemented for compression codecs that expose a streaming API. +type StreamingCodec interface { + // NewReader provides a reader that wraps a stream with compressed data to stream the uncompressed data + NewReader(io.Reader) io.ReadCloser + // NewWriter provides a wrapper around a write stream to compress data before writing it. + NewWriter(io.Writer) io.WriteCloser + // NewWriterLevel is like NewWriter but allows specifying the compression level + NewWriterLevel(io.Writer, int) (io.WriteCloser, error) +} + var codecs = map[Compression]Codec{} // RegisterCodec adds or overrides a codec implementation for a given compression algorithm.
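To make the interface split concrete, here is a minimal usage sketch (illustrative only, not part of the patch): it assumes nothing beyond the names visible in this diff (`GetCodec`, `Codecs`, `StreamingCodec`) and mirrors the type assertion that the updated test below performs.

```go
package main

import (
	"bytes"
	"fmt"
	"io"

	"github.com/apache/arrow/go/v18/parquet/compress"
)

func main() {
	// Every codec still supports the one-shot block API (Encode/Decode).
	codec, err := compress.GetCodec(compress.Codecs.Gzip)
	if err != nil {
		panic(err)
	}

	// The streaming methods now live behind an optional interface, so
	// callers type-assert for them; a block-only codec such as Lz4Raw
	// would fail this assertion.
	sc, ok := codec.(compress.StreamingCodec)
	if !ok {
		fmt.Println("codec has no streaming API; use Encode/Decode")
		return
	}

	var buf bytes.Buffer
	w := sc.NewWriter(&buf)
	if _, err := io.WriteString(w, "hello, streaming compression"); err != nil {
		panic(err)
	}
	w.Close()
	fmt.Printf("compressed to %d bytes\n", buf.Len())
}
```

The design choice is that the one-shot `Encode`/`Decode` API stays universal while streaming becomes opt-in, which is what allows the block-only `Lz4Raw` codec to satisfy `Codec` at all.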
diff --git a/go/parquet/compress/compress_test.go b/go/parquet/compress/compress_test.go index 843062c0d024a..5aac74759e1f9 100644 --- a/go/parquet/compress/compress_test.go +++ b/go/parquet/compress/compress_test.go @@ -66,8 +66,8 @@ func TestCompressDataOneShot(t *testing.T) { {compress.Codecs.Gzip}, {compress.Codecs.Brotli}, {compress.Codecs.Zstd}, + {compress.Codecs.Lz4Raw}, // {compress.Codecs.Lzo}, - // {compress.Codecs.Lz4}, } for _, tt := range tests { @@ -107,9 +107,11 @@ func TestCompressReaderWriter(t *testing.T) { var buf bytes.Buffer codec, err := compress.GetCodec(tt.c) assert.NoError(t, err) + streamingCodec, ok := codec.(compress.StreamingCodec) + assert.True(t, ok) data := makeRandomData(RandomDataSize) - wr := codec.NewWriter(&buf) + wr := streamingCodec.NewWriter(&buf) const chunkSize = 1111 input := data @@ -129,7 +131,7 @@ func TestCompressReaderWriter(t *testing.T) { } wr.Close() - rdr := codec.NewReader(&buf) + rdr := streamingCodec.NewReader(&buf) out, err := io.ReadAll(rdr) assert.NoError(t, err) assert.Exactly(t, data, out) diff --git a/go/parquet/compress/lz4_raw.go b/go/parquet/compress/lz4_raw.go new file mode 100644 index 0000000000000..788d9520a668b --- /dev/null +++ b/go/parquet/compress/lz4_raw.go @@ -0,0 +1,66 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package compress + +import ( + "sync" + + "github.com/pierrec/lz4/v4" +) + +// lz4.Compressor is not goroutine-safe, so we use a pool to amortize the cost +// of allocating a new one for each call to Encode(). 
+var compressorPool = sync.Pool{New: func() interface{} { return new(lz4.Compressor) }} + +func compressBlock(src, dst []byte) (int, error) { + c := compressorPool.Get().(*lz4.Compressor) + defer compressorPool.Put(c) + return c.CompressBlock(src, dst) +} + +type lz4RawCodec struct{} + +func (c lz4RawCodec) Encode(dst, src []byte) []byte { + n, err := compressBlock(src, dst[:cap(dst)]) + if err != nil { + panic(err) + } + + return dst[:n] +} + +func (c lz4RawCodec) EncodeLevel(dst, src []byte, _ int) []byte { + // the lz4 block implementation does not allow level to be set + return c.Encode(dst, src) +} + +func (lz4RawCodec) Decode(dst, src []byte) []byte { + n, err := lz4.UncompressBlock(src, dst) + if err != nil { + panic(err) + } + + return dst[:n] +} + +func (c lz4RawCodec) CompressBound(len int64) int64 { + return int64(lz4.CompressBlockBound(int(len))) +} + +func init() { + RegisterCodec(Codecs.Lz4Raw, lz4RawCodec{}) +} diff --git a/go/parquet/file/file_reader_test.go b/go/parquet/file/file_reader_test.go index 547ec475c2720..35f4da4e8667c 100644 --- a/go/parquet/file/file_reader_test.go +++ b/go/parquet/file/file_reader_test.go @@ -644,3 +644,130 @@ func TestDeltaBinaryPackedMultipleBatches(t *testing.T) { require.Equalf(t, size, totalRows, "Expected %d rows, but got %d rows", size, totalRows) } + +// Test read file lz4_raw_compressed.parquet +// Contents documented at https://github.com/apache/parquet-testing/commit/ddd898958803cb89b7156c6350584d1cda0fe8de +func TestLZ4RawFileRead(t *testing.T) { + dir := os.Getenv("PARQUET_TEST_DATA") + if dir == "" { + t.Skip("no path supplied with PARQUET_TEST_DATA") + } + require.DirExists(t, dir) + + props := parquet.NewReaderProperties(memory.DefaultAllocator) + fileReader, err := file.OpenParquetFile(path.Join(dir, "lz4_raw_compressed.parquet"), + false, file.WithReadProps(props)) + require.NoError(t, err) + defer fileReader.Close() + + nRows := 4 + nCols := 3 + require.Equal(t, 1, fileReader.NumRowGroups()) + rgr := fileReader.RowGroup(0) + require.EqualValues(t, nRows, rgr.NumRows()) + require.EqualValues(t, nCols, rgr.NumColumns()) + + rdr, err := rgr.Column(0) + require.NoError(t, err) + + rowsInt64, ok := rdr.(*file.Int64ColumnChunkReader) + require.True(t, ok) + + valsInt64 := make([]int64, nRows) + total, read, err := rowsInt64.ReadBatch(int64(nRows), valsInt64, nil, nil) + require.NoError(t, err) + require.Equal(t, int64(nRows), total) + require.Equal(t, nRows, read) + + expectedValsInt64 := []int64{ + 1593604800, + 1593604800, + 1593604801, + 1593604801, + } + require.Equal(t, expectedValsInt64, valsInt64) + + rdr, err = rgr.Column(1) + require.NoError(t, err) + + rowsByteArray, ok := rdr.(*file.ByteArrayColumnChunkReader) + require.True(t, ok) + + valsByteArray := make([]parquet.ByteArray, nRows) + total, read, err = rowsByteArray.ReadBatch(int64(nRows), valsByteArray, nil, nil) + require.NoError(t, err) + require.Equal(t, int64(nRows), total) + require.Equal(t, nRows, read) + + expectedValsByteArray := []parquet.ByteArray{ + []byte("abc"), + []byte("def"), + []byte("abc"), + []byte("def"), + } + require.Equal(t, expectedValsByteArray, valsByteArray) + + rdr, err = rgr.Column(2) + require.NoError(t, err) + + rowsFloat64, ok := rdr.(*file.Float64ColumnChunkReader) + require.True(t, ok) + + valsFloat64 := make([]float64, nRows) + total, read, err = rowsFloat64.ReadBatch(int64(nRows), valsFloat64, nil, nil) + require.NoError(t, err) + require.Equal(t, int64(nRows), total) + require.Equal(t, nRows, read) + + expectedValsFloat64 := 
[]float64{ + 42.0, + 7.7, + 42.125, + 7.7, + } + require.Equal(t, expectedValsFloat64, valsFloat64) +} + +// Test read file lz4_raw_compressed_larger.parquet +// Contents documented at https://github.com/apache/parquet-testing/commit/ddd898958803cb89b7156c6350584d1cda0fe8de +func TestLZ4RawLargerFileRead(t *testing.T) { + dir := os.Getenv("PARQUET_TEST_DATA") + if dir == "" { + t.Skip("no path supplied with PARQUET_TEST_DATA") + } + require.DirExists(t, dir) + + props := parquet.NewReaderProperties(memory.DefaultAllocator) + fileReader, err := file.OpenParquetFile(path.Join(dir, "lz4_raw_compressed_larger.parquet"), + false, file.WithReadProps(props)) + require.NoError(t, err) + defer fileReader.Close() + + nRows := 10000 + nCols := 1 + require.Equal(t, 1, fileReader.NumRowGroups()) + rgr := fileReader.RowGroup(0) + require.EqualValues(t, nRows, rgr.NumRows()) + require.EqualValues(t, nCols, rgr.NumColumns()) + + rdr, err := rgr.Column(0) + require.NoError(t, err) + + rows, ok := rdr.(*file.ByteArrayColumnChunkReader) + require.True(t, ok) + + vals := make([]parquet.ByteArray, nRows) + total, read, err := rows.ReadBatch(int64(nRows), vals, nil, nil) + require.NoError(t, err) + require.Equal(t, int64(nRows), total) + require.Equal(t, nRows, read) + + expectedValsHead := []parquet.ByteArray{ + []byte("c7ce6bef-d5b0-4863-b199-8ea8c7fb117b"), + []byte("e8fb9197-cb9f-4118-b67f-fbfa65f61843"), + []byte("885136e1-0aa1-4fdb-8847-63d87b07c205"), + []byte("ce7b2019-8ebe-4906-a74d-0afa2409e5df"), + []byte("a9ee2527-821b-4b71-a926-03f73c3fc8b7"), + } + require.Equal(t, expectedValsHead, vals[:len(expectedValsHead)]) +} diff --git a/go/parquet/file/file_writer_test.go b/go/parquet/file/file_writer_test.go index 0faf3f7233bd3..12ac93d1ef4b2 100644 --- a/go/parquet/file/file_writer_test.go +++ b/go/parquet/file/file_writer_test.go @@ -260,7 +260,7 @@ func (t *SerializeTestSuite) TestSmallFile() { compress.Codecs.Brotli, compress.Codecs.Gzip, compress.Codecs.Zstd, - // compress.Codecs.Lz4, + compress.Codecs.Lz4Raw, // compress.Codecs.Lzo, } for _, c := range codecs { @@ -540,3 +540,59 @@ func TestBatchedByteStreamSplitFileRoundtrip(t *testing.T) { require.NoError(t, rdr.Close()) } + +func TestLZ4RawFileRoundtrip(t *testing.T) { + input := []int64{ + -1, 0, 1, 2, 3, 4, 5, 123456789, -123456789, + } + + size := len(input) + + field, err := schema.NewPrimitiveNodeLogical("int64", parquet.Repetitions.Required, nil, parquet.Types.Int64, 0, 1) + require.NoError(t, err) + + schema, err := schema.NewGroupNode("test", parquet.Repetitions.Required, schema.FieldList{field}, 0) + require.NoError(t, err) + + sink := encoding.NewBufferWriter(0, memory.DefaultAllocator) + writer := file.NewParquetWriter(sink, schema, file.WithWriterProps(parquet.NewWriterProperties(parquet.WithCompression(compress.Codecs.Lz4Raw)))) + + rgw := writer.AppendRowGroup() + cw, err := rgw.NextColumn() + require.NoError(t, err) + + i64ColumnWriter, ok := cw.(*file.Int64ColumnChunkWriter) + require.True(t, ok) + + nVals, err := i64ColumnWriter.WriteBatch(input, nil, nil) + require.NoError(t, err) + require.EqualValues(t, size, nVals) + + require.NoError(t, cw.Close()) + require.NoError(t, rgw.Close()) + require.NoError(t, writer.Close()) + + rdr, err := file.NewParquetReader(bytes.NewReader(sink.Bytes())) + require.NoError(t, err) + + require.Equal(t, 1, rdr.NumRowGroups()) + require.EqualValues(t, size, rdr.NumRows()) + + rgr := rdr.RowGroup(0) + cr, err := rgr.Column(0) + require.NoError(t, err) + + i64ColumnReader, ok := 
cr.(*file.Int64ColumnChunkReader) + require.True(t, ok) + + output := make([]int64, size) + + total, valuesRead, err := i64ColumnReader.ReadBatch(int64(size), output, nil, nil) + require.NoError(t, err) + require.EqualValues(t, size, total) + require.EqualValues(t, size, valuesRead) + + require.Equal(t, input, output) + + require.NoError(t, rdr.Close()) +} diff --git a/go/parquet/pqarrow/reader_writer_test.go b/go/parquet/pqarrow/reader_writer_test.go index 31bd0eba84388..e020c7d9457a9 100644 --- a/go/parquet/pqarrow/reader_writer_test.go +++ b/go/parquet/pqarrow/reader_writer_test.go @@ -19,6 +19,8 @@ package pqarrow_test import ( "bytes" "context" + "fmt" + "math" "testing" "unsafe" @@ -26,8 +28,10 @@ import ( "github.com/apache/arrow/go/v18/arrow/array" "github.com/apache/arrow/go/v18/arrow/memory" "github.com/apache/arrow/go/v18/parquet" + "github.com/apache/arrow/go/v18/parquet/compress" "github.com/apache/arrow/go/v18/parquet/file" "github.com/apache/arrow/go/v18/parquet/pqarrow" + "github.com/stretchr/testify/require" "golang.org/x/exp/rand" "gonum.org/v1/gonum/stat/distuv" ) @@ -275,3 +279,110 @@ func BenchmarkReadColumnFloat64(b *testing.B) { benchReadTable(b, tt.name, tbl, int64(arrow.Int32Traits.BytesRequired(SIZELEN))) } } + +var compressTestCases = []struct { + c compress.Compression +}{ + {compress.Codecs.Uncompressed}, + {compress.Codecs.Snappy}, + {compress.Codecs.Gzip}, + {compress.Codecs.Brotli}, + {compress.Codecs.Zstd}, + {compress.Codecs.Lz4Raw}, + // {compress.Codecs.Lzo}, +} + +func buildTableForTest(mem memory.Allocator) arrow.Table { + schema := arrow.NewSchema( + []arrow.Field{ + {Name: "int64s", Type: arrow.PrimitiveTypes.Int64}, + {Name: "strings", Type: arrow.BinaryTypes.String}, + {Name: "bools", Type: arrow.FixedWidthTypes.Boolean}, + {Name: "repeated_int64s", Type: arrow.PrimitiveTypes.Int64}, + {Name: "repeated_strings", Type: arrow.BinaryTypes.String}, + {Name: "repeated_bools", Type: arrow.FixedWidthTypes.Boolean}, + }, + nil, + ) + bldr := array.NewRecordBuilder(mem, schema) + defer bldr.Release() + + for i := 0; i < SIZELEN; i++ { + bldr.Field(0).(*array.Int64Builder).Append(int64(i)) + bldr.Field(1).(*array.StringBuilder).Append(fmt.Sprint(i)) + bldr.Field(2).(*array.BooleanBuilder).Append(i%2 == 0) + bldr.Field(3).(*array.Int64Builder).Append(0) + bldr.Field(4).(*array.StringBuilder).Append("the string is the same") + bldr.Field(5).(*array.BooleanBuilder).Append(true) + } + + rec := bldr.NewRecord() + return array.NewTableFromRecords(schema, []arrow.Record{rec}) +} + +func BenchmarkWriteTableCompressed(b *testing.B) { + mem := memory.DefaultAllocator + table := buildTableForTest(mem) + defer table.Release() + + var uncompressedSize uint64 + for idxCol := 0; int64(idxCol) < table.NumCols(); idxCol++ { + column := table.Column(idxCol) + for _, chunk := range column.Data().Chunks() { + uncompressedSize += chunk.Data().SizeInBytes() + } + } + + var buf bytes.Buffer + buf.Grow(int(uncompressedSize)) + for _, tc := range compressTestCases { + b.Run(fmt.Sprintf("codec=%s", tc.c), func(b *testing.B) { + buf.Reset() + b.ResetTimer() + b.SetBytes(int64(uncompressedSize)) + for n := 0; n < b.N; n++ { + require.NoError(b, + pqarrow.WriteTable( + table, + &buf, + math.MaxInt64, + parquet.NewWriterProperties(parquet.WithAllocator(mem), parquet.WithCompression(tc.c)), + pqarrow.DefaultWriterProps(), + ), + ) + } + }) + } +} + +func BenchmarkReadTableCompressed(b *testing.B) { + ctx := context.Background() + mem := memory.DefaultAllocator + table := 
buildTableForTest(mem) + defer table.Release() + + for _, tc := range compressTestCases { + b.Run(fmt.Sprintf("codec=%s", tc.c), func(b *testing.B) { + var buf bytes.Buffer + err := pqarrow.WriteTable( + table, + &buf, + math.MaxInt64, + parquet.NewWriterProperties(parquet.WithAllocator(mem), parquet.WithCompression(tc.c)), + pqarrow.DefaultWriterProps(), + ) + require.NoError(b, err) + + compressedBytes := buf.Len() + rdr := bytes.NewReader(buf.Bytes()) + + b.ResetTimer() + b.SetBytes(int64(compressedBytes)) + for n := 0; n < b.N; n++ { + tab, err := pqarrow.ReadTable(ctx, rdr, nil, pqarrow.ArrowReadProperties{}, mem) + require.NoError(b, err) + defer tab.Release() + } + }) + } +} From ce1e724d7ea292746ede6a538519658f1ecab849 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Tue, 27 Aug 2024 19:17:55 +0200 Subject: [PATCH 037/186] MINOR: [CI] Use `docker compose` on self-hosted ARM builds (#43844) ### Rationale for this change The Docker client version on the ARM64 self-hosted runners is now recent enough, so we don't need to use `docker-compose` there anymore. Authored-by: Antoine Pitrou Signed-off-by: Antoine Pitrou --- .github/workflows/cpp.yml | 5 +---- .github/workflows/go.yml | 5 ----- dev/tasks/java-jars/github.yml | 2 -- dev/tasks/linux-packages/github.linux.yml | 1 - dev/tasks/python-wheels/github.linux.yml | 1 - 5 files changed, 1 insertion(+), 13 deletions(-) diff --git a/.github/workflows/cpp.yml b/.github/workflows/cpp.yml index a82e1eb76660b..c5482f730823b 100644 --- a/.github/workflows/cpp.yml +++ b/.github/workflows/cpp.yml @@ -99,7 +99,6 @@ jobs: cat <> "$GITHUB_OUTPUT" { "arch": "arm64v8", - "archery-use-legacy-docker-compose": "1", "clang-tools": "10", "image": "ubuntu-cpp", "llvm": "10", @@ -124,9 +123,6 @@ jobs: include: ${{ fromJson(needs.docker-targets.outputs.targets) }} env: ARCH: ${{ matrix.arch }} - # By default, use `docker compose` because docker-compose v1 is obsolete, - # except where the Docker client version is too old. - ARCHERY_USE_LEGACY_DOCKER_COMPOSE: ${{ matrix.archery-use-legacy-docker-compose || '0' }} ARROW_SIMD_LEVEL: ${{ matrix.simd-level }} CLANG_TOOLS: ${{ matrix.clang-tools }} LLVM: ${{ matrix.llvm }} @@ -147,6 +143,7 @@ jobs: run: | sudo apt update sudo apt install -y --no-install-recommends python3 python3-dev python3-pip + python3 -m pip install -U pip - name: Setup Archery run: python3 -m pip install -e dev/archery[docker] - name: Execute Docker Build diff --git a/.github/workflows/go.yml b/.github/workflows/go.yml index 20c78d86cb2a3..ffd543691d5b2 100644 --- a/.github/workflows/go.yml +++ b/.github/workflows/go.yml @@ -78,14 +78,12 @@ jobs: { "arch-label": "ARM64", "arch": "arm64v8", - "archery-use-legacy-docker-compose": "1", "go": "1.21", "runs-on": ["self-hosted", "arm", "linux"] }, { "arch-label": "ARM64", "arch": "arm64v8", - "archery-use-legacy-docker-compose": "1", "go": "1.22", "runs-on": ["self-hosted", "arm", "linux"] } @@ -106,9 +104,6 @@ jobs: include: ${{ fromJson(needs.docker-targets.outputs.targets) }} env: ARCH: ${{ matrix.arch }} - # By default, use Docker CLI because docker-compose v1 is obsolete, - # except where the Docker client version is too old. 
- ARCHERY_USE_LEGACY_DOCKER_COMPOSE: ${{ matrix.archery-use-legacy-docker-compose || '0' }} GO: ${{ matrix.go }} steps: - name: Checkout Arrow diff --git a/dev/tasks/java-jars/github.yml b/dev/tasks/java-jars/github.yml index 7cbd5f05dab4a..bdbed1bd678e6 100644 --- a/dev/tasks/java-jars/github.yml +++ b/dev/tasks/java-jars/github.yml @@ -30,7 +30,6 @@ jobs: ARCH: {{ '${{ matrix.platform.archery_arch }}' }} ARCH_ALIAS: {{ '${{ matrix.platform.archery_arch_alias }}' }} ARCH_SHORT: {{ '${{ matrix.platform.archery_arch_short }}' }} - ARCHERY_USE_LEGACY_DOCKER_COMPOSE: {{ "${{matrix.platform.archery_use_legacy_docker_compose || '0'}}" }} strategy: fail-fast: false matrix: @@ -45,7 +44,6 @@ jobs: archery_arch: "arm64v8" archery_arch_alias: "aarch64" archery_arch_short: "arm64" - archery_use_legacy_docker_compose: "1" steps: {{ macros.github_checkout_arrow()|indent }} {{ macros.github_free_space()|indent }} diff --git a/dev/tasks/linux-packages/github.linux.yml b/dev/tasks/linux-packages/github.linux.yml index 4bf2295ef3e95..cce976cd60e4e 100644 --- a/dev/tasks/linux-packages/github.linux.yml +++ b/dev/tasks/linux-packages/github.linux.yml @@ -29,7 +29,6 @@ jobs: {% endif %} env: ARCHITECTURE: {{ architecture }} - ARCHERY_USE_LEGACY_DOCKER_COMPOSE: {{ '1' if architecture == 'arm64' else '0' }} steps: {{ macros.github_checkout_arrow()|indent }} {{ macros.github_login_dockerhub()|indent }} diff --git a/dev/tasks/python-wheels/github.linux.yml b/dev/tasks/python-wheels/github.linux.yml index 2854d4349fb7c..97746ba3f9b8b 100644 --- a/dev/tasks/python-wheels/github.linux.yml +++ b/dev/tasks/python-wheels/github.linux.yml @@ -33,7 +33,6 @@ jobs: ARCH: amd64 {% else %} ARCH: arm64v8 - ARCHERY_USE_LEGACY_DOCKER_COMPOSE: 1 {% endif %} PYTHON: "{{ python_version }}" {% if python_version == "3.13" %} From 75ca5b3631144f58ea3edbe6b4933a686c0e0fd9 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Wed, 28 Aug 2024 05:47:43 +0900 Subject: [PATCH 038/186] GH-43805: [C++] Enable filesystem automatically when one of ARROW_{AZURE,GCS,HDFS,S3}=ON is specified (#43806) ### Rationale for this change `ARROW_{AZURE,GCS,HDFS,S3}=ON` are meaningful only when filesystem is enabled. If the user specifies one of them, we can assume that the user wants to enable filesystem. ### What changes are included in this PR? Enable `ARROW_FILESYSTEM` when one of `ARROW_{AZURE,GCS,HDFS,S3}=ON` is specified. ### Are these changes tested? Yes. ### Are there any user-facing changes? Yes. `ARROW_FILESYSTEM` is enabled automatically with one of `ARROW_{AZURE,GCS,HDFS,S3}=ON`.
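As an illustration of the resulting behavior (the configure command below is a hypothetical example; only the option names come from this patch):

```sh
# Hypothetical configure run: requesting S3 support alone now suffices,
# because the new DEPENDS declarations turn on ARROW_FILESYSTEM automatically.
cmake -S cpp -B build -DARROW_S3=ON
# Before this change, -DARROW_FILESYSTEM=ON also had to be passed explicitly.
```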
* GitHub Issue: #43805 Authored-by: Sutou Kouhei Signed-off-by: Sutou Kouhei --- cpp/cmake_modules/DefineOptions.cmake | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/cpp/cmake_modules/DefineOptions.cmake b/cpp/cmake_modules/DefineOptions.cmake index 41466a1c22404..755887314d110 100644 --- a/cpp/cmake_modules/DefineOptions.cmake +++ b/cpp/cmake_modules/DefineOptions.cmake @@ -303,7 +303,10 @@ takes precedence over ccache if a storage backend is configured" ON) ARROW_IPC) define_option(ARROW_AZURE - "Build Arrow with Azure support (requires the Azure SDK for C++)" OFF) + "Build Arrow with Azure support (requires the Azure SDK for C++)" + OFF + DEPENDS + ARROW_FILESYSTEM) define_option(ARROW_BUILD_UTILITIES "Build Arrow commandline utilities" OFF) @@ -346,9 +349,16 @@ takes precedence over ccache if a storage backend is configured" ON) ARROW_WITH_UTF8PROC) define_option(ARROW_GCS - "Build Arrow with GCS support (requires the GCloud SDK for C++)" OFF) + "Build Arrow with GCS support (requires the GCloud SDK for C++)" + OFF + DEPENDS + ARROW_FILESYSTEM) - define_option(ARROW_HDFS "Build the Arrow HDFS bridge" OFF) + define_option(ARROW_HDFS + "Build the Arrow HDFS bridge" + OFF + DEPENDS + ARROW_FILESYSTEM) define_option(ARROW_IPC "Build the Arrow IPC extensions" ON) @@ -398,7 +408,11 @@ takes precedence over ccache if a storage backend is configured" ON) ARROW_HDFS ARROW_JSON) - define_option(ARROW_S3 "Build Arrow with S3 support (requires the AWS SDK for C++)" OFF) + define_option(ARROW_S3 + "Build Arrow with S3 support (requires the AWS SDK for C++)" + OFF + DEPENDS + ARROW_FILESYSTEM) define_option(ARROW_SKYHOOK "Build the Skyhook libraries" From 09bb24a5cdf5b6e73334e9a8b521f0188d940c73 Mon Sep 17 00:00:00 2001 From: Vibhatha Lakmal Abeykoon Date: Wed, 28 Aug 2024 06:13:31 +0530 Subject: [PATCH 039/186] MINOR: [Java] Logback dependency upgrade (#43842) ### Rationale for this change Fusing https://github.com/apache/arrow/pull/43752 and https://github.com/apache/arrow/pull/43827 dependabot PRs into a single PR. ### What changes are included in this PR? Keeping a single version for both `logback-classic` and `logback-core`. ### Are these changes tested? N/A ### Are there any user-facing changes? No Authored-by: Vibhatha Lakmal Abeykoon Signed-off-by: David Li --- java/memory/memory-netty/pom.xml | 1 - java/pom.xml | 13 ++++++++++++- java/tools/pom.xml | 1 - 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/java/memory/memory-netty/pom.xml b/java/memory/memory-netty/pom.xml index f2d4d2d0fe3bc..6cf573dd4d381 100644 --- a/java/memory/memory-netty/pom.xml +++ b/java/memory/memory-netty/pom.xml @@ -56,7 +56,6 @@ under the License. ch.qos.logback logback-core - 1.3.14 test diff --git a/java/pom.xml b/java/pom.xml index f78d02c0c650f..577f23e6a719c 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -111,6 +111,7 @@ under the License. 5.11.0 5.2.0 3.46.0 + 1.5.7 none -Xdoclint:none @@ -221,6 +222,16 @@ under the License. pom import + + ch.qos.logback + logback-classic + ${logback.version} + + + ch.qos.logback + logback-core + ${logback.version} + @@ -274,7 +285,7 @@ under the License. ch.qos.logback logback-classic - 1.4.14 + ${logback.version} test diff --git a/java/tools/pom.xml b/java/tools/pom.xml index 94566495dff19..082f06860c61b 100644 --- a/java/tools/pom.xml +++ b/java/tools/pom.xml @@ -59,7 +59,6 @@ under the License. 
ch.qos.logback logback-classic - 1.4.14 test From a87a8e0efe1650b01ac85f7a7331ccfcffc088a2 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 10 Sep 2024 14:12:58 -0400 Subject: [PATCH 115/186] MINOR: [Java] Bump io.netty:netty-bom from 4.1.112.Final to 4.1.113.Final in /java (#44022) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bumps [io.netty:netty-bom](https://github.com/netty/netty) from 4.1.112.Final to 4.1.113.Final.
Commits
  • d0a109e [maven-release-plugin] prepare release netty-4.1.113.Final
  • e1d6384 Cleanup fields on AdaptiveByteBuf::deallocate (#14273)
  • 8a02f45 Upload hidden files for staging (#14275)
  • c0fdb8e adjust continuation frame header length (#14245)
  • 95d86bb chore: clean code DefaultChannelPipeline add method (#14249)
  • 1c1da9f Fix netty-all artifact snapshot deployments (#14264)
  • 235eb6f Upgrade to netty-tcnative 2.0.66.Final (#14254)
  • ceade95 Ensure flushes are not discarded by ChunkedWriteHandler for passed th… (#14248)
  • dc30c33 Add new SslHandler.isEncrypted(...) variant that will not produce fal… (#14243)
  • 31d1592 Remove reference to parent in recycled buffers for leak detection (#14250)
  • Additional commits viewable in compare view

Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: Dane Pitkin --- java/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/java/pom.xml b/java/pom.xml index 1c68fde535879..1e22b6b973b9f 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -96,7 +96,7 @@ under the License. 5.11.0 2.0.16 33.2.1-jre - 4.1.112.Final + 4.1.113.Final 1.65.0 3.25.4 2.17.2 From 44b72d5c2518b7dc70b67b588432fb06ea3896c7 Mon Sep 17 00:00:00 2001 From: larry98 Date: Tue, 10 Sep 2024 15:08:00 -0400 Subject: [PATCH 116/186] GH-43187: [C++] Support basic is_in predicate simplification (#43761) ### Rationale for this change Prior to https://github.com/apache/arrow/pull/43256, this PR adds a basic implementation that does a linear scan filter over the value set on each guarantee. This isolates the correctness/semantics of `is_in` predicate simplification from the binary search performance optimization. ### What changes are included in this PR? `SimplifyWithGuarantee` now handles `is_in` expressions. ### Are these changes tested? A new unit test was added to arrow-compute-expression-test testing this change. ### Are there any user-facing changes? No. * GitHub Issue: #43187 Lead-authored-by: Larry Wang Co-authored-by: larry98 Co-authored-by: Benjamin Kietzman Signed-off-by: Benjamin Kietzman --- cpp/src/arrow/compute/expression.cc | 73 ++++++++++ cpp/src/arrow/compute/expression_test.cc | 173 +++++++++++++++++++++++ 2 files changed, 246 insertions(+) diff --git a/cpp/src/arrow/compute/expression.cc b/cpp/src/arrow/compute/expression.cc index 33e5928c2865d..12fda5d58f3bf 100644 --- a/cpp/src/arrow/compute/expression.cc +++ b/cpp/src/arrow/compute/expression.cc @@ -23,6 +23,7 @@ #include #include "arrow/chunked_array.h" +#include "arrow/compute/api_aggregate.h" #include "arrow/compute/api_vector.h" #include "arrow/compute/exec_internal.h" #include "arrow/compute/expression_internal.h" @@ -1242,6 +1243,72 @@ struct Inequality { /*insert_implicit_casts=*/false, &exec_context); } + /// Simplify an `is_in` call against an inequality guarantee. + /// + /// We avoid the complexity of fully simplifying EQUAL comparisons to true + /// literals (e.g., 'x is_in [1, 2, 3]' given the guarantee 'x = 2') due to + /// potential complications with null matching behavior. This is ok for the + /// predicate pushdown use case because the overall aim is to simplify to an + /// unsatisfiable expression. + /// + /// \pre `is_in_call` is a call to the `is_in` function + /// \return a simplified expression, or nullopt if no simplification occurred + static Result> SimplifyIsIn( + const Inequality& guarantee, const Expression::Call* is_in_call) { + DCHECK_EQ(is_in_call->function_name, "is_in"); + + auto options = checked_pointer_cast(is_in_call->options); + + const auto& lhs = Comparison::StripOrderPreservingCasts(is_in_call->arguments[0]); + if (!lhs.field_ref()) return std::nullopt; + if (*lhs.field_ref() != guarantee.target) return std::nullopt; + + FilterOptions::NullSelectionBehavior null_selection; + switch (options->null_matching_behavior) { + case SetLookupOptions::MATCH: + null_selection = + guarantee.nullable ? 
FilterOptions::EMIT_NULL : FilterOptions::DROP; + break; + case SetLookupOptions::SKIP: + null_selection = FilterOptions::DROP; + break; + case SetLookupOptions::EMIT_NULL: + if (guarantee.nullable) return std::nullopt; + null_selection = FilterOptions::DROP; + break; + case SetLookupOptions::INCONCLUSIVE: + if (guarantee.nullable) return std::nullopt; + ARROW_ASSIGN_OR_RAISE(Datum is_null, IsNull(options->value_set)); + ARROW_ASSIGN_OR_RAISE(Datum any_null, Any(is_null)); + if (any_null.scalar_as().value) return std::nullopt; + null_selection = FilterOptions::DROP; + break; + } + + std::string func_name = Comparison::GetName(guarantee.cmp); + DCHECK_NE(func_name, "na"); + std::vector args{options->value_set, guarantee.bound}; + ARROW_ASSIGN_OR_RAISE(Datum filter_mask, CallFunction(func_name, args)); + FilterOptions filter_options(null_selection); + ARROW_ASSIGN_OR_RAISE(Datum simplified_value_set, + Filter(options->value_set, filter_mask, filter_options)); + + if (simplified_value_set.length() == 0) return literal(false); + if (simplified_value_set.length() == options->value_set.length()) return std::nullopt; + + ExecContext exec_context; + Expression::Call simplified_call; + simplified_call.function_name = "is_in"; + simplified_call.arguments = is_in_call->arguments; + simplified_call.options = std::make_shared( + simplified_value_set, options->null_matching_behavior); + ARROW_ASSIGN_OR_RAISE( + Expression simplified_expr, + BindNonRecursive(std::move(simplified_call), + /*insert_implicit_casts=*/false, &exec_context)); + return simplified_expr; + } + /// \brief Simplify the given expression given this inequality as a guarantee. Result Simplify(Expression expr) { const auto& guarantee = *this; @@ -1258,6 +1325,12 @@ struct Inequality { return call->function_name == "is_valid" ? 
literal(true) : literal(false); } + if (call->function_name == "is_in") { + ARROW_ASSIGN_OR_RAISE(std::optional result, + SimplifyIsIn(guarantee, call)); + return result.value_or(expr); + } + auto cmp = Comparison::Get(expr); if (!cmp) return expr; diff --git a/cpp/src/arrow/compute/expression_test.cc b/cpp/src/arrow/compute/expression_test.cc index d94a17b6ffadf..0b7e8a9c23b13 100644 --- a/cpp/src/arrow/compute/expression_test.cc +++ b/cpp/src/arrow/compute/expression_test.cc @@ -27,6 +27,7 @@ #include #include +#include "arrow/array/builder_primitive.h" #include "arrow/compute/expression_internal.h" #include "arrow/compute/function_internal.h" #include "arrow/compute/registry.h" @@ -1616,6 +1617,144 @@ TEST(Expression, SimplifyWithComparisonAndNullableCaveat) { true_unless_null(field_ref("i32")))); // not satisfiable, will drop row group } +TEST(Expression, SimplifyIsIn) { + auto is_in = [](Expression field, std::shared_ptr value_set_type, + std::string json_array, + SetLookupOptions::NullMatchingBehavior null_matching_behavior) { + SetLookupOptions options{ArrayFromJSON(value_set_type, json_array), + null_matching_behavior}; + return call("is_in", {field}, options); + }; + + for (SetLookupOptions::NullMatchingBehavior null_matching : { + SetLookupOptions::MATCH, + SetLookupOptions::SKIP, + SetLookupOptions::EMIT_NULL, + SetLookupOptions::INCONCLUSIVE, + }) { + Simplify{is_in(field_ref("i32"), int32(), "[]", null_matching)} + .WithGuarantee(greater(field_ref("i32"), literal(2))) + .Expect(false); + + Simplify{is_in(field_ref("i32"), int32(), "[1,3,5,7,9]", null_matching)} + .WithGuarantee(equal(field_ref("i32"), literal(6))) + .Expect(false); + + Simplify{is_in(field_ref("i32"), int32(), "[1,3,5,7,9]", null_matching)} + .WithGuarantee(greater(field_ref("i32"), literal(3))) + .Expect(is_in(field_ref("i32"), int32(), "[5,7,9]", null_matching)); + + Simplify{is_in(field_ref("i32"), int32(), "[1,3,5,7,9]", null_matching)} + .WithGuarantee(greater(field_ref("i32"), literal(9))) + .Expect(false); + + Simplify{is_in(field_ref("i32"), int32(), "[1,3,5,7,9]", null_matching)} + .WithGuarantee(less_equal(field_ref("i32"), literal(0))) + .Expect(false); + + Simplify{is_in(field_ref("i32"), int32(), "[1,3,5,7,9]", null_matching)} + .WithGuarantee(greater(field_ref("i32"), literal(0))) + .ExpectUnchanged(); + + Simplify{is_in(field_ref("i32"), int32(), "[1,3,5,7,9]", null_matching)} + .WithGuarantee(less_equal(field_ref("i32"), literal(9))) + .ExpectUnchanged(); + + Simplify{is_in(field_ref("i32"), int32(), "[1,3,5,7,9]", null_matching)} + .WithGuarantee(and_(less_equal(field_ref("i32"), literal(7)), + greater(field_ref("i32"), literal(4)))) + .Expect(is_in(field_ref("i32"), int32(), "[5,7]", null_matching)); + + Simplify{is_in(field_ref("u32"), int8(), "[1,3,5,7,9]", null_matching)} + .WithGuarantee(greater(field_ref("u32"), literal(3))) + .Expect(is_in(field_ref("u32"), int8(), "[5,7,9]", null_matching)); + + Simplify{is_in(field_ref("u32"), int64(), "[1,3,5,7,9]", null_matching)} + .WithGuarantee(greater(field_ref("u32"), literal(3))) + .Expect(is_in(field_ref("u32"), int64(), "[5,7,9]", null_matching)); + } + + Simplify{ + is_in(field_ref("i32"), int32(), "[1,2,3]", SetLookupOptions::MATCH), + } + .WithGuarantee( + or_(greater(field_ref("i32"), literal(2)), is_null(field_ref("i32")))) + .Expect(is_in(field_ref("i32"), int32(), "[3]", SetLookupOptions::MATCH)); + + Simplify{ + is_in(field_ref("i32"), int32(), "[1,2,3,null]", SetLookupOptions::MATCH), + } + .WithGuarantee(greater(field_ref("i32"), 
literal(2))) + .Expect(is_in(field_ref("i32"), int32(), "[3]", SetLookupOptions::MATCH)); + + Simplify{ + is_in(field_ref("i32"), int32(), "[1,2,3,null]", SetLookupOptions::MATCH), + } + .WithGuarantee( + or_(greater(field_ref("i32"), literal(2)), is_null(field_ref("i32")))) + .Expect(is_in(field_ref("i32"), int32(), "[3,null]", SetLookupOptions::MATCH)); + + Simplify{ + is_in(field_ref("i32"), int32(), "[1,2,3]", SetLookupOptions::SKIP), + } + .WithGuarantee( + or_(greater(field_ref("i32"), literal(2)), is_null(field_ref("i32")))) + .Expect(is_in(field_ref("i32"), int32(), "[3]", SetLookupOptions::SKIP)); + + Simplify{ + is_in(field_ref("i32"), int32(), "[1,2,3,null]", SetLookupOptions::SKIP), + } + .WithGuarantee(greater(field_ref("i32"), literal(2))) + .Expect(is_in(field_ref("i32"), int32(), "[3]", SetLookupOptions::SKIP)); + + Simplify{ + is_in(field_ref("i32"), int32(), "[1,2,3,null]", SetLookupOptions::SKIP), + } + .WithGuarantee( + or_(greater(field_ref("i32"), literal(2)), is_null(field_ref("i32")))) + .Expect(is_in(field_ref("i32"), int32(), "[3]", SetLookupOptions::SKIP)); + + Simplify{ + is_in(field_ref("i32"), int32(), "[1,2,3]", SetLookupOptions::EMIT_NULL), + } + .WithGuarantee( + or_(greater(field_ref("i32"), literal(2)), is_null(field_ref("i32")))) + .ExpectUnchanged(); + + Simplify{ + is_in(field_ref("i32"), int32(), "[1,2,3,null]", SetLookupOptions::EMIT_NULL), + } + .WithGuarantee(greater(field_ref("i32"), literal(2))) + .Expect(is_in(field_ref("i32"), int32(), "[3]", SetLookupOptions::EMIT_NULL)); + + Simplify{ + is_in(field_ref("i32"), int32(), "[1,2,3,null]", SetLookupOptions::EMIT_NULL), + } + .WithGuarantee( + or_(greater(field_ref("i32"), literal(2)), is_null(field_ref("i32")))) + .ExpectUnchanged(); + + Simplify{ + is_in(field_ref("i32"), int32(), "[1,2,3]", SetLookupOptions::INCONCLUSIVE), + } + .WithGuarantee( + or_(greater(field_ref("i32"), literal(2)), is_null(field_ref("i32")))) + .ExpectUnchanged(); + + Simplify{ + is_in(field_ref("i32"), int32(), "[1,2,3,null]", SetLookupOptions::INCONCLUSIVE), + } + .WithGuarantee(greater(field_ref("i32"), literal(2))) + .ExpectUnchanged(); + + Simplify{ + is_in(field_ref("i32"), int32(), "[1,2,3,null]", SetLookupOptions::INCONCLUSIVE), + } + .WithGuarantee( + or_(greater(field_ref("i32"), literal(2)), is_null(field_ref("i32")))) + .ExpectUnchanged(); +} + TEST(Expression, SimplifyThenExecute) { auto filter = or_({equal(field_ref("f32"), literal(0)), @@ -1643,6 +1782,40 @@ TEST(Expression, SimplifyThenExecute) { AssertDatumsEqual(evaluated, simplified_evaluated, /*verbose=*/true); } +TEST(Expression, SimplifyIsInThenExecute) { + auto input = RecordBatchFromJSON(kBoringSchema, R"([ + {"i64": 2, "i32": 5}, + {"i64": 5, "i32": 6}, + {"i64": 3, "i32": 6}, + {"i64": 3, "i32": 5}, + {"i64": 4, "i32": 5}, + {"i64": 2, "i32": 7}, + {"i64": 5, "i32": 5} + ])"); + + std::vector guarantees{greater(field_ref("i64"), literal(1)), + greater_equal(field_ref("i32"), literal(5)), + less_equal(field_ref("i64"), literal(5))}; + + for (const Expression& guarantee : guarantees) { + auto filter = + call("is_in", {guarantee.call()->arguments[0]}, + compute::SetLookupOptions{ArrayFromJSON(int32(), "[1,2,3]"), true}); + ASSERT_OK_AND_ASSIGN(filter, filter.Bind(*kBoringSchema)); + ASSERT_OK_AND_ASSIGN(auto simplified, SimplifyWithGuarantee(filter, guarantee)); + + Datum evaluated, simplified_evaluated; + ExpectExecute(filter, input, &evaluated); + ExpectExecute(simplified, input, &simplified_evaluated); + if (simplified_evaluated.is_scalar()) { + 
ASSERT_OK_AND_ASSIGN( + simplified_evaluated, + MakeArrayFromScalar(*simplified_evaluated.scalar(), evaluated.length())); + } + AssertDatumsEqual(evaluated, simplified_evaluated, /*verbose=*/true); + } +} + TEST(Expression, Filter) { auto ExpectFilter = [](Expression filter, std::string batch_json) { ASSERT_OK_AND_ASSIGN(auto s, kBoringSchema->AddField(0, field("in", boolean()))); From c4b68ee561cd3d2363f83f00b4085145bf3e3807 Mon Sep 17 00:00:00 2001 From: Matt Topol Date: Tue, 10 Sep 2024 15:51:53 -0400 Subject: [PATCH 117/186] GH-43956: [Format] Allow Decimal32/Decimal64 in format (#43976) ### Rationale for this change Widening the Decimal128/256 type to allow for bitwidths of 32 and 64 allows for more interoperability with other libraries and utilities which already support these types. This provides even more opportunities for zero-copy interactions between things such as libcudf and various databases. ### What changes are included in this PR? Updating the documentation in Schema.fbs to explicitly state that 32-bit and 64-bit are now allowed for bitwidths of Decimal types. This is the only area in the spec that mentions the allowed decimal bitwidths. * GitHub Issue: #43956 --------- Co-authored-by: Antoine Pitrou --- docs/source/format/Columnar.rst | 2 +- docs/source/format/Versioning.rst | 5 +++++ format/Schema.fbs | 9 +++++---- 3 files changed, 11 insertions(+), 5 deletions(-) diff --git a/docs/source/format/Columnar.rst b/docs/source/format/Columnar.rst index c5f822f41643f..4bd937d760d59 100644 --- a/docs/source/format/Columnar.rst +++ b/docs/source/format/Columnar.rst @@ -21,7 +21,7 @@ Arrow Columnar Format ********************* -*Version: 1.4* +*Version: 1.5* .. seealso:: :ref:`Additions to the Arrow columnar format since version 1.0.0 ` diff --git a/docs/source/format/Versioning.rst b/docs/source/format/Versioning.rst index 8fcf11b21f0cc..d46d07a90906c 100644 --- a/docs/source/format/Versioning.rst +++ b/docs/source/format/Versioning.rst @@ -105,3 +105,8 @@ Version 1.4 * Added :ref:`listview-layout` and the associated ListView and LargeListView types. * Added :ref:`variadic-buffers`. + +Version 1.5 +----------- + +* Expanded Decimal type bit widths to allow 32-bit and 64-bit types. diff --git a/format/Schema.fbs b/format/Schema.fbs index a03ca31ae97c4..e8e14b112a771 100644 --- a/format/Schema.fbs +++ b/format/Schema.fbs @@ -24,6 +24,7 @@ /// Version 1.3 - Add Run-End Encoded. /// Version 1.4 - Add BinaryView, Utf8View, variadicBufferCounts, ListView, and /// LargeListView. +/// Version 1.5 - Add 32-bit and 64-bit as allowed bit widths for Decimal namespace org.apache.arrow.flatbuf; @@ -222,9 +223,9 @@ table RunEndEncoded { } /// Exact decimal value represented as an integer value in two's -/// complement. Currently only 128-bit (16-byte) and 256-bit (32-byte) integers -/// are used. The representation uses the endianness indicated -/// in the Schema. +/// complement. Currently 32-bit (4-byte), 64-bit (8-byte), +/// 128-bit (16-byte) and 256-bit (32-byte) integers are used. +/// The representation uses the endianness indicated in the Schema. table Decimal { /// Total number of decimal digits precision: int; @@ -232,7 +233,7 @@ table Decimal { /// Number of digits after the decimal point "." scale: int; - /// Number of bits per value. The only accepted widths are 128 and 256. + /// Number of bits per value. The accepted widths are 32, 64, 128 and 256. /// We use bitWidth for consistency with Int::bitWidth.
bitWidth: int = 128; } From 199138ee561e833bd466a965696839c3b9e931cf Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 10 Sep 2024 16:06:14 -0400 Subject: [PATCH 118/186] MINOR: [Java] Bump com.google.guava:guava-bom from 33.2.1-jre to 33.3.0-jre in /java (#43750) Bumps [com.google.guava:guava-bom](https://github.com/google/guava) from 33.2.1-jre to 33.3.0-jre.
Release notes

Sourced from com.google.guava:guava-bom's releases.

33.3.0

Maven

<dependency>
  <groupId>com.google.guava</groupId>
  <artifactId>guava</artifactId>
  <version>33.3.0-jre</version>
  <!-- or, for Android: -->
  <version>33.3.0-android</version>
</dependency>

Jar files

Guava requires one runtime dependency, which you can download here:

Javadoc

JDiff

Changelog

  • base: Removed @ Beta from the Duration overload of Suppliers.memoizeWithExpiration. (76fca99db95ce9c8e55bb9c37fd0e44ef0451a80)
  • cache: Added CacheBuilder Duration overloads to guava-android. (a5f9bcafd6)
  • collect: Removed @ Beta from the guava-android Collector APIs. (c86c09dc3d)
  • collect: Added ImmutableMultimap.builderWithExpectedKeys and ImmutableMultimap.Builder.expectedValuesPerKey. (c3d5b17dc2)
  • graph: Improved Graphs.hasCycle to avoid causing StackOverflowError for long paths. (63734b9dfc)
  • net: Added text/markdown to MediaType. (2466a099ae)
  • net: Deprecated HttpHeaders constant for Sec-Ch-UA-Form-Factor in favor of Sec-Ch-UA-Form-Factors to follow the latest spec. (b310b7e1ee)
  • testing: Changed some test libraries to throw AssertionError (instead of the more specific AssertionFailedError) in some cases. (fdfbed1985)
Commits

Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: Dane Pitkin --- java/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/java/pom.xml b/java/pom.xml index 1e22b6b973b9f..02ec57a5032df 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -95,7 +95,7 @@ under the License. 1.9.0 5.11.0 2.0.16 - 33.2.1-jre + 33.3.0-jre 4.1.113.Final 1.65.0 3.25.4 From ca911ab0058947c955ce53bc9f6346b5ee95a94a Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 10 Sep 2024 16:06:45 -0400 Subject: [PATCH 119/186] MINOR: [Java] Bump checker.framework.version from 3.46.0 to 3.47.0 in /java (#44021) Bumps `checker.framework.version` from 3.46.0 to 3.47.0. Updates `org.checkerframework:checker-qual` from 3.46.0 to 3.47.0
Release notes

Sourced from org.checkerframework:checker-qual's releases.

Checker Framework 3.47.0

Version 3.47.0 (September 3, 2024)

User-visible changes:

The Checker Framework runs under JDK 22 -- that is, it runs on a version 22 JVM. The Checker Framework runs under JDK 23 -- that is, it runs on a version 23 JVM.

The Optional Checker no longer supports the @ OptionalBottom annotation.

Implementation details:

Removed annotations:

  • @ OptionalBottom

Closed issues:

#6510, #6704, #6743, #6749, #6760, #6761.

Changelog

Sourced from org.checkerframework:checker-qual's changelog.

Version 3.47.0 (October 1, 2024)

User-visible changes:

Implementation details:

Closed issues:

Version 3.47.0 (September 3, 2024)

User-visible changes:

The Checker Framework runs under JDK 22 -- that is, it runs on a version 22 JVM. The Checker Framework runs under JDK 23 -- that is, it runs on a version 23 JVM.

The Optional Checker no longer supports the @ OptionalBottom annotation.

Implementation details:

Removed annotations:

  • @ OptionalBottom

Closed issues:

#6510, #6704, #6743, #6749, #6760, #6761.

Commits
  • 2f788fe new release 3.47.0
  • 2d0d20b Prep for release.
  • 0aeb0a4 Removing the @ OptionalBottom type and annotation (#6772)
  • 87f9d44 Support Java 23
  • c16094b Remove resolveDependencies target (#6775)
  • c27f651 Don't use /// comments, whose content must be Markdown in Java 23
  • cb70fb7 Update dependency com.amazonaws:aws-java-sdk-bom to v1.12.770 (#6773)
  • 07940f7 Update versions.errorprone to v2.31.0 (#6771)
  • 7b2378e Support Java 22
  • c5cc9d8 Update dependency io.github.classgraph:classgraph to v4.8.175 (#6766)
  • Additional commits viewable in compare view

Updates `org.checkerframework:checker` from 3.46.0 to 3.47.0
(Release notes, changelog, and commit list are identical to those quoted above for org.checkerframework:checker-qual.)

Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: Dane Pitkin --- java/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/java/pom.xml b/java/pom.xml index 02ec57a5032df..808b0ad4d8cc7 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -110,7 +110,7 @@ under the License. 2.31.0 5.11.0 5.2.0 - 3.46.0 + 3.47.0 1.5.8 none -Xdoclint:none From e710b6eb3775f1c269e5ae82737ace449710771a Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Wed, 11 Sep 2024 02:19:59 +0200 Subject: [PATCH 120/186] MINOR: [CI][C++] Enable core dumps and stack traces in Linux/macOS jobs (#43937) ### Rationale for this change In https://github.com/apache/arrow/pull/43936 I noticed that core dumps were not written out for crashing C++ tests. One problem is that, by default, Ubuntu hosts pipe core dumps to `apport`, but it is not available inside containers. Another is that the `ulimit` must be set in the host, not in the container. In addition, this PR restores automatic traceback generation when running C++ tests, on Linux and macOS jobs. ### Are these changes tested? Manually by introducing a spurious segfault and running Docker containers. ### Are there any user-facing changes? No. Lead-authored-by: Antoine Pitrou Co-authored-by: Antoine Pitrou Co-authored-by: Sutou Kouhei Signed-off-by: Sutou Kouhei --- .github/workflows/cpp.yml | 5 ++- .github/workflows/dev.yml | 3 +- .github/workflows/integration.yml | 1 + .github/workflows/java_jni.yml | 4 ++- .github/workflows/js.yml | 3 +- .github/workflows/python.yml | 3 +- .github/workflows/r.yml | 6 ++-- .github/workflows/ruby.yml | 3 +- .github/workflows/swift.yml | 3 +- ci/docker/fedora-39-cpp.dockerfile | 1 + ci/docker/ubuntu-20.04-cpp-minimal.dockerfile | 1 + ci/docker/ubuntu-22.04-cpp-minimal.dockerfile | 1 + ci/docker/ubuntu-24.04-cpp-minimal.dockerfile | 1 + ci/scripts/util_enable_core_dumps.sh | 33 +++++++++++++++++++ cpp/build-support/run-test.sh | 23 ++++++++----- dev/tasks/docker-tests/github.cuda.yml | 1 + dev/tasks/docker-tests/github.linux.yml | 1 + dev/tasks/python-wheels/github.linux.yml | 1 + dev/tasks/r/github.packages.yml | 3 +- docker-compose.yml | 4 +-- 20 files changed, 70 insertions(+), 31 deletions(-) create mode 100644 ci/scripts/util_enable_core_dumps.sh diff --git a/.github/workflows/cpp.yml b/.github/workflows/cpp.yml index 4a01d2f8e3aab..f5c8b6a7201be 100644 --- a/.github/workflows/cpp.yml +++ b/.github/workflows/cpp.yml @@ -155,8 +155,7 @@ jobs: run: | # GH-40558: reduce ASLR to avoid ASAN/LSAN crashes sudo sysctl -w vm.mmap_rnd_bits=28 - sudo sysctl -w kernel.core_pattern="core.%e.%p" - ulimit -c unlimited + source ci/scripts/util_enable_core_dumps.sh archery docker run ${{ matrix.image }} - name: Docker Push if: >- @@ -272,7 +271,7 @@ jobs: shell: bash run: | sudo sysctl -w kern.coredump=1 - sudo sysctl -w kern.corefile=core.%N.%P + sudo sysctl -w kern.corefile=/tmp/core.%N.%P ulimit -c unlimited # must enable within the same shell ci/scripts/cpp_test.sh $(pwd) $(pwd)/build diff --git a/.github/workflows/dev.yml b/.github/workflows/dev.yml index 1cc8d993498b6..3879a045fd239 100644 --- a/.github/workflows/dev.yml +++ b/.github/workflows/dev.yml @@ -67,8 +67,7 @@ jobs: ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }} run: | - sudo sysctl -w kernel.core_pattern="core.%e.%p" - ulimit -c unlimited + source ci/scripts/util_enable_core_dumps.sh archery docker run -e GITHUB_ACTIONS=true ubuntu-lint - name: Docker Push if: >- diff 
--git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index ecf89bff8f600..2d19b1e59b27a 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -101,6 +101,7 @@ jobs: ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }} run: > + source ci/scripts/util_enable_core_dumps.sh archery docker run \ -e ARCHERY_DEFAULT_BRANCH=${{ github.event.repository.default_branch }} \ -e ARCHERY_INTEGRATION_WITH_NANOARROW=1 \ diff --git a/.github/workflows/java_jni.yml b/.github/workflows/java_jni.yml index f2ecc801dc724..e730a5bf3e672 100644 --- a/.github/workflows/java_jni.yml +++ b/.github/workflows/java_jni.yml @@ -81,7 +81,9 @@ jobs: env: ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }} - run: archery docker run java-jni-manylinux-2014 + run: | + source ci/scripts/util_enable_core_dumps.sh + archery docker run java-jni-manylinux-2014 - name: Docker Push if: >- success() && diff --git a/.github/workflows/js.yml b/.github/workflows/js.yml index 17b57c42b62f6..9ab4edf0851cd 100644 --- a/.github/workflows/js.yml +++ b/.github/workflows/js.yml @@ -66,8 +66,7 @@ jobs: ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }} run: | - sudo sysctl -w kernel.core_pattern="core.%e.%p" - ulimit -c unlimited + source ci/scripts/util_enable_core_dumps.sh archery docker run debian-js - name: Docker Push if: >- diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml index 6e83b727593b4..45efd305aa8f6 100644 --- a/.github/workflows/python.yml +++ b/.github/workflows/python.yml @@ -119,8 +119,7 @@ jobs: ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }} run: | - sudo sysctl -w kernel.core_pattern="core.%e.%p" - ulimit -c unlimited + source ci/scripts/util_enable_core_dumps.sh archery docker run ${{ matrix.image }} - name: Docker Push if: >- diff --git a/.github/workflows/r.yml b/.github/workflows/r.yml index bd1631db4f617..92e0e63fb7ea5 100644 --- a/.github/workflows/r.yml +++ b/.github/workflows/r.yml @@ -158,8 +158,7 @@ jobs: ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }} run: | - sudo sysctl -w kernel.core_pattern="core.%e.%p" - ulimit -c unlimited + source ci/scripts/util_enable_core_dumps.sh # Setting a non-default and non-probable Marquesas French Polynesia time # it has both with a .45 offset and very very few people who live there. archery docker run -e TZ=MART -e ARROW_R_FORCE_TESTS=${{ matrix.force-tests }} ubuntu-r @@ -218,8 +217,7 @@ jobs: ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }} run: | - sudo sysctl -w kernel.core_pattern="core.%e.%p" - ulimit -c unlimited + source ci/scripts/util_enable_core_dumps.sh # Don't set a TZ here to test that case. These builds will have the following warning in them: # System has not been booted with systemd as init system (PID 1). Can't operate. 
# Failed to connect to bus: Host is down diff --git a/.github/workflows/ruby.yml b/.github/workflows/ruby.yml index c4a7f31f4a94c..05b7b317ffd96 100644 --- a/.github/workflows/ruby.yml +++ b/.github/workflows/ruby.yml @@ -95,8 +95,7 @@ jobs: ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }} run: | - sudo sysctl -w kernel.core_pattern="core.%e.%p" - ulimit -c unlimited + source ci/scripts/util_enable_core_dumps.sh archery docker run \ -e ARROW_FLIGHT=ON \ -e ARROW_FLIGHT_SQL=ON \ diff --git a/.github/workflows/swift.yml b/.github/workflows/swift.yml index 86eb113dfc833..87aa5cb83f714 100644 --- a/.github/workflows/swift.yml +++ b/.github/workflows/swift.yml @@ -65,8 +65,7 @@ jobs: ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }} run: | - sudo sysctl -w kernel.core_pattern="core.%e.%p" - ulimit -c unlimited + source ci/scripts/util_enable_core_dumps.sh archery docker run ubuntu-swift - name: Docker Push if: >- diff --git a/ci/docker/fedora-39-cpp.dockerfile b/ci/docker/fedora-39-cpp.dockerfile index 33d11823094ce..2ac5afe7b91f6 100644 --- a/ci/docker/fedora-39-cpp.dockerfile +++ b/ci/docker/fedora-39-cpp.dockerfile @@ -34,6 +34,7 @@ RUN dnf update -y && \ curl-devel \ gcc \ gcc-c++ \ + gdb \ gflags-devel \ git \ glog-devel \ diff --git a/ci/docker/ubuntu-20.04-cpp-minimal.dockerfile b/ci/docker/ubuntu-20.04-cpp-minimal.dockerfile index 4d867a448c994..1b342df596c9d 100644 --- a/ci/docker/ubuntu-20.04-cpp-minimal.dockerfile +++ b/ci/docker/ubuntu-20.04-cpp-minimal.dockerfile @@ -29,6 +29,7 @@ RUN apt-get update -y -q && \ ccache \ cmake \ curl \ + gdb \ git \ libssl-dev \ libcurl4-openssl-dev \ diff --git a/ci/docker/ubuntu-22.04-cpp-minimal.dockerfile b/ci/docker/ubuntu-22.04-cpp-minimal.dockerfile index f26cad51f0983..ce31c457e909e 100644 --- a/ci/docker/ubuntu-22.04-cpp-minimal.dockerfile +++ b/ci/docker/ubuntu-22.04-cpp-minimal.dockerfile @@ -29,6 +29,7 @@ RUN apt-get update -y -q && \ ccache \ cmake \ curl \ + gdb \ git \ libssl-dev \ libcurl4-openssl-dev \ diff --git a/ci/docker/ubuntu-24.04-cpp-minimal.dockerfile b/ci/docker/ubuntu-24.04-cpp-minimal.dockerfile index 125bc7ba46a81..a1fd178a2c754 100644 --- a/ci/docker/ubuntu-24.04-cpp-minimal.dockerfile +++ b/ci/docker/ubuntu-24.04-cpp-minimal.dockerfile @@ -29,6 +29,7 @@ RUN apt-get update -y -q && \ ccache \ cmake \ curl \ + gdb \ git \ libssl-dev \ libcurl4-openssl-dev \ diff --git a/ci/scripts/util_enable_core_dumps.sh b/ci/scripts/util_enable_core_dumps.sh new file mode 100644 index 0000000000000..09f8d2d727099 --- /dev/null +++ b/ci/scripts/util_enable_core_dumps.sh @@ -0,0 +1,33 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +# NOTE: this script is not marked executable as it should be source'd +# for `ulimit` to take effect. + +set -e + +platform=$(uname) + +if [ "${platform}" = "Linux" ]; then + # We need to override `core_pattern` because + # 1. the original setting may reference apport, which is not available under + # most Docker containers; + # 2. we want to write the core file in a well-known directory. + sudo sysctl -w kernel.core_pattern="/tmp/core.%e.%p" +fi + +ulimit -c unlimited diff --git a/cpp/build-support/run-test.sh b/cpp/build-support/run-test.sh index 8e42438a23c1c..55e3fe0980749 100755 --- a/cpp/build-support/run-test.sh +++ b/cpp/build-support/run-test.sh @@ -121,12 +121,15 @@ function print_coredumps() { # patterns must be set with prefix `core.{test-executable}*`: # # In case of macOS: - # sudo sysctl -w kern.corefile=core.%N.%P + # sudo sysctl -w kern.corefile=/tmp/core.%N.%P # On Linux: - # sudo sysctl -w kernel.core_pattern=core.%e.%p + # sudo sysctl -w kernel.core_pattern=/tmp/core.%e.%p # # and the ulimit must be increased: # ulimit -c unlimited + # + # If the tests are run in a Docker container, the instructions are slightly + # different: see the 'Coredumps' comment section in `docker-compose.yml`. # filename is truncated to the first 15 characters in case of linux, so limit # the pattern for the first 15 characters @@ -134,19 +137,21 @@ function print_coredumps() { FILENAME=$(echo ${FILENAME} | cut -c-15) PATTERN="^core\.${FILENAME}" - COREFILES=$(ls | grep $PATTERN) + COREFILES=$(ls /tmp | grep $PATTERN) if [ -n "$COREFILES" ]; then - echo "Found core dump, printing backtrace:" - for COREFILE in $COREFILES; do + COREPATH="/tmp/${COREFILE}" + echo "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!" + echo "Running '${TEST_EXECUTABLE}' produced core dump at '${COREPATH}', printing backtrace:" # Print backtrace if [ "$(uname)" == "Darwin" ]; then - lldb -c "${COREFILE}" --batch --one-line "thread backtrace all -e true" + lldb -c "${COREPATH}" --batch --one-line "thread backtrace all -e true" else - gdb -c "${COREFILE}" $TEST_EXECUTABLE -ex "thread apply all bt" -ex "set pagination 0" -batch + gdb -c "${COREPATH}" $TEST_EXECUTABLE -ex "thread apply all bt" -ex "set pagination 0" -batch fi - # Remove the coredump, regenerate it via running the test case directly - rm "${COREFILE}" + echo "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!" 
+ # Remove the coredump, it can be regenerated via running the test case directly + rm "${COREPATH}" done fi } diff --git a/dev/tasks/docker-tests/github.cuda.yml b/dev/tasks/docker-tests/github.cuda.yml index 8c04da8a91a4f..d03b3657afc53 100644 --- a/dev/tasks/docker-tests/github.cuda.yml +++ b/dev/tasks/docker-tests/github.cuda.yml @@ -38,6 +38,7 @@ jobs: env: {{ macros.github_set_sccache_envvars()|indent(8) }} run: | + source arrow/ci/scripts/util_enable_core_dumps.sh archery docker run \ -e SETUPTOOLS_SCM_PRETEND_VERSION="{{ arrow.no_rc_version }}" \ {{ flags|default("") }} \ diff --git a/dev/tasks/docker-tests/github.linux.yml b/dev/tasks/docker-tests/github.linux.yml index 28d3203c1ed48..cd2923a50d6df 100644 --- a/dev/tasks/docker-tests/github.linux.yml +++ b/dev/tasks/docker-tests/github.linux.yml @@ -38,6 +38,7 @@ jobs: run: | # GH-40558: reduce ASLR to avoid TSAN crashing sudo sysctl -w vm.mmap_rnd_bits=28 + source arrow/ci/scripts/util_enable_core_dumps.sh archery docker run \ -e SETUPTOOLS_SCM_PRETEND_VERSION="{{ arrow.no_rc_version }}" \ {{ flags|default("") }} \ diff --git a/dev/tasks/python-wheels/github.linux.yml b/dev/tasks/python-wheels/github.linux.yml index f9df27ba3175b..d9dbef82a948e 100644 --- a/dev/tasks/python-wheels/github.linux.yml +++ b/dev/tasks/python-wheels/github.linux.yml @@ -59,6 +59,7 @@ jobs: - name: Test wheel shell: bash run: | + source arrow/ci/scripts/util_enable_core_dumps.sh archery docker run python-wheel-manylinux-test-imports archery docker run python-wheel-manylinux-test-unittests diff --git a/dev/tasks/r/github.packages.yml b/dev/tasks/r/github.packages.yml index 0539eae6cc9d9..db6955b92d1e0 100644 --- a/dev/tasks/r/github.packages.yml +++ b/dev/tasks/r/github.packages.yml @@ -140,8 +140,7 @@ jobs: UBUNTU: {{ '"${{ matrix.ubuntu }}"' }} {{ macros.github_set_sccache_envvars()|indent(8) }} run: | - sudo sysctl -w kernel.core_pattern="core.%e.%p" - ulimit -c unlimited + source ci/scripts/util_enable_core_dumps.sh archery docker run \ -e EXTRA_CMAKE_FLAGS="{{ '${{ matrix.extra-cmake-flags }}' }}" \ {{ '${{ matrix.os }}' }}-cpp-static diff --git a/docker-compose.yml b/docker-compose.yml index 8721eef524a19..6d9b738d8da35 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -38,11 +38,11 @@ # WARNING: setting this will affect the host machine. # # Linux host: -# $ sudo sysctl -w kernel.core_pattern=core.%e.%p +# $ sudo sysctl -w kernel.core_pattern=/tmp/core.%e.%p # # macOS host running Docker for Mac (won't persist between restarts): # $ screen ~/Library/Containers/com.docker.docker/Data/vms/0/tty -# # echo "core.%e.%p" > /proc/sys/kernel/core_pattern +# # echo "/tmp/core.%e.%p" > /proc/sys/kernel/core_pattern # # The setup attempts to generate coredumps by default, but the correct paths # above must be set. In order to disable the coredump generation set From 2c44bcc053ea7c6b173a26295a4cc333ac9241b0 Mon Sep 17 00:00:00 2001 From: Vibhatha Lakmal Abeykoon Date: Wed, 11 Sep 2024 06:21:41 +0530 Subject: [PATCH 121/186] GH-44044: [Java] Consider warnings as errors for Vector Module (#44045) ### Rationale for this change This PR configures the build such that warnings are considered errors in the Vector module. Corresponding code changes have also been made. ### What changes are included in this PR? Adding flags to consider warnings as errors in javac and fixing the corresponding errors. ### Are these changes tested? Tested by existing test cases. ### Are there any user-facing changes?
N/A * GitHub Issue: #44044 Authored-by: Vibhatha Lakmal Abeykoon Signed-off-by: David Li --- java/vector/pom.xml | 9 +++++++++ java/vector/src/main/codegen/templates/BaseWriter.java | 1 + 2 files changed, 10 insertions(+) diff --git a/java/vector/pom.xml b/java/vector/pom.xml index 73d76fc7306ae..eb0e39565332e 100644 --- a/java/vector/pom.xml +++ b/java/vector/pom.xml @@ -118,6 +118,15 @@ under the License.
+ + org.apache.maven.plugins + maven-compiler-plugin + + + -Werror + + + org.apache.drill.tools diff --git a/java/vector/src/main/codegen/templates/BaseWriter.java b/java/vector/src/main/codegen/templates/BaseWriter.java index 458a4df1eec82..e952d46f1f241 100644 --- a/java/vector/src/main/codegen/templates/BaseWriter.java +++ b/java/vector/src/main/codegen/templates/BaseWriter.java @@ -125,6 +125,7 @@ public interface StructOrListWriter { /** * @deprecated use {@link #listOfStruct()} instead. */ + @Deprecated StructOrListWriter listoftstruct(String name); StructOrListWriter listOfStruct(String name); StructOrListWriter list(String name); From 2a793d6410539cf72f49f0bd06e1087d56c99d9f Mon Sep 17 00:00:00 2001 From: Vibhatha Lakmal Abeykoon Date: Wed, 11 Sep 2024 06:49:33 +0530 Subject: [PATCH 122/186] GH-43962: [Java] Consider warnings as errors for Adapter Module (#43963) ### Rationale for this change This PR configures the build such that warnings are considered errors in the Adapter module. Corresponding code changes have also been made. ### What changes are included in this PR? Adding flags to consider warnings as errors in javac and fixing the corresponding errors. ### Are these changes tested? Tested by existing test cases. ### Are there any user-facing changes? N/A * GitHub Issue: #43962 Authored-by: Vibhatha Lakmal Abeykoon Signed-off-by: David Li --- java/adapter/avro/pom.xml | 14 ++++++++++++++ .../adapter/avro/AvroToArrowIteratorTest.java | 1 + java/adapter/jdbc/pom.xml | 9 +++++++++ .../adapter/jdbc/h2/JdbcToArrowCharSetTest.java | 1 + .../adapter/jdbc/h2/JdbcToArrowDataTypesTest.java | 1 + .../jdbc/h2/JdbcToArrowMapDataTypeTest.java | 1 + .../arrow/adapter/jdbc/h2/JdbcToArrowNullTest.java | 1 + .../jdbc/h2/JdbcToArrowOptionalColumnsTest.java | 1 + .../arrow/adapter/jdbc/h2/JdbcToArrowTest.java | 1 + .../adapter/jdbc/h2/JdbcToArrowTimeZoneTest.java | 1 + .../jdbc/h2/JdbcToArrowVectorIteratorTest.java | 1 + java/adapter/orc/pom.xml | 9 +++++++++ 12 files changed, 41 insertions(+) diff --git a/java/adapter/avro/pom.xml b/java/adapter/avro/pom.xml index cb4adccb76771..2c02e72e9c838 100644 --- a/java/adapter/avro/pom.xml +++ b/java/adapter/avro/pom.xml @@ -56,4 +56,18 @@ under the License. ${dep.avro.version} + + + + org.apache.maven.plugins + maven-compiler-plugin + + + -Werror + + + + + diff --git a/java/adapter/avro/src/test/java/org/apache/arrow/adapter/avro/AvroToArrowIteratorTest.java b/java/adapter/avro/src/test/java/org/apache/arrow/adapter/avro/AvroToArrowIteratorTest.java index f8022a9385134..44ccbc74511dd 100644 --- a/java/adapter/avro/src/test/java/org/apache/arrow/adapter/avro/AvroToArrowIteratorTest.java +++ b/java/adapter/avro/src/test/java/org/apache/arrow/adapter/avro/AvroToArrowIteratorTest.java @@ -50,6 +50,7 @@ public class AvroToArrowIteratorTest extends AvroTestBase { @BeforeEach + @Override public void init() { final BufferAllocator allocator = new RootAllocator(Long.MAX_VALUE); this.config = new AvroToArrowConfigBuilder(allocator).setTargetBatchSize(3).build(); diff --git a/java/adapter/jdbc/pom.xml b/java/adapter/jdbc/pom.xml index 099798a95cd25..5ebb4089cf72f 100644 --- a/java/adapter/jdbc/pom.xml +++ b/java/adapter/jdbc/pom.xml @@ -116,6 +116,15 @@ under the License.
--add-reads=org.apache.arrow.adapter.jdbc=com.fasterxml.jackson.dataformat.yaml --add-opens=java.base/java.nio=org.apache.arrow.memory.core,ALL-UNNAMED -Duser.timezone=UTC + + org.apache.maven.plugins + maven-compiler-plugin + + + -Werror + + + diff --git a/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowCharSetTest.java b/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowCharSetTest.java index 726e1905c4242..39c0085603f17 100644 --- a/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowCharSetTest.java +++ b/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowCharSetTest.java @@ -91,6 +91,7 @@ public static Stream getTestData() */ @ParameterizedTest @MethodSource("getTestData") + @Override public void testJdbcToArrowValues(Table table) throws SQLException, IOException, ClassNotFoundException { this.initializeDatabase(table); diff --git a/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowDataTypesTest.java b/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowDataTypesTest.java index c246bb2bec47e..2274f51745973 100644 --- a/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowDataTypesTest.java +++ b/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowDataTypesTest.java @@ -145,6 +145,7 @@ public static Stream getTestData() /** Test Method to test JdbcToArrow Functionality for various H2 DB based datatypes. */ @ParameterizedTest @MethodSource("getTestData") + @Override public void testJdbcToArrowValues(Table table) throws SQLException, IOException, ClassNotFoundException { this.initializeDatabase(table); diff --git a/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowMapDataTypeTest.java b/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowMapDataTypeTest.java index 1daeda6772b26..456d338f6bd75 100644 --- a/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowMapDataTypeTest.java +++ b/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowMapDataTypeTest.java @@ -45,6 +45,7 @@ public static Stream getTestData() throws IOException { /** Test Method to test JdbcToArrow Functionality for Map form Types.OTHER column. 
*/ @ParameterizedTest @MethodSource("getTestData") + @Override public void testJdbcToArrowValues(Table table) throws SQLException, IOException, ClassNotFoundException { this.initializeDatabase(table); diff --git a/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowNullTest.java b/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowNullTest.java index 205b7e16f2f09..2009268980afe 100644 --- a/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowNullTest.java +++ b/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowNullTest.java @@ -113,6 +113,7 @@ public static Stream getTestData() */ @ParameterizedTest @MethodSource("getTestData") + @Override public void testJdbcToArrowValues(Table table) throws SQLException, IOException, ClassNotFoundException { this.initializeDatabase(table); diff --git a/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowOptionalColumnsTest.java b/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowOptionalColumnsTest.java index 382d20f45d4b1..2108afec4c945 100644 --- a/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowOptionalColumnsTest.java +++ b/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowOptionalColumnsTest.java @@ -59,6 +59,7 @@ public static Stream getTestData() */ @ParameterizedTest @MethodSource("getTestData") + @Override public void testJdbcToArrowValues(Table table) throws SQLException, IOException, ClassNotFoundException { this.initializeDatabase(table); diff --git a/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowTest.java b/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowTest.java index 7966f62e175e3..bea7d4d37c50e 100644 --- a/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowTest.java +++ b/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowTest.java @@ -87,6 +87,7 @@ public static Stream getTestData() */ @ParameterizedTest @MethodSource("getTestData") + @Override public void testJdbcToArrowValues(Table table) throws SQLException, IOException, ClassNotFoundException { this.initializeDatabase(table); diff --git a/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowTimeZoneTest.java b/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowTimeZoneTest.java index 0f60c89d1c03c..14396997d2863 100644 --- a/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowTimeZoneTest.java +++ b/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowTimeZoneTest.java @@ -91,6 +91,7 @@ public static Stream getTestData() */ @ParameterizedTest @MethodSource("getTestData") + @Override public void testJdbcToArrowValues(Table table) throws SQLException, IOException, ClassNotFoundException { this.initializeDatabase(table); diff --git a/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowVectorIteratorTest.java b/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowVectorIteratorTest.java index 40fd39ac0c555..de9eff327ef6f 100644 --- a/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowVectorIteratorTest.java +++ b/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowVectorIteratorTest.java @@ -76,6 +76,7 @@ public class JdbcToArrowVectorIteratorTest extends JdbcToArrowTest { @ParameterizedTest 
@MethodSource("getTestData") + @Override public void testJdbcToArrowValues(Table table) throws SQLException, IOException, ClassNotFoundException { this.initializeDatabase(table); diff --git a/java/adapter/orc/pom.xml b/java/adapter/orc/pom.xml index d9cd2bb21a526..cf35397c9917b 100644 --- a/java/adapter/orc/pom.xml +++ b/java/adapter/orc/pom.xml @@ -160,6 +160,15 @@ under the License. + + org.apache.maven.plugins + maven-compiler-plugin + + + -Werror + + + From 8556001e6a8b4c7f35d4e18c28704d7811005904 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Wed, 11 Sep 2024 11:02:26 +0900 Subject: [PATCH 123/186] GH-44006: [GLib][Parquet] Add `gparquet_arrow_file_writer_new_row_group()` (#44039) ### Rationale for this change This is a low-level API to control how to write data. This is for advanced users. ### What changes are included in this PR? `gparquet_arrow_file_writer_write_chunked_array()` is also added to write a test for `gparquet_arrow_file_writer_new_row_group()`. ### Are these changes tested? Yes. ### Are there any user-facing changes? Yes. * GitHub Issue: #44006 Authored-by: Sutou Kouhei Signed-off-by: Sutou Kouhei --- c_glib/parquet-glib/arrow-file-writer.cpp | 50 +++++++++++++++++-- c_glib/parquet-glib/arrow-file-writer.h | 14 +++++- c_glib/test/parquet/test-arrow-file-writer.rb | 30 +++++++++++ 3 files changed, 90 insertions(+), 4 deletions(-) diff --git a/c_glib/parquet-glib/arrow-file-writer.cpp b/c_glib/parquet-glib/arrow-file-writer.cpp index 0d0e87e7e3ede..7a672f1f21dcc 100644 --- a/c_glib/parquet-glib/arrow-file-writer.cpp +++ b/c_glib/parquet-glib/arrow-file-writer.cpp @@ -548,13 +548,57 @@ gparquet_arrow_file_writer_write_record_batch(GParquetArrowFileWriter *writer, gboolean gparquet_arrow_file_writer_write_table(GParquetArrowFileWriter *writer, GArrowTable *table, - guint64 chunk_size, + gsize chunk_size, GError **error) { auto parquet_arrow_file_writer = gparquet_arrow_file_writer_get_raw(writer); auto arrow_table = garrow_table_get_raw(table).get(); - auto status = parquet_arrow_file_writer->WriteTable(*arrow_table, chunk_size); - return garrow_error_check(error, status, "[parquet][arrow][file-writer][write-table]"); + return garrow::check(error, + parquet_arrow_file_writer->WriteTable(*arrow_table, chunk_size), + "[parquet][arrow][file-writer][write-table]"); +} + +/** + * gparquet_arrow_file_writer_new_row_group: + * @writer: A #GParquetArrowFileWriter. + * @chunk_size: The max number of rows in a row group. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: %TRUE on success, %FALSE if there was an error. + * + * Since: 18.0.0 + */ +gboolean +gparquet_arrow_file_writer_new_row_group(GParquetArrowFileWriter *writer, + gsize chunk_size, + GError **error) +{ + auto parquet_arrow_file_writer = gparquet_arrow_file_writer_get_raw(writer); + return garrow::check(error, + parquet_arrow_file_writer->NewRowGroup(chunk_size), + "[parquet][arrow][file-writer][new-row-group]"); +} + +/** + * gparquet_arrow_file_writer_write_chunked_array: + * @writer: A #GParquetArrowFileWriter. + * @chunked_array: A #GArrowChunkedArray to be written. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: %TRUE on success, %FALSE if there was an error. 
+ * + * Since: 18.0.0 + */ +gboolean +gparquet_arrow_file_writer_write_chunked_array(GParquetArrowFileWriter *writer, + GArrowChunkedArray *chunked_array, + GError **error) +{ + auto parquet_arrow_file_writer = gparquet_arrow_file_writer_get_raw(writer); + auto arrow_chunked_array = garrow_chunked_array_get_raw(chunked_array); + return garrow::check(error, + parquet_arrow_file_writer->WriteColumnChunk(arrow_chunked_array), + "[parquet][arrow][file-writer][write-chunked-array]"); } /** diff --git a/c_glib/parquet-glib/arrow-file-writer.h b/c_glib/parquet-glib/arrow-file-writer.h index 7eb14fe27a8bf..40595bdfef4b9 100644 --- a/c_glib/parquet-glib/arrow-file-writer.h +++ b/c_glib/parquet-glib/arrow-file-writer.h @@ -130,9 +130,21 @@ GPARQUET_AVAILABLE_IN_0_11 gboolean gparquet_arrow_file_writer_write_table(GParquetArrowFileWriter *writer, GArrowTable *table, - guint64 chunk_size, + gsize chunk_size, GError **error); +GPARQUET_AVAILABLE_IN_18_0 +gboolean +gparquet_arrow_file_writer_new_row_group(GParquetArrowFileWriter *writer, + gsize chunk_size, + GError **error); + +GPARQUET_AVAILABLE_IN_18_0 +gboolean +gparquet_arrow_file_writer_write_chunked_array(GParquetArrowFileWriter *writer, + GArrowChunkedArray *chunked_array, + GError **error); + GPARQUET_AVAILABLE_IN_0_11 gboolean gparquet_arrow_file_writer_close(GParquetArrowFileWriter *writer, GError **error); diff --git a/c_glib/test/parquet/test-arrow-file-writer.rb b/c_glib/test/parquet/test-arrow-file-writer.rb index e348c9b679524..89db16c6fb90b 100644 --- a/c_glib/test/parquet/test-arrow-file-writer.rb +++ b/c_glib/test/parquet/test-arrow-file-writer.rb @@ -82,4 +82,34 @@ def test_write_table reader.unref end end + + def test_write_chunked_array + schema = build_schema("enabled" => :boolean) + writer = Parquet::ArrowFileWriter.new(schema, @file.path) + writer.new_row_group(2) + chunked_array = Arrow::ChunkedArray.new([build_boolean_array([true, nil])]) + writer.write_chunked_array(chunked_array) + writer.new_row_group(1) + chunked_array = Arrow::ChunkedArray.new([build_boolean_array([false])]) + writer.write_chunked_array(chunked_array) + writer.close + + reader = Parquet::ArrowFileReader.new(@file.path) + begin + reader.use_threads = true + assert_equal([ + 2, + build_table("enabled" => [ + build_boolean_array([true, nil]), + build_boolean_array([false]), + ]), + ], + [ + reader.n_row_groups, + reader.read_table, + ]) + ensure + reader.unref + end + end end From 170ea048f3e639770c97f7a5972b8442cd62fb22 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Wed, 11 Sep 2024 15:21:24 +0900 Subject: [PATCH 124/186] GH-44050: [CI][Integration] Execute integration test again (#44051) ### Rationale for this change `>` in YAML removes newlines. ### What changes are included in this PR? Use `|` instead of `>` to keep newlines. ### Are these changes tested? Yes. ### Are there any user-facing changes? No. 
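More precisely, a folded scalar (`>`) joins the block's lines with spaces, so `archery docker run ...` became extra arguments to the `source` command rather than a second shell command; a literal scalar (`|`) preserves the newlines. A minimal sketch of the difference (the `archery docker run` argument here is an illustrative placeholder, not the exact workflow contents):

```yaml
# Broken: '>' folds the newline into a space, so the block becomes one shell
# command and `archery docker run ...` is merely passed as extra arguments
# to `source`, so the integration tests never actually run.
run: >
  source ci/scripts/util_enable_core_dumps.sh
  archery docker run conda-integration

# Fixed: '|' keeps the newline, so both commands execute.
run: |
  source ci/scripts/util_enable_core_dumps.sh
  archery docker run conda-integration
```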
* GitHub Issue: #44050 Authored-by: Sutou Kouhei Signed-off-by: Sutou Kouhei --- .github/workflows/integration.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index 2d19b1e59b27a..b73f900e616f5 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -100,7 +100,7 @@ jobs: env: ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }} - run: > + run: | source ci/scripts/util_enable_core_dumps.sh archery docker run \ -e ARCHERY_DEFAULT_BRANCH=${{ github.event.repository.default_branch }} \ From 21f59689750dae87cbded9146ca6b4b00396722c Mon Sep 17 00:00:00 2001 From: Dane Pitkin Date: Wed, 11 Sep 2024 04:26:48 -0400 Subject: [PATCH 125/186] GH-43973: [Python] Table fails gracefully on non-cpu devices (#43974) ## Rationale for this change Table APIs should raise a Python exception instead of segfaulting if they don't support operating on non-cpu memory. ### What changes are included in this PR? * Add is_cpu() property to Table * Add _assert_cpu() checks to Table APIs that only support operating on cpu memory ### Are these changes tested? * Unit tests ### Are there any user-facing changes? No, besides receiving a friendlier error in certain scenarios. * GitHub Issue: #43973 Lead-authored-by: Dane Pitkin Co-authored-by: Joris Van den Bossche Signed-off-by: Joris Van den Bossche --- python/pyarrow/lib.pxd | 2 + python/pyarrow/table.pxi | 30 ++++ python/pyarrow/tests/test_table.py | 244 +++++++++++++++++++++++++++-- 3 files changed, 265 insertions(+), 11 deletions(-) diff --git a/python/pyarrow/lib.pxd b/python/pyarrow/lib.pxd index 1caf58e20e653..25a7945dc3ddc 100644 --- a/python/pyarrow/lib.pxd +++ b/python/pyarrow/lib.pxd @@ -525,6 +525,8 @@ cdef class Table(_Tabular): cdef: shared_ptr[CTable] sp_table CTable* table + c_bool _is_cpu + c_bool _init_is_cpu cdef void init(self, const shared_ptr[CTable]& table) diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi index 3b0df981e017c..819bbc34c66b9 100644 --- a/python/pyarrow/table.pxi +++ b/python/pyarrow/table.pxi @@ -4180,6 +4180,7 @@ cdef class Table(_Tabular): def __cinit__(self): self.table = NULL + self._init_is_cpu = False cdef void init(self, const shared_ptr[CTable]& table): self.sp_table = table @@ -4205,6 +4206,7 @@ cdef class Table(_Tabular): ArrowInvalid """ if full: + self._assert_cpu() with nogil: check_status(self.table.ValidateFull()) else: @@ -4214,6 +4216,7 @@ cdef class Table(_Tabular): def __reduce__(self): # Reduce the columns as ChunkedArrays to avoid serializing schema # data twice + self._assert_cpu() columns = [col for col in self.columns] return _reconstruct_table, (columns, self.schema) @@ -4452,6 +4455,7 @@ cdef class Table(_Tabular): a.year: [[null,2022]] month: [[4,6]] """ + self._assert_cpu() cdef: shared_ptr[CTable] flattened CMemoryPool* pool = maybe_unbox_memory_pool(memory_pool) @@ -4499,6 +4503,7 @@ cdef class Table(_Tabular): n_legs: [[2,2,4,4,5,100]] animals: [["Flamingo","Parrot","Dog","Horse","Brittle stars","Centipede"]] """ + self._assert_cpu() cdef: shared_ptr[CTable] combined CMemoryPool* pool = maybe_unbox_memory_pool(memory_pool) @@ -4556,6 +4561,7 @@ cdef class Table(_Tabular): ["Flamingo","Parrot","Dog","Horse","Brittle stars","Centipede"] -- indices: [3,4,5]] """ + self._assert_cpu() cdef: CMemoryPool* pool = maybe_unbox_memory_pool(memory_pool) shared_ptr[CTable] c_result @@ -4601,6 +4607,7 @@ cdef class Table(_Tabular):
>>> table.equals(table_1, check_metadata=True) False """ + self._assert_cpu() if other is None: return False @@ -4658,6 +4665,7 @@ cdef class Table(_Tabular): n_legs: [[2,4,5,100]] animals: [["Flamingo","Horse","Brittle stars","Centipede"]] """ + self._assert_cpu() cdef: ChunkedArray column, casted Field field @@ -4909,6 +4917,7 @@ cdef class Table(_Tabular): ------- ChunkedArray """ + self._assert_cpu() return chunked_array([ batch.to_struct_array() for batch in self.to_batches(max_chunksize=max_chunksize) @@ -5118,6 +5127,7 @@ cdef class Table(_Tabular): def _to_pandas(self, options, categories=None, ignore_metadata=False, types_mapper=None): + self._assert_cpu() from pyarrow.pandas_compat import table_to_dataframe df = table_to_dataframe( options, self, categories, @@ -5239,6 +5249,7 @@ cdef class Table(_Tabular): >>> table.nbytes 72 """ + self._assert_cpu() cdef: CResult[int64_t] c_res_buffer @@ -5268,6 +5279,7 @@ cdef class Table(_Tabular): >>> table.get_total_buffer_size() 76 """ + self._assert_cpu() cdef: int64_t total_buffer_size @@ -5576,6 +5588,7 @@ cdef class Table(_Tabular): year: [[2020,2022,2021,2019]] n_legs_sum: [[2,6,104,5]] """ + self._assert_cpu() return TableGroupBy(self, keys, use_threads=use_threads) def join(self, right_table, keys, right_keys=None, join_type="left outer", @@ -5685,6 +5698,7 @@ cdef class Table(_Tabular): n_legs: [[100]] animal: [["Centipede"]] """ + self._assert_cpu() if right_keys is None: right_keys = keys return _pac()._perform_join( @@ -5772,6 +5786,7 @@ cdef class Table(_Tabular): n_legs: [[null,5,null,5,null]] animal: [[null,"Brittle stars",null,"Brittle stars",null]] """ + self._assert_cpu() if right_on is None: right_on = on if right_by is None: @@ -5797,8 +5812,23 @@ cdef class Table(_Tabular): ------- PyCapsule """ + self._assert_cpu() return self.to_reader().__arrow_c_stream__(requested_schema) + @property + def is_cpu(self): + """ + Whether all ChunkedArrays are CPU-accessible. 
+ """ + if not self._init_is_cpu: + self._is_cpu = all(c.is_cpu for c in self.itercolumns()) + self._init_is_cpu = True + return self._is_cpu + + cdef void _assert_cpu(self) except *: + if not self.is_cpu: + raise NotImplementedError("Implemented only for data on CPU device") + def _reconstruct_table(arrays, schema): """ diff --git a/python/pyarrow/tests/test_table.py b/python/pyarrow/tests/test_table.py index c3f805b4b32d6..b66a5eb083cc5 100644 --- a/python/pyarrow/tests/test_table.py +++ b/python/pyarrow/tests/test_table.py @@ -3430,6 +3430,21 @@ def cuda_recordbatch(cuda_context, cpu_recordbatch): return cpu_recordbatch.copy_to(cuda_context.memory_manager) +@pytest.fixture +def cpu_table(schema, cpu_chunked_array): + return pa.table([cpu_chunked_array, cpu_chunked_array], schema=schema) + + +@pytest.fixture +def cuda_table(schema, cuda_chunked_array): + return pa.table([cuda_chunked_array, cuda_chunked_array], schema=schema) + + +@pytest.fixture +def cpu_and_cuda_table(schema, cpu_chunked_array, cuda_chunked_array): + return pa.table([cpu_chunked_array, cuda_chunked_array], schema=schema) + + def test_chunked_array_non_cpu(cuda_context, cpu_chunked_array, cuda_chunked_array, cpu_and_cuda_chunked_array): # type test @@ -3586,6 +3601,9 @@ def verify_cuda_recordbatch(batch, expected_schema): def test_recordbatch_non_cpu(cuda_context, cpu_recordbatch, cuda_recordbatch, cuda_arrays, schema): verify_cuda_recordbatch(cuda_recordbatch, expected_schema=schema) + N = cuda_recordbatch.num_rows + + # shape test assert cuda_recordbatch.shape == (5, 2) # columns() test @@ -3593,24 +3611,26 @@ def test_recordbatch_non_cpu(cuda_context, cpu_recordbatch, cuda_recordbatch, # add_column(), set_column() test for fn in [cuda_recordbatch.add_column, cuda_recordbatch.set_column]: - col = pa.array([6, 7, 8, 9, 10], pa.int8()).copy_to(cuda_context.memory_manager) + col = pa.array([-2, -1, 0, 1, 2], pa.int8() + ).copy_to(cuda_context.memory_manager) new_batch = fn(2, 'c2', col) - assert len(new_batch.columns) == 3 - for c in new_batch.columns: - assert c.device_type == pa.DeviceAllocationType.CUDA + verify_cuda_recordbatch( + new_batch, expected_schema=schema.append(pa.field('c2', pa.int8()))) err_msg = ("Got column on device , " "but expected .") with pytest.raises(TypeError, match=err_msg): - fn(2, 'c2', [1, 1, 1, 1, 1]) + fn(2, 'c2', [1] * N) # remove_column() test new_batch = cuda_recordbatch.remove_column(1) verify_cuda_recordbatch(new_batch, expected_schema=schema.remove(1)) # drop_columns() test - new_batch = cuda_recordbatch.drop_columns(['c0', 'c1']) - assert len(new_batch.columns) == 0 - assert new_batch.device_type == pa.DeviceAllocationType.CUDA + new_batch = cuda_recordbatch.drop_columns(['c1']) + verify_cuda_recordbatch(new_batch, expected_schema=schema.remove(1)) + empty_batch = cuda_recordbatch.drop_columns(['c0', 'c1']) + assert len(empty_batch.columns) == 0 + assert empty_batch.device_type == pa.DeviceAllocationType.CUDA # select() test new_batch = cuda_recordbatch.select(['c0']) @@ -3622,8 +3642,7 @@ def test_recordbatch_non_cpu(cuda_context, cpu_recordbatch, cuda_recordbatch, cuda_recordbatch.cast(new_schema) # drop_null() test - null_col = pa.array([-2, -1, 0, 1, 2], - mask=[True, False, True, False, True]).copy_to( + null_col = pa.array([1] * N, mask=[True, False, True, False, True]).copy_to( cuda_context.memory_manager) cuda_recordbatch_with_nulls = cuda_recordbatch.add_column(2, 'c2', null_col) with pytest.raises(NotImplementedError): @@ -3631,7 +3650,7 @@ def 
test_recordbatch_non_cpu(cuda_context, cpu_recordbatch, cuda_recordbatch, # filter() test with pytest.raises(NotImplementedError): - cuda_recordbatch.filter([True] * 5) + cuda_recordbatch.filter([True] * N) # take() test with pytest.raises(NotImplementedError): @@ -3737,3 +3756,206 @@ def test_recordbatch_non_cpu(cuda_context, cpu_recordbatch, cuda_recordbatch, # __dataframe__() test with pytest.raises(NotImplementedError): from_dataframe(cuda_recordbatch.__dataframe__()) + + +def verify_cuda_table(table, expected_schema): + table.validate() + assert table.is_cpu is False + assert table.num_columns == len(expected_schema.names) + assert table.column_names == expected_schema.names + assert str(table) in repr(table) + for c in table.columns: + assert c.is_cpu is False + for chunk in c.iterchunks(): + assert chunk.is_cpu is False + assert chunk.device_type == pa.DeviceAllocationType.CUDA + assert table.schema == expected_schema + + +def test_table_non_cpu(cuda_context, cpu_table, cuda_table, + cuda_arrays, cuda_recordbatch, schema): + verify_cuda_table(cuda_table, expected_schema=schema) + N = cuda_table.num_rows + + # shape test + assert cuda_table.shape == (10, 2) + + # columns() test + assert len(cuda_table.columns) == 2 + + # add_column(), set_column() test + for fn in [cuda_table.add_column, cuda_table.set_column]: + cpu_col = pa.array([1] * N, pa.int8()) + cuda_col = cpu_col.copy_to(cuda_context.memory_manager) + new_table = fn(2, 'c2', cuda_col) + verify_cuda_table(new_table, expected_schema=schema.append( + pa.field('c2', pa.int8()))) + new_table = fn(2, 'c2', cpu_col) + assert new_table.is_cpu is False + assert new_table.column(0).is_cpu is False + assert new_table.column(1).is_cpu is False + assert new_table.column(2).is_cpu is True + + # remove_column() test + new_table = cuda_table.remove_column(1) + verify_cuda_table(new_table, expected_schema=schema.remove(1)) + + # drop_columns() test + new_table = cuda_table.drop_columns(['c1']) + verify_cuda_table(new_table, expected_schema=schema.remove(1)) + new_table = cuda_table.drop_columns(['c0', 'c1']) + assert len(new_table.columns) == 0 + assert new_table.is_cpu + + # select() test + new_table = cuda_table.select(['c0']) + verify_cuda_table(new_table, expected_schema=schema.remove(1)) + + # cast() test + new_schema = pa.schema([pa.field('c0', pa.int64()), pa.field('c1', pa.int64())]) + with pytest.raises(NotImplementedError): + cuda_table.cast(new_schema) + + # drop_null() test + null_col = pa.array([1] * N, mask=[True] * N).copy_to(cuda_context.memory_manager) + cuda_table_with_nulls = cuda_table.add_column(2, 'c2', null_col) + with pytest.raises(NotImplementedError): + cuda_table_with_nulls.drop_null() + + # filter() test + with pytest.raises(NotImplementedError): + cuda_table.filter([True] * N) + + # take() test + with pytest.raises(NotImplementedError): + cuda_table.take([0]) + + # sort_by() test + with pytest.raises(NotImplementedError): + cuda_table.sort_by('c0') + + # field() test + assert cuda_table.field(0) == schema.field(0) + assert cuda_table.field(1) == schema.field(1) + + # equals() test + with pytest.raises(NotImplementedError): + assert cuda_table.equals(cpu_table) + + # from_arrays() test + new_table = pa.Table.from_arrays(cuda_arrays, ['c0', 'c1']) + verify_cuda_table(new_table, expected_schema=schema) + + # from_pydict() test + new_table = pa.Table.from_pydict({'c0': cuda_arrays[0], 'c1': cuda_arrays[1]}) + verify_cuda_table(new_table, expected_schema=schema) + + # from_struct_array() test + fields = 
[schema.field(i) for i in range(len(schema.names))] + struct_array = pa.StructArray.from_arrays(cuda_arrays, fields=fields) + with pytest.raises(NotImplementedError): + pa.Table.from_struct_array(struct_array) + + # from_batches() test + new_table = pa.Table.from_batches([cuda_recordbatch, cuda_recordbatch], schema) + verify_cuda_table(new_table, expected_schema=schema) + + # nbytes test + with pytest.raises(NotImplementedError): + assert cuda_table.nbytes + + # get_total_buffer_size() test + with pytest.raises(NotImplementedError): + assert cuda_table.get_total_buffer_size() + + # to_pydict() test + with pytest.raises(NotImplementedError): + cuda_table.to_pydict() + + # to_pylist() test + with pytest.raises(NotImplementedError): + cuda_table.to_pylist() + + # to_pandas() test + with pytest.raises(NotImplementedError): + cuda_table.to_pandas() + + # to_struct_array() test + with pytest.raises(NotImplementedError): + cuda_table.to_struct_array() + + # to_batches() test + batches = cuda_table.to_batches(max_chunksize=5) + for batch in batches: + # GH-44049 + with pytest.raises(AssertionError): + verify_cuda_recordbatch(batch, expected_schema=schema) + + # to_reader() test + reader = cuda_table.to_reader(max_chunksize=5) + for batch in reader: + # GH-44049 + with pytest.raises(AssertionError): + verify_cuda_recordbatch(batch, expected_schema=schema) + + # slice() test + new_table = cuda_table.slice(1, 3) + verify_cuda_table(new_table, expected_schema=schema) + assert new_table.num_rows == 3 + + # replace_schema_metadata() test + new_table = cuda_table.replace_schema_metadata({b'key': b'value'}) + verify_cuda_table(new_table, expected_schema=schema) + assert new_table.schema.metadata == {b'key': b'value'} + + # rename_columns() test + new_table = cuda_table.rename_columns(['col0', 'col1']) + expected_schema = pa.schema( + [pa.field('col0', schema.field(0).type), + pa.field('col1', schema.field(1).type)]) + verify_cuda_table(new_table, expected_schema=expected_schema) + + # validate() test + cuda_table.validate() + with pytest.raises(NotImplementedError): + cuda_table.validate(full=True) + + # flatten() test + with pytest.raises(NotImplementedError): + cuda_table.flatten() + + # combine_chunks() test + with pytest.raises(NotImplementedError): + cuda_table.flatten() + + # unify_dictionaries() test + with pytest.raises(NotImplementedError): + cuda_table.unify_dictionaries() + + # group_by() test + with pytest.raises(NotImplementedError): + cuda_table.group_by('c0') + + # join() test + with pytest.raises(NotImplementedError): + cuda_table.join(cuda_table, 'c0') + + # join_asof() test + with pytest.raises(NotImplementedError): + cuda_table.join_asof(cuda_table, 'c0', 'c0', 0) + + # __array__() test + with pytest.raises(NotImplementedError): + cuda_table.__array__() + + # __arrow_c_stream__() test + with pytest.raises(NotImplementedError): + cuda_table.__arrow_c_stream__() + + # __dataframe__() test + with pytest.raises(NotImplementedError): + from_dataframe(cuda_table.__dataframe__()) + + # __reduce__() test + with pytest.raises(NotImplementedError): + cuda_table.__reduce__() From 27acf8bf6e16d4c53e1ed59d5ef46ac7db0306ea Mon Sep 17 00:00:00 2001 From: Pradeep Gollakota Date: Wed, 11 Sep 2024 03:45:08 -0700 Subject: [PATCH 126/186] GH-32538: [C++][Parquet] Add JSON canonical extension type (#13901) Arrow now provides a canonical extension type for JSON data. This extension is backed by utf8(). Parquet will recognize this extension and appropriately propagate the LogicalType to the storage format. 
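As a usage illustration, here is a minimal sketch (not part of this patch; the helper name `MakeJsonExtensionArray` is hypothetical) that wraps a plain utf8 storage array in the new `arrow.json` extension type, mirroring the usage in the json_test.cc added below:

```cpp
#include <memory>

#include "arrow/api.h"
#include "arrow/extension/json.h"

arrow::Result<std::shared_ptr<arrow::Array>> MakeJsonExtensionArray() {
  // Storage is plain utf8; each element holds one serialized JSON document.
  arrow::StringBuilder builder;
  ARROW_RETURN_NOT_OK(builder.AppendValues(
      {R"("a json string")", "[1, 2, 3]", R"({"obj": "a simple json object"})"}));
  std::shared_ptr<arrow::Array> storage;
  ARROW_RETURN_NOT_OK(builder.Finish(&storage));
  // arrow.json is an extension type over utf8, so WrapArray attaches the
  // extension type to the existing storage buffers without copying them.
  return arrow::ExtensionType::WrapArray(arrow::extension::json(), storage);
}
```

When such a column is written with the Parquet Arrow writer, the schema conversion changes below map it to the Parquet JSON logical type, and the reader can rebuild the extension type when extension-type reading is enabled in the `ArrowReaderProperties`.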
* GitHub Issue: #32538 Lead-authored-by: Rok Mihevc Co-authored-by: Pradeep Gollakota Co-authored-by: Antoine Pitrou Co-authored-by: mwish Co-authored-by: Antoine Pitrou Signed-off-by: Antoine Pitrou --- cpp/src/arrow/CMakeLists.txt | 1 + cpp/src/arrow/array/validate.cc | 20 +++- cpp/src/arrow/extension/CMakeLists.txt | 2 +- .../extension/fixed_shape_tensor_test.cc | 6 +- cpp/src/arrow/extension/json.cc | 61 ++++++++++++ cpp/src/arrow/extension/json.h | 56 +++++++++++ cpp/src/arrow/extension/json_test.cc | 83 ++++++++++++++++ cpp/src/arrow/extension/uuid_test.cc | 4 +- cpp/src/arrow/extension_type.cc | 4 +- cpp/src/arrow/extension_type_test.cc | 6 +- cpp/src/arrow/ipc/test_common.cc | 17 ++-- cpp/src/arrow/ipc/test_common.h | 4 +- cpp/src/arrow/testing/gtest_util.cc | 1 + .../parquet/arrow/arrow_reader_writer_test.cc | 61 +++++++++++- cpp/src/parquet/arrow/arrow_schema_test.cc | 94 ++++++++++++++++++- cpp/src/parquet/arrow/schema.cc | 46 ++++++--- cpp/src/parquet/arrow/schema_internal.cc | 24 +++-- cpp/src/parquet/arrow/schema_internal.h | 8 +- cpp/src/parquet/properties.h | 16 +++- docs/source/status.rst | 2 +- 20 files changed, 460 insertions(+), 56 deletions(-) create mode 100644 cpp/src/arrow/extension/json.cc create mode 100644 cpp/src/arrow/extension/json.h create mode 100644 cpp/src/arrow/extension/json_test.cc diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index 01ac813f4713b..e77a02d0c0800 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -376,6 +376,7 @@ set(ARROW_SRCS device_allocation_type_set.cc extension_type.cc extension/bool8.cc + extension/json.cc extension/uuid.cc pretty_print.cc record_batch.cc diff --git a/cpp/src/arrow/array/validate.cc b/cpp/src/arrow/array/validate.cc index 0d940d3bc869e..69f1646054f4c 100644 --- a/cpp/src/arrow/array/validate.cc +++ b/cpp/src/arrow/array/validate.cc @@ -985,10 +985,22 @@ Status ValidateArrayFull(const Array& array) { return ValidateArrayFull(*array.d ARROW_EXPORT Status ValidateUTF8(const ArrayData& data) { - DCHECK(data.type->id() == Type::STRING || data.type->id() == Type::STRING_VIEW || - data.type->id() == Type::LARGE_STRING); - UTF8DataValidator validator{data}; - return VisitTypeInline(*data.type, &validator); + const auto& storage_type = + (data.type->id() == Type::EXTENSION) + ? checked_cast(*data.type).storage_type() + : data.type; + DCHECK(storage_type->id() == Type::STRING || storage_type->id() == Type::STRING_VIEW || + storage_type->id() == Type::LARGE_STRING); + + if (data.type->id() == Type::EXTENSION) { + ArrayData ext_data(data); + ext_data.type = storage_type; + UTF8DataValidator validator{ext_data}; + return VisitTypeInline(*storage_type, &validator); + } else { + UTF8DataValidator validator{data}; + return VisitTypeInline(*storage_type, &validator); + } } ARROW_EXPORT diff --git a/cpp/src/arrow/extension/CMakeLists.txt b/cpp/src/arrow/extension/CMakeLists.txt index 065ea3f1ddb16..4ab6a35b52e4f 100644 --- a/cpp/src/arrow/extension/CMakeLists.txt +++ b/cpp/src/arrow/extension/CMakeLists.txt @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. 
-set(CANONICAL_EXTENSION_TESTS bool8_test.cc uuid_test.cc) +set(CANONICAL_EXTENSION_TESTS bool8_test.cc json_test.cc uuid_test.cc) if(ARROW_JSON) list(APPEND CANONICAL_EXTENSION_TESTS fixed_shape_tensor_test.cc opaque_test.cc) diff --git a/cpp/src/arrow/extension/fixed_shape_tensor_test.cc b/cpp/src/arrow/extension/fixed_shape_tensor_test.cc index 842a78e1a4f7a..51aea4b25fdda 100644 --- a/cpp/src/arrow/extension/fixed_shape_tensor_test.cc +++ b/cpp/src/arrow/extension/fixed_shape_tensor_test.cc @@ -205,7 +205,7 @@ TEST_F(TestExtensionType, RoundtripBatch) { std::shared_ptr read_batch; auto ext_field = field(/*name=*/"f0", /*type=*/ext_type_); auto batch = RecordBatch::Make(schema({ext_field}), ext_arr->length(), {ext_arr}); - RoundtripBatch(batch, &read_batch); + ASSERT_OK(RoundtripBatch(batch, &read_batch)); CompareBatch(*batch, *read_batch, /*compare_metadata=*/true); // Pass extension metadata and storage array, expect getting back extension array @@ -216,7 +216,7 @@ TEST_F(TestExtensionType, RoundtripBatch) { ext_field = field(/*name=*/"f0", /*type=*/element_type_, /*nullable=*/true, /*metadata=*/ext_metadata); auto batch2 = RecordBatch::Make(schema({ext_field}), fsla_arr->length(), {fsla_arr}); - RoundtripBatch(batch2, &read_batch2); + ASSERT_OK(RoundtripBatch(batch2, &read_batch2)); CompareBatch(*batch, *read_batch2, /*compare_metadata=*/true); } @@ -469,7 +469,7 @@ TEST_F(TestExtensionType, RoundtripBatchFromTensor) { auto ext_field = field("f0", ext_type_, true, ext_metadata); auto batch = RecordBatch::Make(schema({ext_field}), ext_arr->length(), {ext_arr}); std::shared_ptr read_batch; - RoundtripBatch(batch, &read_batch); + ASSERT_OK(RoundtripBatch(batch, &read_batch)); CompareBatch(*batch, *read_batch, /*compare_metadata=*/true); } diff --git a/cpp/src/arrow/extension/json.cc b/cpp/src/arrow/extension/json.cc new file mode 100644 index 0000000000000..d793233c2b573 --- /dev/null +++ b/cpp/src/arrow/extension/json.cc @@ -0,0 +1,61 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "arrow/extension/json.h" + +#include + +#include "arrow/extension_type.h" +#include "arrow/result.h" +#include "arrow/status.h" +#include "arrow/type_fwd.h" +#include "arrow/util/logging.h" + +namespace arrow::extension { + +bool JsonExtensionType::ExtensionEquals(const ExtensionType& other) const { + return other.extension_name() == this->extension_name(); +} + +Result> JsonExtensionType::Deserialize( + std::shared_ptr storage_type, const std::string& serialized) const { + if (storage_type->id() != Type::STRING && storage_type->id() != Type::STRING_VIEW && + storage_type->id() != Type::LARGE_STRING) { + return Status::Invalid("Invalid storage type for JsonExtensionType: ", + storage_type->ToString()); + } + return std::make_shared(storage_type); +} + +std::string JsonExtensionType::Serialize() const { return ""; } + +std::shared_ptr JsonExtensionType::MakeArray( + std::shared_ptr data) const { + DCHECK_EQ(data->type->id(), Type::EXTENSION); + DCHECK_EQ("arrow.json", + internal::checked_cast(*data->type).extension_name()); + return std::make_shared(data); +} + +std::shared_ptr json(const std::shared_ptr storage_type) { + ARROW_CHECK(storage_type->id() != Type::STRING || + storage_type->id() != Type::STRING_VIEW || + storage_type->id() != Type::LARGE_STRING); + return std::make_shared(storage_type); +} + +} // namespace arrow::extension diff --git a/cpp/src/arrow/extension/json.h b/cpp/src/arrow/extension/json.h new file mode 100644 index 0000000000000..4793ab2bc9b36 --- /dev/null +++ b/cpp/src/arrow/extension/json.h @@ -0,0 +1,56 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include + +#include "arrow/extension_type.h" +#include "arrow/result.h" +#include "arrow/type_fwd.h" +#include "arrow/util/visibility.h" + +namespace arrow::extension { + +/// \brief Concrete type class for variable-size JSON data, utf8-encoded. +class ARROW_EXPORT JsonExtensionType : public ExtensionType { + public: + explicit JsonExtensionType(const std::shared_ptr& storage_type) + : ExtensionType(storage_type), storage_type_(storage_type) {} + + std::string extension_name() const override { return "arrow.json"; } + + bool ExtensionEquals(const ExtensionType& other) const override; + + Result> Deserialize( + std::shared_ptr storage_type, + const std::string& serialized_data) const override; + + std::string Serialize() const override; + + std::shared_ptr MakeArray(std::shared_ptr data) const override; + + private: + std::shared_ptr storage_type_; +}; + +/// \brief Return a JsonExtensionType instance. 
+ARROW_EXPORT std::shared_ptr json( + std::shared_ptr storage_type = utf8()); + +} // namespace arrow::extension diff --git a/cpp/src/arrow/extension/json_test.cc b/cpp/src/arrow/extension/json_test.cc new file mode 100644 index 0000000000000..143e4f9ceeac7 --- /dev/null +++ b/cpp/src/arrow/extension/json_test.cc @@ -0,0 +1,83 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/extension/json.h" + +#include "arrow/array/validate.h" +#include "arrow/ipc/test_common.h" +#include "arrow/record_batch.h" +#include "arrow/testing/gtest_util.h" +#include "parquet/exception.h" + +namespace arrow { + +using arrow::ipc::test::RoundtripBatch; +using extension::json; + +class TestJsonExtensionType : public ::testing::Test {}; + +std::shared_ptr ExampleJson(const std::shared_ptr& storage_type) { + std::shared_ptr arr = ArrayFromJSON(storage_type, R"([ + "null", + "1234", + "3.14159", + "true", + "false", + "\"a json string\"", + "[\"a\", \"json\", \"array\"]", + "{\"obj\": \"a simple json object\"}" + ])"); + return ExtensionType::WrapArray(arrow::extension::json(storage_type), arr); +} + +TEST_F(TestJsonExtensionType, JsonRoundtrip) { + for (const auto& storage_type : {utf8(), large_utf8(), utf8_view()}) { + std::shared_ptr ext_arr = ExampleJson(storage_type); + auto batch = + RecordBatch::Make(schema({field("f0", json(storage_type))}), 8, {ext_arr}); + + std::shared_ptr read_batch; + ASSERT_OK(RoundtripBatch(batch, &read_batch)); + ASSERT_OK(read_batch->ValidateFull()); + CompareBatch(*batch, *read_batch, /*compare_metadata*/ true); + + auto read_ext_arr = read_batch->column(0); + ASSERT_OK(internal::ValidateUTF8(*read_ext_arr)); + ASSERT_OK(read_ext_arr->ValidateFull()); + } +} + +TEST_F(TestJsonExtensionType, InvalidUTF8) { + for (const auto& storage_type : {utf8(), large_utf8(), utf8_view()}) { + auto json_type = json(storage_type); + auto invalid_input = ArrayFromJSON(storage_type, "[\"Ⱥa\xFFⱭ\", \"Ɽ\xe1\xbdⱤaA\"]"); + auto ext_arr = ExtensionType::WrapArray(json_type, invalid_input); + + ASSERT_RAISES_WITH_MESSAGE(Invalid, + "Invalid: Invalid UTF8 sequence at string index 0", + ext_arr->ValidateFull()); + ASSERT_RAISES_WITH_MESSAGE(Invalid, + "Invalid: Invalid UTF8 sequence at string index 0", + arrow::internal::ValidateUTF8(*ext_arr)); + + auto batch = RecordBatch::Make(schema({field("f0", json_type)}), 2, {ext_arr}); + std::shared_ptr read_batch; + ASSERT_OK(RoundtripBatch(batch, &read_batch)); + } +} + +} // namespace arrow diff --git a/cpp/src/arrow/extension/uuid_test.cc b/cpp/src/arrow/extension/uuid_test.cc index 3bbb6eeb4aef1..1c1ffb6eb8e15 100644 --- a/cpp/src/arrow/extension/uuid_test.cc +++ b/cpp/src/arrow/extension/uuid_test.cc @@ -54,7 +54,7 @@ TEST(TestUuuidExtensionType, RoundtripBatch) { std::shared_ptr 
diff --git a/cpp/src/arrow/extension/uuid_test.cc b/cpp/src/arrow/extension/uuid_test.cc
index 3bbb6eeb4aef1..1c1ffb6eb8e15 100644
--- a/cpp/src/arrow/extension/uuid_test.cc
+++ b/cpp/src/arrow/extension/uuid_test.cc
@@ -54,7 +54,7 @@ TEST(TestUuuidExtensionType, RoundtripBatch) {
   std::shared_ptr<RecordBatch> read_batch;
   auto ext_field = field(/*name=*/"f0", /*type=*/ext_type);
   auto batch = RecordBatch::Make(schema({ext_field}), ext_arr->length(), {ext_arr});
-  RoundtripBatch(batch, &read_batch);
+  ASSERT_OK(RoundtripBatch(batch, &read_batch));
   CompareBatch(*batch, *read_batch, /*compare_metadata=*/true);
 
   // Pass extension metadata and storage array, expect getting back extension array
@@ -65,7 +65,7 @@ TEST(TestUuuidExtensionType, RoundtripBatch) {
   ext_field = field(/*name=*/"f0", /*type=*/exact_ext_type->storage_type(),
                     /*nullable=*/true, /*metadata=*/ext_metadata);
   auto batch2 = RecordBatch::Make(schema({ext_field}), arr->length(), {arr});
-  RoundtripBatch(batch2, &read_batch2);
+  ASSERT_OK(RoundtripBatch(batch2, &read_batch2));
   CompareBatch(*batch, *read_batch2, /*compare_metadata=*/true);
 }
 
diff --git a/cpp/src/arrow/extension_type.cc b/cpp/src/arrow/extension_type.cc
index d0135e905a0c3..7ad39eab23f8d 100644
--- a/cpp/src/arrow/extension_type.cc
+++ b/cpp/src/arrow/extension_type.cc
@@ -32,6 +32,7 @@
 #  include "arrow/extension/fixed_shape_tensor.h"
 #  include "arrow/extension/opaque.h"
 #endif
+#include "arrow/extension/json.h"
 #include "arrow/extension/uuid.h"
 #include "arrow/status.h"
 #include "arrow/type.h"
@@ -148,7 +149,8 @@ static void CreateGlobalRegistry() {
   // Register canonical extension types
   g_registry = std::make_shared<ExtensionTypeRegistryImpl>();
-  std::vector<std::shared_ptr<DataType>> ext_types{extension::bool8(), extension::uuid()};
+  std::vector<std::shared_ptr<DataType>> ext_types{extension::bool8(), extension::json(),
+                                                   extension::uuid()};
 
 #ifdef ARROW_JSON
   ext_types.push_back(extension::fixed_shape_tensor(int64(), {}));
diff --git a/cpp/src/arrow/extension_type_test.cc b/cpp/src/arrow/extension_type_test.cc
index f49ffc5cba553..029d833b98cd8 100644
--- a/cpp/src/arrow/extension_type_test.cc
+++ b/cpp/src/arrow/extension_type_test.cc
@@ -219,14 +219,14 @@ TEST_F(TestExtensionType, IpcRoundtrip) {
   auto batch = RecordBatch::Make(schema({field("f0", uuid())}), 4, {ext_arr});
 
   std::shared_ptr<RecordBatch> read_batch;
-  RoundtripBatch(batch, &read_batch);
+  ASSERT_OK(RoundtripBatch(batch, &read_batch));
   CompareBatch(*batch, *read_batch, false /* compare_metadata */);
 
   // Wrap type in a ListArray and ensure it also makes it
   auto offsets_arr = ArrayFromJSON(int32(), "[0, 0, 2, 4]");
   ASSERT_OK_AND_ASSIGN(auto list_arr, ListArray::FromArrays(*offsets_arr, *ext_arr));
   batch = RecordBatch::Make(schema({field("f0", list(uuid()))}), 3, {list_arr});
-  RoundtripBatch(batch, &read_batch);
+  ASSERT_OK(RoundtripBatch(batch, &read_batch));
   CompareBatch(*batch, *read_batch, false /* compare_metadata */);
 }
 
@@ -289,7 +289,7 @@ TEST_F(TestExtensionType, ParametricTypes) {
                                 4, {p1, p2, p3, p4});
 
   std::shared_ptr<RecordBatch> read_batch;
-  RoundtripBatch(batch, &read_batch);
+  ASSERT_OK(RoundtripBatch(batch, &read_batch));
   CompareBatch(*batch, *read_batch, false /* compare_metadata */);
 }
 
diff --git a/cpp/src/arrow/ipc/test_common.cc b/cpp/src/arrow/ipc/test_common.cc
index fb4f6bd8eadcf..e354e2f89b3b3 100644
--- a/cpp/src/arrow/ipc/test_common.cc
+++ b/cpp/src/arrow/ipc/test_common.cc
@@ -1236,18 +1236,19 @@ Status MakeRandomTensor(const std::shared_ptr<DataType>& type,
   return Tensor::Make(type, buf, shape, strides).Value(out);
 }
 
-void RoundtripBatch(const std::shared_ptr<RecordBatch>& batch,
-                    std::shared_ptr<RecordBatch>* out) {
-  ASSERT_OK_AND_ASSIGN(auto out_stream, io::BufferOutputStream::Create());
-  ASSERT_OK(ipc::WriteRecordBatchStream({batch}, ipc::IpcWriteOptions::Defaults(),
-                                        out_stream.get()));
+Status RoundtripBatch(const std::shared_ptr<RecordBatch>& batch,
+                      std::shared_ptr<RecordBatch>* out) {
+  ARROW_ASSIGN_OR_RAISE(auto out_stream,
+                        io::BufferOutputStream::Create());
+  RETURN_NOT_OK(ipc::WriteRecordBatchStream({batch}, ipc::IpcWriteOptions::Defaults(),
+                                            out_stream.get()));
 
-  ASSERT_OK_AND_ASSIGN(auto complete_ipc_stream, out_stream->Finish());
+  ARROW_ASSIGN_OR_RAISE(auto complete_ipc_stream, out_stream->Finish());
 
   io::BufferReader reader(complete_ipc_stream);
   std::shared_ptr<RecordBatchReader> batch_reader;
-  ASSERT_OK_AND_ASSIGN(batch_reader, ipc::RecordBatchStreamReader::Open(&reader));
-  ASSERT_OK(batch_reader->ReadNext(out));
+  ARROW_ASSIGN_OR_RAISE(batch_reader, ipc::RecordBatchStreamReader::Open(&reader));
+  RETURN_NOT_OK(batch_reader->ReadNext(out));
+  return Status::OK();
 }
 
 }  // namespace test
diff --git a/cpp/src/arrow/ipc/test_common.h b/cpp/src/arrow/ipc/test_common.h
index 9b7e7f13e3a8e..189de288795c0 100644
--- a/cpp/src/arrow/ipc/test_common.h
+++ b/cpp/src/arrow/ipc/test_common.h
@@ -184,8 +184,8 @@ Status MakeRandomTensor(const std::shared_ptr<DataType>& type,
                         const std::vector<int64_t>& shape, bool row_major_p,
                         std::shared_ptr<Tensor>* out, uint32_t seed = 0);
 
-ARROW_TESTING_EXPORT void RoundtripBatch(const std::shared_ptr<RecordBatch>& batch,
-                                         std::shared_ptr<RecordBatch>* out);
+ARROW_TESTING_EXPORT Status RoundtripBatch(const std::shared_ptr<RecordBatch>& batch,
+                                           std::shared_ptr<RecordBatch>* out);
 
 }  // namespace test
 }  // namespace ipc
diff --git a/cpp/src/arrow/testing/gtest_util.cc b/cpp/src/arrow/testing/gtest_util.cc
index c4a7f363c71bc..07d15826f2c8f 100644
--- a/cpp/src/arrow/testing/gtest_util.cc
+++ b/cpp/src/arrow/testing/gtest_util.cc
@@ -49,6 +49,7 @@
 #include "arrow/buffer.h"
 #include "arrow/compute/api_vector.h"
 #include "arrow/datum.h"
+#include "arrow/extension/json.h"
 #include "arrow/io/memory.h"
 #include "arrow/ipc/json_simple.h"
 #include "arrow/ipc/reader.h"
diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
index 724e6c44f2ed0..5d990a5c6bd4a 100644
--- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
+++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
@@ -37,6 +37,7 @@
 #include "arrow/array/builder_primitive.h"
 #include "arrow/chunked_array.h"
 #include "arrow/compute/api.h"
+#include "arrow/extension/json.h"
 #include "arrow/io/api.h"
 #include "arrow/record_batch.h"
 #include "arrow/scalar.h"
@@ -618,10 +619,15 @@ class ParquetIOTestBase : public ::testing::Test {
     return ParquetFileWriter::Open(sink_, schema);
   }
 
-  void ReaderFromSink(std::unique_ptr<FileReader>* out) {
+  void ReaderFromSink(
+      std::unique_ptr<FileReader>* out,
+      const ArrowReaderProperties& properties = default_arrow_reader_properties()) {
     ASSERT_OK_AND_ASSIGN(auto buffer, sink_->Finish());
-    ASSERT_OK_NO_THROW(OpenFile(std::make_shared<BufferReader>(buffer),
-                                ::arrow::default_memory_pool(), out));
+    FileReaderBuilder builder;
+    ASSERT_OK_NO_THROW(builder.Open(std::make_shared<BufferReader>(buffer)));
+    ASSERT_OK_NO_THROW(builder.memory_pool(::arrow::default_memory_pool())
+                           ->properties(properties)
+                           ->Build(out));
   }
 
   void ReadSingleColumnFile(std::unique_ptr<FileReader> file_reader,
@@ -670,6 +676,7 @@ class ParquetIOTestBase : public ::testing::Test {
   void RoundTripSingleColumn(
       const std::shared_ptr<Array>& values, const std::shared_ptr<Array>& expected,
       const std::shared_ptr<::parquet::ArrowWriterProperties>& arrow_properties,
+      const ArrowReaderProperties& reader_properties = default_arrow_reader_properties(),
       bool nullable = true) {
     std::shared_ptr<Table> table = MakeSimpleTable(values, nullable);
     this->ResetSink();
@@ -679,7 +686,7 @@ class ParquetIOTestBase : public ::testing::Test {
     std::shared_ptr<Table> out;
     std::unique_ptr<FileReader> reader;
-    ASSERT_NO_FATAL_FAILURE(this->ReaderFromSink(&reader));
+    ASSERT_NO_FATAL_FAILURE(this->ReaderFromSink(&reader, reader_properties));
     const bool expect_metadata = arrow_properties->store_schema();
     ASSERT_NO_FATAL_FAILURE(
         this->ReadTableFromFile(std::move(reader), expect_metadata, &out));
@@ -1428,6 +1435,52 @@ TEST_F(TestLargeStringParquetIO, Basics) {
   this->RoundTripSingleColumn(large_array, large_array, arrow_properties);
 }
 
+using TestJsonParquetIO = TestParquetIO<::arrow::extension::JsonExtensionType>;
+
+TEST_F(TestJsonParquetIO, JsonExtension) {
+  const char* json = R"([
+    "null",
+    "1234",
+    "3.14159",
+    "true",
+    "false",
+    "\"a json string\"",
+    "[\"a\", \"json\", \"array\"]",
+    "{\"obj\": \"a simple json object\"}"
+  ])";
+
+  const auto json_type = ::arrow::extension::json();
+  const auto string_array = ::arrow::ArrayFromJSON(::arrow::utf8(), json);
+  const auto json_array = ::arrow::ExtensionType::WrapArray(json_type, string_array);
+
+  const auto json_large_type = ::arrow::extension::json(::arrow::large_utf8());
+  const auto large_string_array = ::arrow::ArrayFromJSON(::arrow::large_utf8(), json);
+  const auto json_large_array =
+      ::arrow::ExtensionType::WrapArray(json_large_type, large_string_array);
+
+  // When the original Arrow schema isn't stored and Arrow extensions are disabled,
+  // LogicalType::JSON is read as utf8.
+  this->RoundTripSingleColumn(json_array, string_array,
+                              default_arrow_writer_properties());
+  this->RoundTripSingleColumn(json_large_array, string_array,
+                              default_arrow_writer_properties());
+
+  // When the original Arrow schema isn't stored and Arrow extensions are enabled,
+  // LogicalType::JSON is read as JsonExtensionType with utf8 storage.
+  ::parquet::ArrowReaderProperties reader_properties;
+  reader_properties.set_arrow_extensions_enabled(true);
+  this->RoundTripSingleColumn(json_array, json_array, default_arrow_writer_properties(),
+                              reader_properties);
+  this->RoundTripSingleColumn(json_large_array, json_array,
+                              default_arrow_writer_properties(), reader_properties);
+
+  // When the original Arrow schema is stored, the stored Arrow type is respected.
+  const auto writer_properties =
+      ::parquet::ArrowWriterProperties::Builder().store_schema()->build();
+  this->RoundTripSingleColumn(json_array, json_array, writer_properties);
+  this->RoundTripSingleColumn(json_large_array, json_large_array, writer_properties);
+}
+
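For downstream readers, here is a hedged sketch of opting in to this behavior outside the test harness; the file handle argument is a placeholder and the builder usage mirrors `ReaderFromSink` above:

```cpp
#include <memory>

#include "arrow/io/api.h"
#include "arrow/status.h"
#include "arrow/table.h"
#include "parquet/arrow/reader.h"
#include "parquet/properties.h"

// Open a Parquet file with Arrow extensions enabled, so that columns
// annotated with LogicalType::JSON are materialized as arrow.json
// extension columns rather than plain utf8.
arrow::Status ReadWithJsonExtension(std::shared_ptr<arrow::io::RandomAccessFile> input,
                                    std::shared_ptr<arrow::Table>* out) {
  parquet::ArrowReaderProperties properties;
  properties.set_arrow_extensions_enabled(true);

  parquet::arrow::FileReaderBuilder builder;
  ARROW_RETURN_NOT_OK(builder.Open(std::move(input)));

  std::unique_ptr<parquet::arrow::FileReader> reader;
  ARROW_RETURN_NOT_OK(builder.properties(properties)->Build(&reader));
  return reader->ReadTable(out);
}
```

The opt-in flag keeps the default read path unchanged; only readers that explicitly enable extensions see the new type mapping.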
 using TestNullParquetIO = TestParquetIO<::arrow::NullType>;
 
 TEST_F(TestNullParquetIO, NullColumn) {
diff --git a/cpp/src/parquet/arrow/arrow_schema_test.cc b/cpp/src/parquet/arrow/arrow_schema_test.cc
index 9f60cd31d3541..31ead461aa6e2 100644
--- a/cpp/src/parquet/arrow/arrow_schema_test.cc
+++ b/cpp/src/parquet/arrow/arrow_schema_test.cc
@@ -31,8 +31,11 @@
 #include "parquet/thrift_internal.h"
 
 #include "arrow/array.h"
+#include "arrow/extension/json.h"
+#include "arrow/ipc/writer.h"
 #include "arrow/testing/gtest_util.h"
 #include "arrow/type.h"
+#include "arrow/util/base64.h"
 #include "arrow/util/key_value_metadata.h"
 
 using arrow::Field;
@@ -76,17 +79,17 @@ class TestConvertParquetSchema : public ::testing::Test {
       auto result_field = result_schema_->field(i);
       auto expected_field = expected_schema->field(i);
       EXPECT_TRUE(result_field->Equals(expected_field, check_metadata))
-          << "Field " << i << "\n result: " << result_field->ToString()
-          << "\n expected: " << expected_field->ToString();
+          << "Field " << i << "\n result: " << result_field->ToString(check_metadata)
+          << "\n expected: " << expected_field->ToString(check_metadata);
     }
   }
 
   ::arrow::Status ConvertSchema(
       const std::vector<NodePtr>& nodes,
-      const std::shared_ptr<const KeyValueMetadata>& key_value_metadata = nullptr) {
+      const std::shared_ptr<const KeyValueMetadata>& key_value_metadata = nullptr,
+      ArrowReaderProperties props = ArrowReaderProperties()) {
     NodePtr schema = GroupNode::Make("schema", Repetition::REPEATED, nodes);
     descr_.Init(schema);
-    ArrowReaderProperties props;
     return FromParquetSchema(&descr_, props, key_value_metadata, &result_schema_);
   }
 
@@ -230,7 +233,7 @@ TEST_F(TestConvertParquetSchema, ParquetAnnotatedFields) {
        ::arrow::uint64()},
       {"int(64, true)", LogicalType::Int(64, true), ParquetType::INT64, -1,
        ::arrow::int64()},
-      {"json", LogicalType::JSON(), ParquetType::BYTE_ARRAY, -1, ::arrow::binary()},
+      {"json", LogicalType::JSON(), ParquetType::BYTE_ARRAY, -1, ::arrow::utf8()},
       {"bson", LogicalType::BSON(), ParquetType::BYTE_ARRAY, -1, ::arrow::binary()},
       {"interval", LogicalType::Interval(), ParquetType::FIXED_LEN_BYTE_ARRAY, 12,
        ::arrow::fixed_size_binary(12)},
@@ -724,6 +727,87 @@ TEST_F(TestConvertParquetSchema, ParquetRepeatedNestedSchema) {
   ASSERT_NO_FATAL_FAILURE(CheckFlatSchema(arrow_schema));
 }
 
+Status ArrowSchemaToParquetMetadata(std::shared_ptr<::arrow::Schema>& arrow_schema,
+                                    std::shared_ptr<KeyValueMetadata>& metadata) {
+  ARROW_ASSIGN_OR_RAISE(
+      std::shared_ptr<Buffer> serialized,
+      ::arrow::ipc::SerializeSchema(*arrow_schema, ::arrow::default_memory_pool()));
+  std::string schema_as_string = serialized->ToString();
+  std::string schema_base64 = ::arrow::util::base64_encode(schema_as_string);
+  metadata = ::arrow::key_value_metadata({"ARROW:schema"}, {schema_base64});
+  return Status::OK();
+}
+
+TEST_F(TestConvertParquetSchema, ParquetSchemaArrowExtensions) {
+  std::vector<NodePtr> parquet_fields;
+  parquet_fields.push_back(PrimitiveNode::Make(
+      "json_1", Repetition::OPTIONAL, ParquetType::BYTE_ARRAY, ConvertedType::JSON));
+  parquet_fields.push_back(PrimitiveNode::Make(
+      "json_2", Repetition::OPTIONAL, ParquetType::BYTE_ARRAY, ConvertedType::JSON));
+
+  {
+    // Parquet file does not contain Arrow schema.
+    // By default, both fields should be treated as utf8() fields in Arrow.
+ auto arrow_schema = ::arrow::schema( + {::arrow::field("json_1", UTF8, true), ::arrow::field("json_2", UTF8, true)}); + std::shared_ptr metadata{}; + ASSERT_OK(ConvertSchema(parquet_fields, metadata)); + CheckFlatSchema(arrow_schema); + } + + { + // Parquet file does not contain Arrow schema. + // If Arrow extensions are enabled, both fields should be treated as json() extension + // fields. + ArrowReaderProperties props; + props.set_arrow_extensions_enabled(true); + auto arrow_schema = ::arrow::schema( + {::arrow::field("json_1", ::arrow::extension::json(), true), + ::arrow::field("json_2", ::arrow::extension::json(::arrow::large_utf8()), + true)}); + std::shared_ptr metadata{}; + ASSERT_OK(ConvertSchema(parquet_fields, metadata, props)); + CheckFlatSchema(arrow_schema); + } + + { + // Parquet file contains Arrow schema. + // Both json_1 and json_2 should be returned as a json() field + // even though extensions are not enabled. + ArrowReaderProperties props; + props.set_arrow_extensions_enabled(false); + std::shared_ptr field_metadata = + ::arrow::key_value_metadata({"foo", "bar"}, {"biz", "baz"}); + auto arrow_schema = ::arrow::schema( + {::arrow::field("json_1", ::arrow::extension::json(), true, field_metadata), + ::arrow::field("json_2", ::arrow::extension::json(::arrow::large_utf8()), + true)}); + + std::shared_ptr metadata; + ASSERT_OK(ArrowSchemaToParquetMetadata(arrow_schema, metadata)); + ASSERT_OK(ConvertSchema(parquet_fields, metadata, props)); + CheckFlatSchema(arrow_schema, true /* check_metadata */); + } + + { + // Parquet file contains Arrow schema. Extensions are enabled. + // Both json_1 and json_2 should be returned as a json() field + ArrowReaderProperties props; + props.set_arrow_extensions_enabled(true); + std::shared_ptr field_metadata = + ::arrow::key_value_metadata({"foo", "bar"}, {"biz", "baz"}); + auto arrow_schema = ::arrow::schema( + {::arrow::field("json_1", ::arrow::extension::json(), true, field_metadata), + ::arrow::field("json_2", ::arrow::extension::json(::arrow::large_utf8()), + true)}); + + std::shared_ptr metadata; + ASSERT_OK(ArrowSchemaToParquetMetadata(arrow_schema, metadata)); + ASSERT_OK(ConvertSchema(parquet_fields, metadata, props)); + CheckFlatSchema(arrow_schema, true /* check_metadata */); + } +} + class TestConvertArrowSchema : public ::testing::Test { public: virtual void SetUp() {} diff --git a/cpp/src/parquet/arrow/schema.cc b/cpp/src/parquet/arrow/schema.cc index ec3890a41f442..1623d80dcb0e4 100644 --- a/cpp/src/parquet/arrow/schema.cc +++ b/cpp/src/parquet/arrow/schema.cc @@ -21,6 +21,7 @@ #include #include +#include "arrow/extension/json.h" #include "arrow/extension_type.h" #include "arrow/io/memory.h" #include "arrow/ipc/api.h" @@ -427,6 +428,13 @@ Status FieldToNode(const std::string& name, const std::shared_ptr& field, } case ArrowTypeId::EXTENSION: { auto ext_type = std::static_pointer_cast<::arrow::ExtensionType>(field->type()); + // Built-in JSON extension is handled differently. + if (ext_type->extension_name() == std::string("arrow.json")) { + // Set physical and logical types and instantiate primitive node. 
+ type = ParquetType::BYTE_ARRAY; + logical_type = LogicalType::JSON(); + break; + } std::shared_ptr<::arrow::Field> storage_field = ::arrow::field( name, ext_type->storage_type(), field->nullable(), field->metadata()); return FieldToNode(name, storage_field, properties, arrow_properties, out); @@ -438,7 +446,7 @@ Status FieldToNode(const std::string& name, const std::shared_ptr& field, } default: { - // TODO: DENSE_UNION, SPARE_UNION, JSON_SCALAR, DECIMAL_TEXT, VARCHAR + // TODO: DENSE_UNION, SPARE_UNION, DECIMAL_TEXT, VARCHAR return Status::NotImplemented( "Unhandled type for Arrow to Parquet schema conversion: ", field->type()->ToString()); @@ -476,9 +484,8 @@ bool IsDictionaryReadSupported(const ArrowType& type) { ::arrow::Result> GetTypeForNode( int column_index, const schema::PrimitiveNode& primitive_node, SchemaTreeContext* ctx) { - ASSIGN_OR_RAISE( - std::shared_ptr storage_type, - GetArrowType(primitive_node, ctx->properties.coerce_int96_timestamp_unit())); + ASSIGN_OR_RAISE(std::shared_ptr storage_type, + GetArrowType(primitive_node, ctx->properties)); if (ctx->properties.read_dictionary(column_index) && IsDictionaryReadSupported(*storage_type)) { return ::arrow::dictionary(::arrow::int32(), storage_type); @@ -984,18 +991,35 @@ Result ApplyOriginalMetadata(const Field& origin_field, SchemaField* infer bool modified = false; auto& origin_type = origin_field.type(); + const auto& inferred_type = inferred->field->type(); if (origin_type->id() == ::arrow::Type::EXTENSION) { const auto& ex_type = checked_cast(*origin_type); - auto origin_storage_field = origin_field.WithType(ex_type.storage_type()); + if (inferred_type->id() != ::arrow::Type::EXTENSION && + ex_type.extension_name() == std::string("arrow.json") && + (inferred_type->id() == ::arrow::Type::STRING || + inferred_type->id() == ::arrow::Type::LARGE_STRING || + inferred_type->id() == ::arrow::Type::STRING_VIEW)) { + // Schema mismatch. + // + // Arrow extensions are DISABLED in Parquet. + // origin_type is ::arrow::extension::json() + // inferred_type is ::arrow::utf8() + // + // Origin type is restored as Arrow should be considered the source of truth. 
+ inferred->field = inferred->field->WithType(origin_type); + RETURN_NOT_OK(ApplyOriginalStorageMetadata(origin_field, inferred)); + } else { + auto origin_storage_field = origin_field.WithType(ex_type.storage_type()); - // Apply metadata recursively to storage type - RETURN_NOT_OK(ApplyOriginalStorageMetadata(*origin_storage_field, inferred)); + // Apply metadata recursively to storage type + RETURN_NOT_OK(ApplyOriginalStorageMetadata(*origin_storage_field, inferred)); - // Restore extension type, if the storage type is the same as inferred - // from the Parquet type - if (ex_type.storage_type()->Equals(*inferred->field->type())) { - inferred->field = inferred->field->WithType(origin_type); + // Restore extension type, if the storage type is the same as inferred + // from the Parquet type + if (ex_type.storage_type()->Equals(*inferred->field->type())) { + inferred->field = inferred->field->WithType(origin_type); + } } modified = true; } else { diff --git a/cpp/src/parquet/arrow/schema_internal.cc b/cpp/src/parquet/arrow/schema_internal.cc index a8e2a95b9b97d..261a00940654d 100644 --- a/cpp/src/parquet/arrow/schema_internal.cc +++ b/cpp/src/parquet/arrow/schema_internal.cc @@ -17,8 +17,11 @@ #include "parquet/arrow/schema_internal.h" +#include "arrow/extension/json.h" #include "arrow/type.h" +#include "parquet/properties.h" + using ArrowType = ::arrow::DataType; using ArrowTypeId = ::arrow::Type; using ParquetType = parquet::Type; @@ -107,7 +110,8 @@ Result> MakeArrowTimestamp(const LogicalType& logical } } -Result> FromByteArray(const LogicalType& logical_type) { +Result> FromByteArray( + const LogicalType& logical_type, const ArrowReaderProperties& reader_properties) { switch (logical_type.type()) { case LogicalType::Type::STRING: return ::arrow::utf8(); @@ -115,9 +119,15 @@ Result> FromByteArray(const LogicalType& logical_type return MakeArrowDecimal(logical_type); case LogicalType::Type::NONE: case LogicalType::Type::ENUM: - case LogicalType::Type::JSON: case LogicalType::Type::BSON: return ::arrow::binary(); + case LogicalType::Type::JSON: + if (reader_properties.get_arrow_extensions_enabled()) { + return ::arrow::extension::json(::arrow::utf8()); + } + // When the original Arrow schema isn't stored and Arrow extensions are disabled, + // LogicalType::JSON is read as utf8(). 
+ return ::arrow::utf8(); default: return Status::NotImplemented("Unhandled logical logical_type ", logical_type.ToString(), " for binary array"); @@ -180,7 +190,7 @@ Result> FromInt64(const LogicalType& logical_type) { Result> GetArrowType( Type::type physical_type, const LogicalType& logical_type, int type_length, - const ::arrow::TimeUnit::type int96_arrow_time_unit) { + const ArrowReaderProperties& reader_properties) { if (logical_type.is_invalid() || logical_type.is_null()) { return ::arrow::null(); } @@ -193,13 +203,13 @@ Result> GetArrowType( case ParquetType::INT64: return FromInt64(logical_type); case ParquetType::INT96: - return ::arrow::timestamp(int96_arrow_time_unit); + return ::arrow::timestamp(reader_properties.coerce_int96_timestamp_unit()); case ParquetType::FLOAT: return ::arrow::float32(); case ParquetType::DOUBLE: return ::arrow::float64(); case ParquetType::BYTE_ARRAY: - return FromByteArray(logical_type); + return FromByteArray(logical_type, reader_properties); case ParquetType::FIXED_LEN_BYTE_ARRAY: return FromFLBA(logical_type, type_length); default: { @@ -212,9 +222,9 @@ Result> GetArrowType( Result> GetArrowType( const schema::PrimitiveNode& primitive, - const ::arrow::TimeUnit::type int96_arrow_time_unit) { + const ArrowReaderProperties& reader_properties) { return GetArrowType(primitive.physical_type(), *primitive.logical_type(), - primitive.type_length(), int96_arrow_time_unit); + primitive.type_length(), reader_properties); } } // namespace parquet::arrow diff --git a/cpp/src/parquet/arrow/schema_internal.h b/cpp/src/parquet/arrow/schema_internal.h index f56ba0958ae2d..58828f85ab8e3 100644 --- a/cpp/src/parquet/arrow/schema_internal.h +++ b/cpp/src/parquet/arrow/schema_internal.h @@ -18,6 +18,7 @@ #pragma once #include "arrow/result.h" +#include "arrow/type_fwd.h" #include "parquet/schema.h" namespace arrow { @@ -28,7 +29,8 @@ namespace parquet::arrow { using ::arrow::Result; -Result> FromByteArray(const LogicalType& logical_type); +Result> FromByteArray(const LogicalType& logical_type, + bool use_known_arrow_extensions); Result> FromFLBA(const LogicalType& logical_type, int32_t physical_length); Result> FromInt32(const LogicalType& logical_type); @@ -36,10 +38,10 @@ Result> FromInt64(const LogicalType& logical_ Result> GetArrowType( Type::type physical_type, const LogicalType& logical_type, int type_length, - ::arrow::TimeUnit::type int96_arrow_time_unit = ::arrow::TimeUnit::NANO); + const ArrowReaderProperties& reader_properties); Result> GetArrowType( const schema::PrimitiveNode& primitive, - ::arrow::TimeUnit::type int96_arrow_time_unit = ::arrow::TimeUnit::NANO); + const ArrowReaderProperties& reader_properties); } // namespace parquet::arrow diff --git a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h index 4d3acb491e390..7f2e371df66d7 100644 --- a/cpp/src/parquet/properties.h +++ b/cpp/src/parquet/properties.h @@ -870,7 +870,8 @@ class PARQUET_EXPORT ArrowReaderProperties { batch_size_(kArrowDefaultBatchSize), pre_buffer_(true), cache_options_(::arrow::io::CacheOptions::LazyDefaults()), - coerce_int96_timestamp_unit_(::arrow::TimeUnit::NANO) {} + coerce_int96_timestamp_unit_(::arrow::TimeUnit::NANO), + arrow_extensions_enabled_(false) {} /// \brief Set whether to use the IO thread pool to parse columns in parallel. /// @@ -941,6 +942,18 @@ class PARQUET_EXPORT ArrowReaderProperties { return coerce_int96_timestamp_unit_; } + /// Enable Parquet-supported Arrow extension types. 
+ /// + /// When enabled, Parquet logical types will be mapped to their corresponding Arrow + /// extension types at read time, if such exist. Currently only arrow::extension::json() + /// extension type is supported. Columns whose LogicalType is JSON will be interpreted + /// as arrow::extension::json(), with storage type inferred from the serialized Arrow + /// schema if present, or `utf8` by default. + void set_arrow_extensions_enabled(bool extensions_enabled) { + arrow_extensions_enabled_ = extensions_enabled; + } + bool get_arrow_extensions_enabled() const { return arrow_extensions_enabled_; } + private: bool use_threads_; std::unordered_set read_dict_indices_; @@ -949,6 +962,7 @@ class PARQUET_EXPORT ArrowReaderProperties { ::arrow::io::IOContext io_context_; ::arrow::io::CacheOptions cache_options_; ::arrow::TimeUnit::type coerce_int96_timestamp_unit_; + bool arrow_extensions_enabled_; }; /// EXPERIMENTAL: Constructs the default ArrowReaderProperties diff --git a/docs/source/status.rst b/docs/source/status.rst index b685d4bbf8add..98374164d7ae0 100644 --- a/docs/source/status.rst +++ b/docs/source/status.rst @@ -119,7 +119,7 @@ Data Types +-----------------------+-------+-------+-------+------------+-------+-------+-------+-------+ | Variable shape tensor | | | | | | | | | +-----------------------+-------+-------+-------+------------+-------+-------+-------+-------+ -| JSON | | | ✓ | | | | | | +| JSON | ✓ | | ✓ | | | | | | +-----------------------+-------+-------+-------+------------+-------+-------+-------+-------+ | UUID | ✓ | | ✓ | | | | | | +-----------------------+-------+-------+-------+------------+-------+-------+-------+-------+ From d6198c038ef35481d6f0969ab8419efc004b0f7d Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 11 Sep 2024 19:32:04 +0200 Subject: [PATCH 127/186] GH-36412: [Python][CI] Fix deprecation warning about day freq alias with latest pandas (#44067) ### Rationale for this change Updating our pandas usage to follow pandas' changes (they are deprecating the `"d"` alias as alternative for `"D"`) * GitHub Issue: #36412 Authored-by: Joris Van den Bossche Signed-off-by: Joris Van den Bossche --- python/pyarrow/tests/test_compute.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyarrow/tests/test_compute.py b/python/pyarrow/tests/test_compute.py index d4307cd24f8fc..c16d2f9aacf74 100644 --- a/python/pyarrow/tests/test_compute.py +++ b/python/pyarrow/tests/test_compute.py @@ -2417,7 +2417,7 @@ def _check_temporal_rounding(ts, values, unit): "millisecond": "s", "second": "min", "minute": "h", - "hour": "d", + "hour": "D", } ta = pa.array(ts) From 5e04103a38199484658bd569070a41c09d13ae91 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 12 Sep 2024 09:34:46 +0900 Subject: [PATCH 128/186] MINOR: [Java] Bump com.gradle:common-custom-user-data-maven-extension from 2.0 to 2.0.1 in /java (#44024) Bumps [com.gradle:common-custom-user-data-maven-extension](https://github.com/gradle/common-custom-user-data-maven-extension) from 2.0 to 2.0.1.
Release notes (sourced from com.gradle:common-custom-user-data-maven-extension's releases): 2.0.1 adds LICENSE and NOTICE files to the published JAR.
Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: David Li --- java/.mvn/extensions.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/java/.mvn/extensions.xml b/java/.mvn/extensions.xml index 716e2f9e81c35..c90629a91c9ec 100644 --- a/java/.mvn/extensions.xml +++ b/java/.mvn/extensions.xml @@ -28,6 +28,6 @@ com.gradle common-custom-user-data-maven-extension - 2.0 + 2.0.1 From 5b968b3090c744ae26150c93138ba819ed9ebb8e Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Wed, 11 Sep 2024 21:13:55 -0400 Subject: [PATCH 129/186] GH-43748: [R] Handle package_version in safe_r_metadata (#43895) ### Rationale for this change See #43748. There is what appears to be a bug in R's `[[.numeric_version` implementation that leads to infinite recursion. Edit: after some digging in R source, this appears to be as designed. And other list subclasses that have methods to make them behave like atomic types, like `POSIXlt`, also have this. ### What changes are included in this PR? When recursing into list objects, `unclass()` them first to get the raw list behavior. Also apply the checking to the `attributes()` before reapplying them. ### Are these changes tested? yes ### Are there any user-facing changes? Fewer bugs! * GitHub Issue: #43748 --- r/R/metadata.R | 21 ++++++++++++++++++++- r/tests/testthat/test-metadata.R | 29 ++++++++++++++++++++++------- 2 files changed, 42 insertions(+), 8 deletions(-) diff --git a/r/R/metadata.R b/r/R/metadata.R index ba73f0857881d..61e412be62450 100644 --- a/r/R/metadata.R +++ b/r/R/metadata.R @@ -107,15 +107,34 @@ safe_r_metadata <- function(metadata, on_save = FALSE) { # and mutate the `types_removed` variable outside of it. check_r_metadata_types_recursive <- function(x) { allowed_types <- c("character", "double", "integer", "logical", "complex", "list", "NULL") + # Pull out the attributes so we can also check them + x_attrs <- attributes(x) + if (is.list(x)) { + # Add special handling for some base R classes that are list but + # their [[ methods leads to infinite recursion. + # We unclass here and then reapply attributes after. + x <- unclass(x) + types <- map_chr(x, typeof) - x[types == "list"] <- map(x[types == "list"], check_r_metadata_types_recursive) ok <- types %in% allowed_types if (!all(ok)) { # Record the invalid types, then remove the offending elements types_removed <<- c(types_removed, setdiff(types, allowed_types)) x <- x[ok] + if ("names" %in% names(x_attrs)) { + # Also prune from the attributes since we'll re-add later + x_attrs[["names"]] <- x_attrs[["names"]][ok] + } } + # For the rest, recurse + x <- map(x, check_r_metadata_types_recursive) + } + + # attributes() of a named list will return a list with a "names" attribute, + # so it will recurse indefinitely. 
+ if (!is.null(x_attrs) && !identical(x_attrs, list(names = names(x)))) { + attributes(x) <- check_r_metadata_types_recursive(x_attrs) } x } diff --git a/r/tests/testthat/test-metadata.R b/r/tests/testthat/test-metadata.R index 175e7ef3b6b73..06aa1535e0a36 100644 --- a/r/tests/testthat/test-metadata.R +++ b/r/tests/testthat/test-metadata.R @@ -149,6 +149,15 @@ arbitrary\040code\040was\040just\040executed ) }) +test_that("R metadata processing doesn't choke on packageVersion() output", { + metadata <- list(version = packageVersion("base")) + expect_identical(safe_r_metadata(metadata), metadata) + + df <- example_data[1:6] + attr(df, "version") <- packageVersion("base") + expect_equal_data_frame(Table$create(df), df) +}) + test_that("Complex or unsafe attributes are pruned from R metadata, if they exist", { tab <- Table$create(example_data[1:6]) bad <- new.env() @@ -161,18 +170,24 @@ i Type: \"environment\" > If you trust the source, you can set `options(arrow.unsafe_metadata = TRUE)` to preserve them.", fixed = TRUE ) + # Try hiding it even further, in attributes + bad_meta <- list(attributes = structure(list(), hidden_attr = bad)) + tab$metadata <- list(r = rawToChar(serialize(bad_meta, NULL, ascii = TRUE))) + expect_warning( + as.data.frame(tab), + "Potentially unsafe or invalid elements have been discarded from R metadata. +i Type: \"environment\" +> If you trust the source, you can set `options(arrow.unsafe_metadata = TRUE)` to preserve them.", + fixed = TRUE + ) + # You can set an option to allow them through. # It still warns, just differently, and it doesn't prune the attributes withr::local_options(list("arrow.unsafe_metadata" = TRUE)) expect_warning( - expect_warning( - as.data.frame(tab), - "R metadata may have unsafe or invalid elements + as.data.frame(tab), + "R metadata may have unsafe or invalid elements i Type: \"environment\"" - ), - # This particular example ultimately fails because it's not a list - "Invalid metadata$r", - fixed = TRUE ) }) From 046e7cbec4b069ad06db95963ae0342807199f6a Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 12 Sep 2024 08:37:46 +0200 Subject: [PATCH 130/186] GH-44063: [Python] Deprecate the no longer used serialize/deserialize Pyarrow C++ functions (#44064) ### Rationale for this change We want to remove this part of the code (since we no longer use it ourselves, see https://github.com/apache/arrow/issues/43587), and before doing that first deprecating them for two releases. * GitHub Issue: #44063 Authored-by: Joris Van den Bossche Signed-off-by: Joris Van den Bossche --- python/pyarrow/src/arrow/python/deserialize.h | 6 ++++++ python/pyarrow/src/arrow/python/serialize.h | 4 ++++ 2 files changed, 10 insertions(+) diff --git a/python/pyarrow/src/arrow/python/deserialize.h b/python/pyarrow/src/arrow/python/deserialize.h index 41b6a13a38875..fe1d73622a3db 100644 --- a/python/pyarrow/src/arrow/python/deserialize.h +++ b/python/pyarrow/src/arrow/python/deserialize.h @@ -24,6 +24,7 @@ #include "arrow/python/serialize.h" #include "arrow/python/visibility.h" #include "arrow/status.h" +#include "arrow/util/macros.h" namespace arrow { @@ -55,6 +56,7 @@ struct ARROW_PYTHON_EXPORT SparseTensorCounts { /// \param[in] src a RandomAccessFile /// \param[out] out the reconstructed data /// \return Status +ARROW_DEPRECATED("Deprecated in 18.0.0. 
Will be removed in 20.0.0") ARROW_PYTHON_EXPORT Status ReadSerializedObject(io::RandomAccessFile* src, SerializedPyObject* out); @@ -70,6 +72,7 @@ Status ReadSerializedObject(io::RandomAccessFile* src, SerializedPyObject* out); /// num_csf_tensors * (2 * ndim_csf + 3) + num_buffers in length /// \param[out] out the reconstructed object /// \return Status +ARROW_DEPRECATED("Deprecated in 18.0.0. Will be removed in 20.0.0") ARROW_PYTHON_EXPORT Status GetSerializedFromComponents(int num_tensors, const SparseTensorCounts& num_sparse_tensors, @@ -88,6 +91,7 @@ Status GetSerializedFromComponents(int num_tensors, /// \param[out] out The returned object /// \return Status /// This acquires the GIL +ARROW_DEPRECATED("Deprecated in 18.0.0. Will be removed in 20.0.0") ARROW_PYTHON_EXPORT Status DeserializeObject(PyObject* context, const SerializedPyObject& object, PyObject* base, PyObject** out); @@ -96,9 +100,11 @@ Status DeserializeObject(PyObject* context, const SerializedPyObject& object, /// \param[in] object Object to deserialize /// \param[out] out The deserialized tensor /// \return Status +ARROW_DEPRECATED("Deprecated in 18.0.0. Will be removed in 20.0.0") ARROW_PYTHON_EXPORT Status DeserializeNdarray(const SerializedPyObject& object, std::shared_ptr* out); +ARROW_DEPRECATED("Deprecated in 18.0.0. Will be removed in 20.0.0") ARROW_PYTHON_EXPORT Status NdarrayFromBuffer(std::shared_ptr src, std::shared_ptr* out); diff --git a/python/pyarrow/src/arrow/python/serialize.h b/python/pyarrow/src/arrow/python/serialize.h index fd207d3e06903..af6d2d81a61c4 100644 --- a/python/pyarrow/src/arrow/python/serialize.h +++ b/python/pyarrow/src/arrow/python/serialize.h @@ -24,6 +24,7 @@ #include "arrow/python/visibility.h" #include "arrow/sparse_tensor.h" #include "arrow/status.h" +#include "arrow/util/macros.h" // Forward declaring PyObject, see // https://mail.python.org/pipermail/python-dev/2003-August/037601.html @@ -92,6 +93,7 @@ struct ARROW_PYTHON_EXPORT SerializedPyObject { /// \return Status /// /// Release GIL before calling +ARROW_DEPRECATED("Deprecated in 18.0.0. Will be removed in 20.0.0") ARROW_PYTHON_EXPORT Status SerializeObject(PyObject* context, PyObject* sequence, SerializedPyObject* out); @@ -99,6 +101,7 @@ Status SerializeObject(PyObject* context, PyObject* sequence, SerializedPyObject /// \param[in] tensor Tensor to be serialized /// \param[out] out The serialized representation /// \return Status +ARROW_DEPRECATED("Deprecated in 18.0.0. Will be removed in 20.0.0") ARROW_PYTHON_EXPORT Status SerializeTensor(std::shared_ptr tensor, py::SerializedPyObject* out); @@ -108,6 +111,7 @@ Status SerializeTensor(std::shared_ptr tensor, py::SerializedPyObject* o /// \param[in] tensor_num_bytes The length of the Tensor data in bytes /// \param[in] dst The OutputStream to write the Tensor header to /// \return Status +ARROW_DEPRECATED("Deprecated in 18.0.0. Will be removed in 20.0.0") ARROW_PYTHON_EXPORT Status WriteNdarrayHeader(std::shared_ptr dtype, const std::vector& shape, int64_t tensor_num_bytes, From 9986b7b10618e0da2d8533e7e440c70f3fec7337 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Thu, 12 Sep 2024 10:42:22 +0200 Subject: [PATCH 131/186] GH-44072: [C++][Parquet] Add Float16 reading benchmarks (#44073) Local benchmark numbers: ``` --------------------------------------------------------------------------------------------------------------------------- Benchmark Time CPU Iterations UserCounters... 
--------------------------------------------------------------------------------------------------------------------------- BM_ReadColumnPlain/null_probability:-1 20038480 ns 20019703 ns 36 bytes_per_second=1.9512Gi/s items_per_second=523.772M/s BM_ReadColumnPlain/null_probability:0 37114403 ns 36766588 ns 19 bytes_per_second=1.06245Gi/s items_per_second=285.198M/s BM_ReadColumnPlain/null_probability:1 44589582 ns 44371707 ns 16 bytes_per_second=901.475Mi/s items_per_second=236.316M/s BM_ReadColumnPlain/null_probability:50 65624754 ns 65322683 ns 11 bytes_per_second=612.345Mi/s items_per_second=160.522M/s BM_ReadColumnPlain/null_probability:99 43072631 ns 42932582 ns 16 bytes_per_second=931.693Mi/s items_per_second=244.238M/s BM_ReadColumnPlain/null_probability:100 36710045 ns 36475141 ns 19 bytes_per_second=1.07093Gi/s items_per_second=287.477M/s BM_ReadColumnPlain/null_probability:-1 52718868 ns 52616204 ns 12 bytes_per_second=380.111Mi/s items_per_second=199.288M/s BM_ReadColumnPlain/null_probability:0 71273144 ns 71093105 ns 10 bytes_per_second=281.321Mi/s items_per_second=147.493M/s BM_ReadColumnPlain/null_probability:1 80674727 ns 80358048 ns 8 bytes_per_second=248.886Mi/s items_per_second=130.488M/s BM_ReadColumnPlain/null_probability:50 138249159 ns 137922632 ns 5 bytes_per_second=145.009Mi/s items_per_second=76.0264M/s BM_ReadColumnPlain/null_probability:99 86938382 ns 86576176 ns 8 bytes_per_second=231.01Mi/s items_per_second=121.116M/s BM_ReadColumnPlain/null_probability:100 74154244 ns 73984356 ns 9 bytes_per_second=270.327Mi/s items_per_second=141.729M/s ``` * GitHub Issue: #44072 Authored-by: Antoine Pitrou Signed-off-by: Antoine Pitrou --- .../parquet/arrow/reader_writer_benchmark.cc | 86 ++++++++++++++++--- 1 file changed, 75 insertions(+), 11 deletions(-) diff --git a/cpp/src/parquet/arrow/reader_writer_benchmark.cc b/cpp/src/parquet/arrow/reader_writer_benchmark.cc index 95c4a659297d9..b12f234f72bdf 100644 --- a/cpp/src/parquet/arrow/reader_writer_benchmark.cc +++ b/cpp/src/parquet/arrow/reader_writer_benchmark.cc @@ -28,6 +28,7 @@ #include "parquet/file_reader.h" #include "parquet/file_writer.h" #include "parquet/platform.h" +#include "parquet/properties.h" #include "arrow/array.h" #include "arrow/array/builder_primitive.h" @@ -88,6 +89,11 @@ struct benchmark_traits { using arrow_type = ::arrow::BooleanType; }; +template <> +struct benchmark_traits { + using arrow_type = ::arrow::HalfFloatType; +}; + template using ArrowType = typename benchmark_traits::arrow_type; @@ -125,15 +131,15 @@ std::vector RandomVector(int64_t true_percentage, int64_t vector_size, return values; } -template +template > std::shared_ptr<::arrow::Table> TableFromVector( - const std::vector& vec, bool nullable, + const std::vector& vec, bool nullable, int64_t null_percentage = kAlternatingOrNa) { if (!nullable) { ARROW_CHECK_EQ(null_percentage, kAlternatingOrNa); } - std::shared_ptr<::arrow::DataType> type = std::make_shared>(); - NumericBuilder> builder; + std::shared_ptr<::arrow::DataType> type = std::make_shared(); + NumericBuilder builder; if (nullable) { // Note true values select index 1 of sample_values auto valid_bytes = RandomVector(/*true_percentage=*/null_percentage, @@ -258,18 +264,20 @@ struct Examples { }; static void BenchmarkReadTable(::benchmark::State& state, const ::arrow::Table& table, + std::shared_ptr properties, int64_t num_values = -1, int64_t total_bytes = -1) { auto output = CreateOutputStream(); - EXIT_NOT_OK( - WriteTable(table, ::arrow::default_memory_pool(), output, 
table.num_rows())); + EXIT_NOT_OK(WriteTable(table, ::arrow::default_memory_pool(), output, + /*chunk_size=*/table.num_rows(), properties)); PARQUET_ASSIGN_OR_THROW(auto buffer, output->Finish()); - while (state.KeepRunning()) { + for (auto _ : state) { auto reader = ParquetFileReader::Open(std::make_shared<::arrow::io::BufferReader>(buffer)); std::unique_ptr arrow_reader; EXIT_NOT_OK(FileReader::Make(::arrow::default_memory_pool(), std::move(reader), &arrow_reader)); + std::shared_ptr<::arrow::Table> table; EXIT_NOT_OK(arrow_reader->ReadTable(&table)); } @@ -283,8 +291,14 @@ static void BenchmarkReadTable(::benchmark::State& state, const ::arrow::Table& } } +static void BenchmarkReadTable(::benchmark::State& state, const ::arrow::Table& table, + int64_t num_values = -1, int64_t total_bytes = -1) { + BenchmarkReadTable(state, table, default_writer_properties(), num_values, total_bytes); +} + static void BenchmarkReadArray(::benchmark::State& state, const std::shared_ptr& array, bool nullable, + std::shared_ptr properties, int64_t num_values = -1, int64_t total_bytes = -1) { auto schema = ::arrow::schema({field("s", array->type(), nullable)}); auto table = ::arrow::Table::Make(schema, {array}, array->length()); @@ -294,8 +308,15 @@ static void BenchmarkReadArray(::benchmark::State& state, BenchmarkReadTable(state, *table, num_values, total_bytes); } +static void BenchmarkReadArray(::benchmark::State& state, + const std::shared_ptr& array, bool nullable, + int64_t num_values = -1, int64_t total_bytes = -1) { + BenchmarkReadArray(state, array, nullable, default_writer_properties(), num_values, + total_bytes); +} + // -// Benchmark reading a primitive column +// Benchmark reading a dict-encoded primitive column // template @@ -308,7 +329,9 @@ static void BM_ReadColumn(::benchmark::State& state) { std::shared_ptr<::arrow::Table> table = TableFromVector(values, nullable, state.range(0)); - BenchmarkReadTable(state, *table, table->num_rows(), + auto properties = WriterProperties::Builder().disable_dictionary()->build(); + + BenchmarkReadTable(state, *table, properties, table->num_rows(), sizeof(typename ParquetType::c_type) * table->num_rows()); } @@ -316,8 +339,9 @@ static void BM_ReadColumn(::benchmark::State& state) { // null_percentage governs distribution and therefore runs of null values. // first_value_percentage governs distribution of values (we select from 1 of 2) // so when 0 or 100 RLE is triggered all the time. When a value in the range (0, 100) -// there will be some percentage of RLE encoded values and some percentage of literal -// encoded values (RLE is much less likely with percentages close to 50). +// there will be some percentage of RLE-encoded dictionary indices and some +// percentage of literal encoded dictionary indices +// (RLE is much less likely with percentages close to 50). 
BENCHMARK_TEMPLATE2(BM_ReadColumn, false, Int32Type) ->Args({/*null_percentage=*/kAlternatingOrNa, 1}) ->Args({/*null_percentage=*/kAlternatingOrNa, 10}) @@ -325,6 +349,7 @@ BENCHMARK_TEMPLATE2(BM_ReadColumn, false, Int32Type) BENCHMARK_TEMPLATE2(BM_ReadColumn, true, Int32Type) ->Args({/*null_percentage=*/kAlternatingOrNa, /*first_value_percentage=*/0}) + ->Args({/*null_percentage=*/0, /*first_value_percentage=*/1}) ->Args({/*null_percentage=*/1, /*first_value_percentage=*/1}) ->Args({/*null_percentage=*/10, /*first_value_percentage=*/10}) ->Args({/*null_percentage=*/25, /*first_value_percentage=*/5}) @@ -369,6 +394,45 @@ BENCHMARK_TEMPLATE2(BM_ReadColumn, true, BooleanType) ->Args({kAlternatingOrNa, 1}) ->Args({5, 10}); +// +// Benchmark reading a PLAIN-encoded primitive column +// + +template +static void BM_ReadColumnPlain(::benchmark::State& state) { + using c_type = typename ArrowType::c_type; + + const std::vector values(BENCHMARK_SIZE, static_cast(42)); + std::shared_ptr<::arrow::Table> table = + TableFromVector(values, /*nullable=*/nullable, state.range(0)); + + auto properties = WriterProperties::Builder().disable_dictionary()->build(); + BenchmarkReadTable(state, *table, properties, table->num_rows(), + sizeof(c_type) * table->num_rows()); +} + +BENCHMARK_TEMPLATE2(BM_ReadColumnPlain, false, Int32Type) + ->ArgNames({"null_probability"}) + ->Args({kAlternatingOrNa}); +BENCHMARK_TEMPLATE2(BM_ReadColumnPlain, true, Int32Type) + ->ArgNames({"null_probability"}) + ->Args({0}) + ->Args({1}) + ->Args({50}) + ->Args({99}) + ->Args({100}); + +BENCHMARK_TEMPLATE2(BM_ReadColumnPlain, false, Float16LogicalType) + ->ArgNames({"null_probability"}) + ->Args({kAlternatingOrNa}); +BENCHMARK_TEMPLATE2(BM_ReadColumnPlain, true, Float16LogicalType) + ->ArgNames({"null_probability"}) + ->Args({0}) + ->Args({1}) + ->Args({50}) + ->Args({99}) + ->Args({100}); + // // Benchmark reading binary column // From e0ac5d5e7dfaa151171f1f5eb95dc7f085e915c9 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Thu, 12 Sep 2024 16:29:38 +0200 Subject: [PATCH 132/186] GH-44081: [C++][Parquet] Fix reported metrics in parquet-arrow-reader-writer-benchmark (#44082) ### Rationale for this change 1. items/sec and bytes/sec were set to the same value in some benchmarks 2. bytes/sec was incorrectly computed for boolean columns ### What changes are included in this PR? Fix parquet-arrow-reader-writer-benchmark to report correct metrics. #### Example (column writing) Before: ``` -------------------------------------------------------------------------------------------------------------------- Benchmark Time CPU Iterations UserCounters... 
-------------------------------------------------------------------------------------------------------------------- BM_WriteColumn 43138428 ns 43118609 ns 15 bytes_per_second=927.674Mi/s items_per_second=972.736M/s BM_WriteColumn 150528627 ns 150480597 ns 5 bytes_per_second=265.815Mi/s items_per_second=278.727M/s BM_WriteColumn 49243514 ns 49214955 ns 14 bytes_per_second=1.58742Gi/s items_per_second=1.70448G/s BM_WriteColumn 151526550 ns 151472832 ns 5 bytes_per_second=528.148Mi/s items_per_second=553.803M/s BM_WriteColumn 59101372 ns 59068058 ns 12 bytes_per_second=1.32263Gi/s items_per_second=1.42016G/s BM_WriteColumn 159944872 ns 159895095 ns 4 bytes_per_second=500.328Mi/s items_per_second=524.632M/s BM_WriteColumn 32855604 ns 32845322 ns 21 bytes_per_second=304.457Mi/s items_per_second=319.247M/s BM_WriteColumn 150566118 ns 150528329 ns 5 bytes_per_second=66.4327Mi/s items_per_second=69.6597M/s ``` After: ``` Benchmark Time CPU Iterations UserCounters... -------------------------------------------------------------------------------------------------------------------- BM_WriteColumn 43919180 ns 43895926 ns 16 bytes_per_second=911.246Mi/s items_per_second=238.878M/s BM_WriteColumn 153981290 ns 153929841 ns 5 bytes_per_second=259.859Mi/s items_per_second=68.1204M/s BM_WriteColumn 49906105 ns 49860098 ns 14 bytes_per_second=1.56688Gi/s items_per_second=210.304M/s BM_WriteColumn 154273499 ns 154202319 ns 5 bytes_per_second=518.799Mi/s items_per_second=68M/s BM_WriteColumn 59789490 ns 59733498 ns 12 bytes_per_second=1.30789Gi/s items_per_second=175.542M/s BM_WriteColumn 161235860 ns 161169670 ns 4 bytes_per_second=496.371Mi/s items_per_second=65.0604M/s BM_WriteColumn 32962097 ns 32950864 ns 21 bytes_per_second=37.9353Mi/s items_per_second=318.224M/s BM_WriteColumn 154103499 ns 154052873 ns 5 bytes_per_second=8.1141Mi/s items_per_second=68.066M/s ``` #### Example (column reading) Before: ``` --------------------------------------------------------------------------------------------------------------------------- Benchmark Time CPU Iterations UserCounters... --------------------------------------------------------------------------------------------------------------------------- BM_ReadColumn/-1/0 6456731 ns 6453510 ns 108 bytes_per_second=1.51323Gi/s items_per_second=1.62482G/s BM_ReadColumn/1/20 19012505 ns 19006068 ns 36 bytes_per_second=526.148Mi/s items_per_second=551.706M/s BM_ReadColumn/-1/1 58365426 ns 58251529 ns 12 bytes_per_second=171.669Mi/s items_per_second=180.008M/s BM_ReadColumn/5/10 46498966 ns 46442191 ns 15 bytes_per_second=215.321Mi/s items_per_second=225.781M/s BM_ReadIndividualRowGroups 29617575 ns 29600557 ns 24 bytes_per_second=2.63931Gi/s items_per_second=2.83394G/s BM_ReadMultipleRowGroups 47416980 ns 47288951 ns 15 bytes_per_second=1.65208Gi/s items_per_second=1.7739G/s BM_ReadMultipleRowGroupsGenerator 29741012 ns 29722112 ns 24 bytes_per_second=2.62851Gi/s items_per_second=2.82235G/s ``` After: ``` --------------------------------------------------------------------------------------------------------------------------- Benchmark Time CPU Iterations UserCounters... 
--------------------------------------------------------------------------------------------------------------------------- BM_ReadColumn/-1/0 6438249 ns 6435159 ns 109 bytes_per_second=194.245Mi/s items_per_second=1.62945G/s BM_ReadColumn/1/20 19427495 ns 19419378 ns 37 bytes_per_second=64.3687Mi/s items_per_second=539.964M/s BM_ReadColumn/-1/1 58342877 ns 58298236 ns 12 bytes_per_second=21.4415Mi/s items_per_second=179.864M/s BM_ReadColumn/5/10 46591584 ns 46532288 ns 15 bytes_per_second=26.8631Mi/s items_per_second=225.344M/s BM_ReadIndividualRowGroups 30039049 ns 30021676 ns 23 bytes_per_second=2.60229Gi/s items_per_second=349.273M/s BM_ReadMultipleRowGroups 47877663 ns 47650438 ns 15 bytes_per_second=1.63954Gi/s items_per_second=220.056M/s BM_ReadMultipleRowGroupsGenerator 30377987 ns 30360019 ns 23 bytes_per_second=2.57329Gi/s items_per_second=345.381M/s ``` ### Are these changes tested? Manually by running benchmarks. ### Are there any user-facing changes? No, but this breaks historical comparisons in continuous benchmarking. * GitHub Issue: #44081 Authored-by: Antoine Pitrou Signed-off-by: Antoine Pitrou --- .../parquet/arrow/reader_writer_benchmark.cc | 95 +++++++++++-------- 1 file changed, 56 insertions(+), 39 deletions(-) diff --git a/cpp/src/parquet/arrow/reader_writer_benchmark.cc b/cpp/src/parquet/arrow/reader_writer_benchmark.cc index b12f234f72bdf..283b113dfe992 100644 --- a/cpp/src/parquet/arrow/reader_writer_benchmark.cc +++ b/cpp/src/parquet/arrow/reader_writer_benchmark.cc @@ -20,6 +20,7 @@ #include #include #include +#include #include "parquet/arrow/reader.h" #include "parquet/arrow/writer.h" @@ -37,6 +38,7 @@ #include "arrow/testing/gtest_util.h" #include "arrow/testing/random.h" #include "arrow/util/async_generator.h" +#include "arrow/util/bit_util.h" #include "arrow/util/bitmap_ops.h" #include "arrow/util/logging.h" @@ -45,6 +47,7 @@ using arrow::ArrayVector; using arrow::BooleanBuilder; using arrow::FieldVector; using arrow::NumericBuilder; +using arrow::Table; #define EXIT_NOT_OK(s) \ do { \ @@ -104,13 +107,28 @@ std::shared_ptr MakeSchema(Repetition::type repetition) { repetition == Repetition::REPEATED); } -template +template +int64_t BytesForItems(int64_t num_items) { + static_assert(!std::is_same_v, + "BytesForItems unsupported for FLBAType"); + return num_items * sizeof(typename ParquetType::c_type); +} + +template <> +int64_t BytesForItems(int64_t num_items) { + return ::arrow::bit_util::BytesForBits(num_items); +} + +template <> +int64_t BytesForItems(int64_t num_items) { + return num_items * sizeof(uint16_t); +} + +template void SetBytesProcessed(::benchmark::State& state, int64_t num_values = BENCHMARK_SIZE) { const int64_t items_processed = state.iterations() * num_values; - const int64_t bytes_processed = items_processed * sizeof(typename ParquetType::c_type); - - state.SetItemsProcessed(bytes_processed); - state.SetBytesProcessed(bytes_processed); + state.SetItemsProcessed(items_processed); + state.SetBytesProcessed(BytesForItems(items_processed)); } constexpr int64_t kAlternatingOrNa = -1; @@ -132,9 +150,9 @@ std::vector RandomVector(int64_t true_percentage, int64_t vector_size, } template > -std::shared_ptr<::arrow::Table> TableFromVector( - const std::vector& vec, bool nullable, - int64_t null_percentage = kAlternatingOrNa) { +std::shared_ptr
+
+template <typename ParquetType>
 void SetBytesProcessed(::benchmark::State& state, int64_t num_values = BENCHMARK_SIZE) {
   const int64_t items_processed = state.iterations() * num_values;
-  const int64_t bytes_processed = items_processed * sizeof(typename ParquetType::c_type);
-
-  state.SetItemsProcessed(bytes_processed);
-  state.SetBytesProcessed(bytes_processed);
+  state.SetItemsProcessed(items_processed);
+  state.SetBytesProcessed(BytesForItems<ParquetType>(items_processed));
 }

 constexpr int64_t kAlternatingOrNa = -1;
@@ -132,9 +150,9 @@ std::vector<T> RandomVector(int64_t true_percentage, int64_t vector_size,
 }

 template >
-std::shared_ptr<::arrow::Table> TableFromVector(
-    const std::vector<T>& vec, bool nullable,
-    int64_t null_percentage = kAlternatingOrNa) {
+std::shared_ptr<Table> TableFromVector(const std::vector<T>& vec,
+                                       bool nullable,
+                                       int64_t null_percentage = kAlternatingOrNa) {
   if (!nullable) {
     ARROW_CHECK_EQ(null_percentage, kAlternatingOrNa);
   }
@@ -153,13 +171,12 @@ std::shared_ptr<::arrow::Table> TableFromVector(
   auto field = ::arrow::field("column", type, nullable);
   auto schema = ::arrow::schema({field});
-  return ::arrow::Table::Make(schema, {array});
+  return Table::Make(schema, {array});
 }

 template <>
-std::shared_ptr<::arrow::Table> TableFromVector<BooleanType>(const std::vector<bool>& vec,
-                                                             bool nullable,
-                                                             int64_t null_percentage) {
+std::shared_ptr<Table> TableFromVector<BooleanType>(
+    const std::vector<bool>& vec, bool nullable, int64_t null_percentage) {
   BooleanBuilder builder;
   if (nullable) {
     auto valid_bytes = RandomVector(/*true_percentage=*/null_percentage, vec.size(),
@@ -174,21 +191,21 @@ std::shared_ptr<::arrow::Table> TableFromVector(
                         std::vector<std::shared_ptr<::arrow::Field>>({field}));
-  return ::arrow::Table::Make(schema, {array});
+  return Table::Make(schema, {array});
 }

 template <bool nullable, typename ParquetType>
 static void BM_WriteColumn(::benchmark::State& state) {
   using T = typename ParquetType::c_type;
   std::vector<T> values(BENCHMARK_SIZE, static_cast<T>(128));
-  std::shared_ptr<::arrow::Table> table = TableFromVector<ParquetType>(values, nullable);
+  std::shared_ptr<Table> table = TableFromVector<ParquetType>(values, nullable);

   while (state.KeepRunning()) {
     auto output = CreateOutputStream();
     EXIT_NOT_OK(
         WriteTable(*table, ::arrow::default_memory_pool(), output, BENCHMARK_SIZE));
   }
-  SetBytesProcessed<nullable, ParquetType>(state);
+  SetBytesProcessed<ParquetType>(state);
 }

 BENCHMARK_TEMPLATE2(BM_WriteColumn, false, Int32Type);
@@ -205,8 +222,8 @@ BENCHMARK_TEMPLATE2(BM_WriteColumn, true, BooleanType);

 int32_t kInfiniteUniqueValues = -1;

-std::shared_ptr<::arrow::Table> RandomStringTable(int64_t length, int64_t unique_values,
-                                                  int64_t null_percentage) {
+std::shared_ptr<Table> RandomStringTable(int64_t length, int64_t unique_values,
+                                         int64_t null_percentage) {
   std::shared_ptr<::arrow::DataType> type = ::arrow::utf8();
   std::shared_ptr<::arrow::Array> arr;
   ::arrow::random::RandomArrayGenerator generator(/*seed=*/500);
@@ -219,12 +236,12 @@ std::shared_ptr<::arrow::Table> RandomStringTable(int64_t length, int64_t unique
                             /*min_length=*/3, /*max_length=*/32,
                             /*null_probability=*/null_probability);
   }
-  return ::arrow::Table::Make(
+  return Table::Make(
       ::arrow::schema({::arrow::field("column", type, null_percentage > 0)}), {arr});
 }

 static void BM_WriteBinaryColumn(::benchmark::State& state) {
-  std::shared_ptr<::arrow::Table> table =
+  std::shared_ptr<Table> table =
       RandomStringTable(BENCHMARK_SIZE, state.range(1), state.range(0));

   while (state.KeepRunning()) {
@@ -263,7 +280,7 @@ struct Examples {
   static constexpr std::array<bool, 2> values() { return {false, true}; }
 };

-static void BenchmarkReadTable(::benchmark::State& state, const ::arrow::Table& table,
+static void BenchmarkReadTable(::benchmark::State& state, const Table& table,
                                std::shared_ptr<WriterProperties> properties,
                                int64_t num_values = -1, int64_t total_bytes = -1) {
   auto output = CreateOutputStream();
@@ -278,7 +295,7 @@ static void BenchmarkReadTable(::benchmark::State& state, const ::arrow::Table&
     EXIT_NOT_OK(FileReader::Make(::arrow::default_memory_pool(), std::move(reader),
                                  &arrow_reader));
-    std::shared_ptr<::arrow::Table> table;
+    std::shared_ptr<Table> table;
     EXIT_NOT_OK(arrow_reader->ReadTable(&table));
   }
@@ -291,7 +308,7 @@ static void BenchmarkReadTable(::benchmark::State& state, const ::arrow::Table&
   }
 }

-static void BenchmarkReadTable(::benchmark::State& state, const ::arrow::Table& table,
+static void BenchmarkReadTable(::benchmark::State& state, const Table& table,
                                int64_t num_values = -1, int64_t total_bytes = -1) {
   BenchmarkReadTable(state, table, default_writer_properties(), num_values, total_bytes);
 }
@@ -301,7 +318,7 @@ static void BenchmarkReadArray(::benchmark::State& state,
                                std::shared_ptr<WriterProperties> properties,
                                int64_t num_values = -1, int64_t total_bytes = -1) {
   auto schema = ::arrow::schema({field("s", array->type(), nullable)});
-  auto table = ::arrow::Table::Make(schema, {array}, array->length());
+  auto table = Table::Make(schema, {array}, array->length());

   EXIT_NOT_OK(table->Validate());

@@ -326,13 +343,13 @@ static void BM_ReadColumn(::benchmark::State& state) {
   auto values = RandomVector(/*percentage=*/state.range(1), BENCHMARK_SIZE,
                              Examples::values());

-  std::shared_ptr<::arrow::Table> table =
+  std::shared_ptr<Table> table =
       TableFromVector<ParquetType>(values, nullable, state.range(0));
   auto properties = WriterProperties::Builder().disable_dictionary()->build();

   BenchmarkReadTable(state, *table, properties, table->num_rows(),
-                     sizeof(typename ParquetType::c_type) * table->num_rows());
+                     BytesForItems<ParquetType>(table->num_rows()));
 }

 // There are two parameters here that cover different data distributions.
@@ -403,12 +420,12 @@ static void BM_ReadColumnPlain(::benchmark::State& state) {
   using c_type = typename ArrowType::c_type;

   const std::vector<c_type> values(BENCHMARK_SIZE, static_cast<c_type>(42));
-  std::shared_ptr<::arrow::Table> table =
+  std::shared_ptr<Table> table =
       TableFromVector(values, /*nullable=*/nullable, state.range(0));
   auto properties = WriterProperties::Builder().disable_dictionary()->build();

   BenchmarkReadTable(state, *table, properties, table->num_rows(),
-                     sizeof(c_type) * table->num_rows());
+                     BytesForItems<ArrowType>(table->num_rows()));
 }

 BENCHMARK_TEMPLATE2(BM_ReadColumnPlain, false, Int32Type)
@@ -438,7 +455,7 @@ BENCHMARK_TEMPLATE2(BM_ReadColumnPlain, true, Float16LogicalType)
 //
 static void BM_ReadBinaryColumn(::benchmark::State& state) {
-  std::shared_ptr<::arrow::Table> table =
+  std::shared_ptr<Table> table =
       RandomStringTable(BENCHMARK_SIZE, state.range(1), state.range(0));

   // Offsets + data
@@ -636,7 +653,7 @@ BENCHMARK(BM_ReadListOfListColumn)->Apply(NestedReadArguments);

 static void BM_ReadIndividualRowGroups(::benchmark::State& state) {
   std::vector<int64_t> values(BENCHMARK_SIZE, 128);
-  std::shared_ptr<::arrow::Table> table = TableFromVector<Int64Type>(values, true);
+  std::shared_ptr<Table> table = TableFromVector<Int64Type>(values, true);
   auto output = CreateOutputStream();
   // This writes 10 RowGroups
   EXIT_NOT_OK(
@@ -651,27 +668,27 @@ static void BM_ReadIndividualRowGroups(::benchmark::State& state) {
     EXIT_NOT_OK(FileReader::Make(::arrow::default_memory_pool(), std::move(reader),
                                  &arrow_reader));

-    std::vector<std::shared_ptr<::arrow::Table>> tables;
+    std::vector<std::shared_ptr<Table>> tables;
     for (int i = 0; i < arrow_reader->num_row_groups(); i++) {
       // Only read the even numbered RowGroups
       if ((i % 2) == 0) {
-        std::shared_ptr<::arrow::Table> table;
+        std::shared_ptr<Table> table;
         EXIT_NOT_OK(arrow_reader->RowGroup(i)->ReadTable(&table));
         tables.push_back(table);
       }
     }

-    std::shared_ptr<::arrow::Table> final_table;
+    std::shared_ptr<Table> final_table;
     PARQUET_ASSIGN_OR_THROW(final_table, ConcatenateTables(tables));
   }
-  SetBytesProcessed<true, Int64Type>(state);
+  SetBytesProcessed<Int64Type>(state);
 }
 BENCHMARK(BM_ReadIndividualRowGroups);

 static void BM_ReadMultipleRowGroups(::benchmark::State& state) {
   std::vector<int64_t> values(BENCHMARK_SIZE, 128);
-  std::shared_ptr<::arrow::Table> table = TableFromVector<Int64Type>(values, true);
+  std::shared_ptr<Table> table = TableFromVector<Int64Type>(values, true);
   auto output = CreateOutputStream();
   // This writes 10 RowGroups
   EXIT_NOT_OK(
@@ -685,17 +702,17 @@ static void BM_ReadMultipleRowGroups(::benchmark::State& state) {
     std::unique_ptr<FileReader> arrow_reader;
     EXIT_NOT_OK(FileReader::Make(::arrow::default_memory_pool(), std::move(reader),
                                  &arrow_reader));
-    std::shared_ptr<::arrow::Table> table;
+    std::shared_ptr<Table> table;
     EXIT_NOT_OK(arrow_reader->ReadRowGroups(rgs, &table));
   }
-  SetBytesProcessed<true, Int64Type>(state);
+  SetBytesProcessed<Int64Type>(state);
 }
 BENCHMARK(BM_ReadMultipleRowGroups);

 static void BM_ReadMultipleRowGroupsGenerator(::benchmark::State& state) {
   std::vector<int64_t> values(BENCHMARK_SIZE, 128);
-  std::shared_ptr<::arrow::Table> table = TableFromVector<Int64Type>(values, true);
+  std::shared_ptr<Table> table = TableFromVector<Int64Type>(values, true);
   auto output = CreateOutputStream();
   // This writes 10 RowGroups
   EXIT_NOT_OK(
@@ -714,9 +731,9 @@ static void BM_ReadMultipleRowGroupsGenerator(::benchmark::State& state) {
             arrow_reader->GetRecordBatchGenerator(arrow_reader, rgs, {0}));
     auto fut = ::arrow::CollectAsyncGenerator(generator);
     ASSIGN_OR_ABORT(auto batches, fut.result());
-    ASSIGN_OR_ABORT(auto actual, ::arrow::Table::FromRecordBatches(std::move(batches)));
+    ASSIGN_OR_ABORT(auto actual, Table::FromRecordBatches(std::move(batches)));
   }
-  SetBytesProcessed<true, Int64Type>(state);
+  SetBytesProcessed<Int64Type>(state);
 }
 BENCHMARK(BM_ReadMultipleRowGroupsGenerator);

From 6a38205d69ed97b21ad8dcfba11069ebb28c05d9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ra=C3=BAl=20Cumplido?=
Date: Thu, 12 Sep 2024 16:36:05 +0200
Subject: [PATCH 133/186] GH-44076: [CI] Remove verify-rc-binaries-wheel-macos-11 which is now deprecated (#44077)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

### Rationale for this change

Our wheels deployment target is now MACOSX_DEPLOYMENT_TARGET=12.0 and the macOS 11 runner is deprecated.

### What changes are included in this PR?

Remove macos-11 from the CI matrix.

### Are these changes tested?

No; those jobs are triggered on release, and this change just removes a job from the matrix.

### Are there any user-facing changes?

No

* GitHub Issue: #44076

Authored-by: Raúl Cumplido
Signed-off-by: Jacob Wujciak-Jens
---
 dev/tasks/tasks.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dev/tasks/tasks.yml b/dev/tasks/tasks.yml
index c1c15a3ff73fd..9f13245c53f4a 100644
--- a/dev/tasks/tasks.yml
+++ b/dev/tasks/tasks.yml
@@ -996,7 +996,7 @@ tasks:
         github_runner: "macos-14"
   {% endfor %}

-  {% for macos_version in ["11", "12"] %}
+  {% for macos_version in ["12"] %}
   verify-rc-binaries-wheels-macos-{{ macos_version }}-amd64:
     ci: github
     template: verify-rc/github.macos.yml

From 2f99cf85835c05e0d3d23d06592e6d2f6322aede Mon Sep 17 00:00:00 2001
From: Lysandros Nikolaou
Date: Thu, 12 Sep 2024 18:26:46 +0300
Subject: [PATCH 134/186] GH-44046: [Python] Fix threading issues with borrowed refs and pandas (#44047)

### Rationale for this change

Fix threading bugs that could lead to races under the free-threaded build.

### What changes are included in this PR?

- Use `PySequence_ITEM` instead of the `Fast` variant on lists under the free-threaded build.
- Use `std::once_flag` to make sure that `pandas` static data only gets initialized once.

### Are these changes tested?

Yes.

### Are there any user-facing changes?

No.

* GitHub Issue: #44046

Lead-authored-by: Lysandros Nikolaou
Co-authored-by: Antoine Pitrou
Signed-off-by: Antoine Pitrou
---
 python/pyarrow/src/arrow/python/helpers.cc    | 38 ++++++++++++++-----
 python/pyarrow/src/arrow/python/iterators.h   |  4 ++
 .../pyarrow/src/arrow/python/numpy_convert.cc | 12 ++++++
 3 files changed, 45 insertions(+), 9 deletions(-)

diff --git a/python/pyarrow/src/arrow/python/helpers.cc b/python/pyarrow/src/arrow/python/helpers.cc
index 18302e6fe0401..ca89ebe9d8bdd 100644
--- a/python/pyarrow/src/arrow/python/helpers.cc
+++ b/python/pyarrow/src/arrow/python/helpers.cc
@@ -22,6 +22,7 @@
 #include
 #include
+#include <mutex>
 #include
 #include

@@ -292,7 +293,15 @@ bool PyFloat_IsNaN(PyObject* obj) {

 namespace {

+// This needs a conditional, because using std::once_flag could introduce
+// a deadlock when the GIL is enabled. See
+// https://github.com/apache/arrow/commit/f69061935e92e36e25bb891177ca8bc4f463b272 for
+// more info.
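+// (In the GIL build, a plain bool flag is used and simply re-checked after
+// the pandas import, since ImportModule can release the GIL and let another
+// thread finish the initialization first; see below.)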
+#ifdef Py_GIL_DISABLED
+static std::once_flag pandas_static_initialized;
+#else
 static bool pandas_static_initialized = false;
+#endif

 // Once initialized, these variables hold borrowed references to Pandas static data.
 // We should not use OwnedRef here because Python destructors would be
@@ -304,15 +313,7 @@
 static PyObject* pandas_Timestamp = nullptr;
 static PyTypeObject* pandas_NaTType = nullptr;
 static PyObject* pandas_DateOffset = nullptr;

-}  // namespace
-
-void InitPandasStaticData() {
-  // NOTE: This is called with the GIL held.  We needn't (and shouldn't,
-  // to avoid deadlocks) use an additional C++ lock (ARROW-10519).
-  if (pandas_static_initialized) {
-    return;
-  }
-
+void GetPandasStaticSymbols() {
   OwnedRef pandas;

   // Import pandas
@@ -321,11 +322,14 @@
     return;
   }

+#ifndef Py_GIL_DISABLED
   // Since ImportModule can release the GIL, another thread could have
   // already initialized the static data.
   if (pandas_static_initialized) {
     return;
   }
+#endif
+
   OwnedRef ref;

   // set NaT sentinel and its type
@@ -355,9 +359,25 @@
   if (ImportFromModule(pandas.obj(), "DateOffset", &ref).ok()) {
     pandas_DateOffset = ref.obj();
   }
+}
+
+}  // namespace

+#ifdef Py_GIL_DISABLED
+void InitPandasStaticData() {
+  std::call_once(pandas_static_initialized, GetPandasStaticSymbols);
+}
+#else
+void InitPandasStaticData() {
+  // NOTE: This is called with the GIL held. We needn't (and shouldn't,
+  // to avoid deadlocks) use an additional C++ lock (ARROW-10519).
+  if (pandas_static_initialized) {
+    return;
+  }
+  GetPandasStaticSymbols();
   pandas_static_initialized = true;
 }
+#endif

 bool PandasObjectIsNull(PyObject* obj) {
   if (!MayHaveNaN(obj)) {

diff --git a/python/pyarrow/src/arrow/python/iterators.h b/python/pyarrow/src/arrow/python/iterators.h
index 8512276848272..dd467f6ac4077 100644
--- a/python/pyarrow/src/arrow/python/iterators.h
+++ b/python/pyarrow/src/arrow/python/iterators.h
@@ -67,7 +67,11 @@ inline Status VisitSequenceGeneric(PyObject* obj, int64_t offset, VisitorFunc&&
   }

   if (PySequence_Check(obj)) {
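+    // Under free-threading, PySequence_Fast_GET_ITEM would hand back a
+    // borrowed reference into a list that another thread may mutate
+    // concurrently, so lists are excluded from this fast path.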
+#ifdef Py_GIL_DISABLED
+    if (PyTuple_Check(obj)) {
+#else
     if (PyList_Check(obj) || PyTuple_Check(obj)) {
+#endif
       // Use fast item access
       const Py_ssize_t size = PySequence_Fast_GET_SIZE(obj);
       for (Py_ssize_t i = offset; keep_going && i < size; ++i) {

diff --git a/python/pyarrow/src/arrow/python/numpy_convert.cc b/python/pyarrow/src/arrow/python/numpy_convert.cc
index 5fd2cb511ff8a..4113cc67d2fc6 100644
--- a/python/pyarrow/src/arrow/python/numpy_convert.cc
+++ b/python/pyarrow/src/arrow/python/numpy_convert.cc
@@ -488,7 +488,13 @@ Status NdarraysToSparseCSFTensor(MemoryPool* pool, PyObject* data_ao, PyObject*
   std::vector> indices(ndim);

   for (int i = 0; i < ndim - 1; ++i) {
+#ifdef Py_GIL_DISABLED
+    PyObject* item = PySequence_ITEM(indptr_ao, i);
+    RETURN_IF_PYERROR();
+    OwnedRef item_ref(item);
+#else
     PyObject* item = PySequence_Fast_GET_ITEM(indptr_ao, i);
+#endif
     if (!PyArray_Check(item)) {
       return Status::TypeError("Did not pass ndarray object for indptr");
     }
@@ -497,7 +503,13 @@
   }

   for (int i = 0; i < ndim; ++i) {
+#ifdef Py_GIL_DISABLED
+    PyObject* item = PySequence_ITEM(indices_ao, i);
+    RETURN_IF_PYERROR();
+    OwnedRef item_ref(item);
+#else
     PyObject* item = PySequence_Fast_GET_ITEM(indices_ao, i);
+#endif
     if (!PyArray_Check(item)) {
       return Status::TypeError("Did not pass ndarray object for indices");
     }

From 15158bd20deac211ae9ddb7e15517d5b05a99157 Mon Sep 17 00:00:00 2001
From: Antoine Pitrou
Date: Thu, 12 Sep 2024 17:43:46 +0200
Subject: [PATCH 135/186] MINOR: [CI] Bump actions/{download,upload}-artifact version (#44086)

v2 and v3 are deprecated and can fail CI builds; bump to v4. Since v4 artifacts are immutable and names must be unique within a run, jobs that previously shared an artifact name now use matrix-suffixed names (e.g. `test-output-{{ "${{ matrix.r_image }}" }}`).

Authored-by: Antoine Pitrou
Signed-off-by: Antoine Pitrou
---
 dev/tasks/docker-tests/github.linux.yml            |  2 +-
 dev/tasks/java-jars/github.yml                     |  8 ++++----
 dev/tasks/python-wheels/github.linux.yml           |  2 +-
 dev/tasks/python-wheels/github.osx.yml             |  2 +-
 dev/tasks/python-wheels/github.windows.yml         |  2 +-
 dev/tasks/r/github.devdocs.yml                     |  2 +-
 .../r/github.linux.arrow.version.back.compat.yml   |  4 ++--
 dev/tasks/r/github.linux.cran.yml                  |  4 ++--
 dev/tasks/r/github.linux.offline.build.yml         |  6 +++---
 dev/tasks/r/github.linux.versions.yml              |  4 ++--
 dev/tasks/r/github.macos-linux.local.yml           |  4 ++--
 dev/tasks/r/github.macos.cran.yml                  |  2 +-
 dev/tasks/r/github.packages.yml                    | 16 ++++++++--------
 13 files changed, 29 insertions(+), 29 deletions(-)

diff --git a/dev/tasks/docker-tests/github.linux.yml b/dev/tasks/docker-tests/github.linux.yml
index cd2923a50d6df..ee221d6f6d8d6 100644
--- a/dev/tasks/docker-tests/github.linux.yml
+++ b/dev/tasks/docker-tests/github.linux.yml
@@ -63,7 +63,7 @@ jobs:
           done
       - name: Save the R test output
         if: always()
-        uses: actions/upload-artifact@v2
+        uses: actions/upload-artifact@v4
         with:
           name: test-output
           path: arrow/r/check/arrow.Rcheck/tests/testthat.Rout*

diff --git a/dev/tasks/java-jars/github.yml b/dev/tasks/java-jars/github.yml
index bdbed1bd678e6..9910daa21ef37 100644
--- a/dev/tasks/java-jars/github.yml
+++ b/dev/tasks/java-jars/github.yml
@@ -59,7 +59,7 @@ jobs:
       - name: Compress into single artifact to keep directory structure
        run: tar -cvzf arrow-shared-libs-linux-{{ arch }}.tar.gz arrow/java-dist/
       - name: Upload artifacts
-        uses: actions/upload-artifact@v2
+        uses: actions/upload-artifact@v4
         with:
           name: ubuntu-shared-lib-{{ arch }}
           path: arrow-shared-libs-linux-{{ arch }}.tar.gz
@@ -152,7 +152,7 @@ jobs:
       - name: Compress into single artifact to keep directory structure
         run: tar -cvzf arrow-shared-libs-macos-{{ arch }}.tar.gz arrow/java-dist/
       - name: Upload artifacts
-        uses: actions/upload-artifact@v2
+        uses: actions/upload-artifact@v4
         with:
           name: macos-shared-lib-{{ arch }}
           path: arrow-shared-libs-macos-{{ arch }}.tar.gz
@@ -186,7 +186,7 @@ jobs:
         shell: bash
         run: tar -cvzf arrow-shared-libs-windows.tar.gz arrow/java-dist/
       - name: Upload artifacts
-        uses: actions/upload-artifact@v2
+        uses: actions/upload-artifact@v4
         with:
           name: windows-shared-lib
           path: arrow-shared-libs-windows.tar.gz
@@ -201,7 +201,7 @@ jobs:
     steps:
       {{ macros.github_checkout_arrow(fetch_depth=0)|indent }}
       - name: Download Libraries
-        uses: actions/download-artifact@v3
+        uses: actions/download-artifact@v4
         with:
           path: artifacts
       - name: Decompress artifacts

diff --git a/dev/tasks/python-wheels/github.linux.yml b/dev/tasks/python-wheels/github.linux.yml
index d9dbef82a948e..faca698b71a4d 100644
--- a/dev/tasks/python-wheels/github.linux.yml
+++ b/dev/tasks/python-wheels/github.linux.yml
@@ -50,7 +50,7 @@ jobs:
         shell: bash
         run: archery docker run -e SETUPTOOLS_SCM_PRETEND_VERSION={{ arrow.no_rc_version }} python-wheel-manylinux-{{ manylinux_version }}

-      - uses: actions/upload-artifact@v3
+      - uses: actions/upload-artifact@v4
         with:
           name: wheel
           path: arrow/python/repaired_wheels/*.whl

diff --git a/dev/tasks/python-wheels/github.osx.yml b/dev/tasks/python-wheels/github.osx.yml
index 98e06a14ff222..5d85e7905726e 100644
--- a/dev/tasks/python-wheels/github.osx.yml
+++ b/dev/tasks/python-wheels/github.osx.yml
@@ -108,7 +108,7 @@ jobs:
           pip install --upgrade pip wheel
           PYTHON=python arrow/ci/scripts/python_wheel_macos_build.sh {{ arch }} $(pwd)/arrow $(pwd)/build

-      - uses: actions/upload-artifact@v3
+      - uses: actions/upload-artifact@v4
         with:
           name: wheel
           path: arrow/python/repaired_wheels/*.whl

diff --git a/dev/tasks/python-wheels/github.windows.yml b/dev/tasks/python-wheels/github.windows.yml
index 3a943b6ae515c..2bcda4966db8b 100644
--- a/dev/tasks/python-wheels/github.windows.yml
+++ b/dev/tasks/python-wheels/github.windows.yml
@@ -58,7 +58,7 @@ jobs:
           )
           archery docker run --no-build -e SETUPTOOLS_SCM_PRETEND_VERSION={{ arrow.no_rc_version }} python-wheel-windows-vs2019

-      - uses: actions/upload-artifact@v3
+      - uses: actions/upload-artifact@v4
         with:
           name: wheel
           path: arrow/python/dist/*.whl

diff --git a/dev/tasks/r/github.devdocs.yml b/dev/tasks/r/github.devdocs.yml
index 530fb5e2f2ea9..6047951155cde 100644
--- a/dev/tasks/r/github.devdocs.yml
+++ b/dev/tasks/r/github.devdocs.yml
@@ -68,7 +68,7 @@ jobs:
           EOF
         shell: bash -l {0}
       - name: Save the install script
-        uses: actions/upload-artifact@v2
+        uses: actions/upload-artifact@v4
         with:
           name: {{ "devdocs-script_os-${{ matrix.os }}_sysinstall-${{ matrix.system-install }}" }}
           path: arrow/r/vignettes/developers/script.sh

diff --git a/dev/tasks/r/github.linux.arrow.version.back.compat.yml b/dev/tasks/r/github.linux.arrow.version.back.compat.yml
index 086705dbb9cf4..90b2554eb8cd7 100644
--- a/dev/tasks/r/github.linux.arrow.version.back.compat.yml
+++ b/dev/tasks/r/github.linux.arrow.version.back.compat.yml
@@ -58,7 +58,7 @@ jobs:
         shell: bash
       - name: Upload the parquet artifacts
-        uses: actions/upload-artifact@v2
+        uses: actions/upload-artifact@v4
         with:
           name: files
           path: arrow/r/extra-tests/files
@@ -108,7 +108,7 @@ jobs:
           cp arrow/r/extra-tests/helper*.R extra-tests/
           cp arrow/r/extra-tests/test-*.R extra-tests/
       - name: Download artifacts
-        uses: actions/download-artifact@v2
+        uses: actions/download-artifact@v4
         with:
           name: files
           path: extra-tests/files

diff --git a/dev/tasks/r/github.linux.cran.yml b/dev/tasks/r/github.linux.cran.yml
index 34cb4b9446a0b..8f56bf771d224 100644
--- a/dev/tasks/r/github.linux.cran.yml
+++ b/dev/tasks/r/github.linux.cran.yml
@@ -55,7 +55,7 @@ jobs:
         if: always()
       - name: Save the test output
         if: always()
-        uses: actions/upload-artifact@v2
+        uses: actions/upload-artifact@v4
         with:
-          name: test-output
+          name: test-output-{{ "${{ matrix.r_image }}" }}
           path: arrow/r/check/arrow.Rcheck/tests/testthat.Rout*

diff --git a/dev/tasks/r/github.linux.offline.build.yml b/dev/tasks/r/github.linux.offline.build.yml
index 9ac0ebc40835e..62cdaa02051dd 100644
--- a/dev/tasks/r/github.linux.offline.build.yml
+++ b/dev/tasks/r/github.linux.offline.build.yml
@@ -41,7 +41,7 @@ jobs:
           R -e "source('R/install-arrow.R'); create_package_with_all_dependencies(dest_file = 'arrow_with_deps.tar.gz', source_file = \"${built_tar}\")"
         shell: bash
       - name: Upload the third party dependency artifacts
-        uses: actions/upload-artifact@v2
+        uses: actions/upload-artifact@v4
         with:
           name: thirdparty_deps
           path: arrow/r/arrow_with_deps.tar.gz
@@ -60,7 +60,7 @@ jobs:
       - uses: r-lib/actions/setup-r@v2
       - name: Download artifacts
-        uses: actions/download-artifact@v2
+        uses: actions/download-artifact@v4
         with:
           name: thirdparty_deps
           path: arrow/r/
@@ -91,7 +91,7 @@ jobs:
         run: cat arrow-tests/testthat.Rout*
         if: always()
       - name: Save the test output
-        uses: actions/upload-artifact@v2
+        uses: actions/upload-artifact@v4
         with:
           name: test-output
           path: arrow-tests/testthat.Rout*

diff --git a/dev/tasks/r/github.linux.versions.yml b/dev/tasks/r/github.linux.versions.yml
index 753efe61d048e..092ac97de8ec4 100644
--- a/dev/tasks/r/github.linux.versions.yml
+++ b/dev/tasks/r/github.linux.versions.yml
@@ -55,7 +55,7 @@ jobs:
         if: always()
       - name: Save the test output
         if: always()
-        uses: actions/upload-artifact@v2
+        uses: actions/upload-artifact@v4
         with:
-          name: test-output
+          name: test-output-{{ "${{ matrix.r_version }}" }}
           path: arrow/r/check/arrow.Rcheck/tests/testthat.Rout*

diff --git a/dev/tasks/r/github.macos-linux.local.yml b/dev/tasks/r/github.macos-linux.local.yml
index b221e8c5d8d5b..2db80f254fec5 100644
--- a/dev/tasks/r/github.macos-linux.local.yml
+++ b/dev/tasks/r/github.macos-linux.local.yml
@@ -97,8 +97,8 @@ jobs:
         run: cat arrow-tests/testthat.Rout*
         if: failure()
       - name: Save the test output
-        uses: actions/upload-artifact@v2
+        uses: actions/upload-artifact@v4
         with:
-          name: test-output
+          name: test-output-{{ "${{ matrix.os }}" }}
           path: arrow-tests/testthat.Rout*
         if: always()

diff --git a/dev/tasks/r/github.macos.cran.yml b/dev/tasks/r/github.macos.cran.yml
index 33965988e213a..dda8ac7fd7850 100644
--- a/dev/tasks/r/github.macos.cran.yml
+++ b/dev/tasks/r/github.macos.cran.yml
@@ -75,7 +75,7 @@ jobs:
         run: cat arrow-tests/testthat.Rout*
         if: failure()
       - name: Save the test output
-        uses: actions/upload-artifact@v2
+        uses: actions/upload-artifact@v4
         with:
           name: test-output
           path: arrow-tests/testthat.Rout*

diff --git a/dev/tasks/r/github.packages.yml b/dev/tasks/r/github.packages.yml
index db6955b92d1e0..66008275148f9 100644
--- a/dev/tasks/r/github.packages.yml
+++ b/dev/tasks/r/github.packages.yml
@@ -51,7 +51,7 @@ jobs:
           R CMD build --no-build-vignettes .
       - name: Upload package artifact
-        uses: actions/upload-artifact@v3
+        uses: actions/upload-artifact@v4
         with:
           name: r-pkg__src__contrib
           path: arrow/r/arrow_*.tar.gz
@@ -106,7 +106,7 @@ jobs:
           cd arrow/r/libarrow/dist
           shasum -a 512 arrow-*.zip > arrow-{{ '${{ needs.source.outputs.pkg_version }}' }}.zip.sha512
       - name: Upload binary artifact
-        uses: actions/upload-artifact@v3
+        uses: actions/upload-artifact@v4
         with:
           name: r-lib__libarrow__bin__darwin-{{ '${{ matrix.platform.arch }}' }}-openssl-{{ '${{ matrix.openssl }}' }}
           path: arrow/r/libarrow/dist/arrow-*.zip*
@@ -161,7 +161,7 @@ jobs:
           cd arrow/r/libarrow/dist
           shasum -a 512 arrow-*.zip > arrow-{{ '${{ needs.source.outputs.pkg_version }}' }}.zip.sha512
       - name: Upload binary artifact
-        uses: actions/upload-artifact@v3
+        uses: actions/upload-artifact@v4
         with:
           name: r-lib__libarrow__bin__linux-openssl-{{ '${{ matrix.openssl }}' }}
           path: arrow/r/libarrow/dist/arrow-*.zip*
@@ -194,7 +194,7 @@ jobs:
           cd build
           sha512sum arrow-*.zip > arrow-{{ '${{ needs.source.outputs.pkg_version }}' }}.zip.sha512
       - name: Upload binary artifact
-        uses: actions/upload-artifact@v3
+        uses: actions/upload-artifact@v4
         with:
           name: r-lib__libarrow__bin__windows
           path: build/arrow-*.zip*
@@ -291,7 +291,7 @@ jobs:
           cat(cmd, file = Sys.getenv("GITHUB_OUTPUT"), append = TRUE)
       - name: Upload binary artifact
-        uses: actions/upload-artifact@v3
+        uses: actions/upload-artifact@v4
         with:
           name: r-pkg{{ '${{ steps.build.outputs.path }}' }}
           path: arrow_*
@@ -347,7 +347,7 @@ jobs:
           '
       - name: Upload binary artifact
         if: matrix.config.devtoolset
-        uses: actions/upload-artifact@v3
+        uses: actions/upload-artifact@v4
         with:
           name: r-pkg_centos7
           path: arrow_*
@@ -359,7 +359,7 @@ jobs:
     runs-on: ubuntu-latest
     container: "rstudio/r-base:4.2-centos7"
     steps:
-      - uses: actions/download-artifact@v3
+      - uses: actions/download-artifact@v4
         with:
           name: r-pkg_centos7
       - name: Install DTS Package
@@ -441,7 +441,7 @@ jobs:
     steps:
       {{ macros.github_checkout_arrow()|indent }}
       - name: Download Artifacts
-        uses: actions/download-artifact@v3
+        uses: actions/download-artifact@v4
         with:
           path: artifacts
       - name: Install R

From 0c2891da304e6740abf3b5d3d3c2b88b10b5aba6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ra=C3=BAl=20Cumplido?=
Date: Thu, 12 Sep 2024 18:09:34 +0200
Subject: [PATCH 136/186] GH-43840: [CI] Add cuda group to tasks.yml and minor updates for new cuda runner image (#43841)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

### Rationale for this change

Trigger cuda jobs as a group on crossbow.

### What changes are included in this PR?

Group the cuda tasks under a `cuda` group in tasks.yml. We have also updated the CUDA runner image in use (see https://github.com/voltrondata-labs/cuda-action-runner-builder/blob/bc1797368e02d98e4dc04de8afe41807e2171f3c/.github/workflows/cuda-dind-runners.yaml): the runner now uses Ubuntu 22.04, the base Python is updated to 3.10, and CUDA is updated from 11.4.1 to 11.8.0.

### Are these changes tested?

Via archery.

### Are there any user-facing changes?

No

* GitHub Issue: #43840

Authored-by: Raúl Cumplido
Signed-off-by: Antoine Pitrou
---
 dev/tasks/docker-tests/github.cuda.yml | 8 ++++----
 dev/tasks/tasks.yml                    | 3 +++
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/dev/tasks/docker-tests/github.cuda.yml b/dev/tasks/docker-tests/github.cuda.yml
index d03b3657afc53..e65ac457b2ef7 100644
--- a/dev/tasks/docker-tests/github.cuda.yml
+++ b/dev/tasks/docker-tests/github.cuda.yml
@@ -26,13 +26,13 @@ jobs:
     runs-on: ['self-hosted', 'cuda']
   {{ macros.github_set_env(env) }}
     timeout-minutes: {{ timeout|default(60) }}
-    env:
-      ARCHERY_USE_LEGACY_DOCKER_COMPOSE: 1
     steps:
       {{ macros.github_checkout_arrow(fetch_depth=fetch_depth|default(1))|indent }}
-      # python 3.8 is installed on the runner, no need to install
+      # python 3.10 is installed on the runner, no need to install
+      - name: Install pip
+        run: sudo apt update && sudo apt install python3-pip -y
       - name: Install archery
-        run: python -m pip install -e arrow/dev/archery[docker]
+        run: python3 -m pip install -e arrow/dev/archery[docker]
       - name: Execute Docker Build
         shell: bash
         env:

diff --git a/dev/tasks/tasks.yml b/dev/tasks/tasks.yml
index 9f13245c53f4a..9bb7eedd7b3ee 100644
--- a/dev/tasks/tasks.yml
+++ b/dev/tasks/tasks.yml
@@ -70,6 +70,9 @@ groups:

   {############################# Testing tasks #################################}

+  cuda:
+    - test-cuda-*
+
   test:
     - test-*

From 85fc3ebe93ee731b55ff22021f4d55a7768aeb6f Mon Sep 17 00:00:00 2001
From: Felipe Oliveira Carvalho
Date: Thu, 12 Sep 2024 13:39:42 -0300
Subject: [PATCH 137/186] GH-42247: [C++] Support casting to and from utf8_view/binary_view (#43302)

### Rationale for this change

We need casts between string (binary) and string-view (binary-view) types since they are semantically equivalent.

### What changes are included in this PR?

- Add `is_binary_view_like()` type predicate
- Add `BinaryViewTypes()` list including `STRING_VIEW/BINARY_VIEW`
- New cast kernels

### Are these changes tested?

Yes, though test coverage could still be improved.

### Are there any user-facing changes?

More casts are available.
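For illustration, a round trip through the new casts could look like the following sketch (illustrative only, written against the public `arrow::compute::Cast` API rather than taken from this PR):

```cpp
#include "arrow/api.h"
#include "arrow/compute/cast.h"

// Sketch: cast a utf8 array to utf8_view and back, then verify equality.
arrow::Status RoundTripThroughView(const std::shared_ptr<arrow::Array>& strings) {
  // string -> string_view
  ARROW_ASSIGN_OR_RAISE(auto as_view,
                        arrow::compute::Cast(*strings, arrow::utf8_view()));
  // string_view -> string
  ARROW_ASSIGN_OR_RAISE(auto back, arrow::compute::Cast(*as_view, arrow::utf8()));
  return back->Equals(*strings)
             ? arrow::Status::OK()
             : arrow::Status::Invalid("Round trip changed the data");
}
```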
* GitHub Issue: #42247

Lead-authored-by: Felipe Oliveira Carvalho
Co-authored-by: mwish
Signed-off-by: Antoine Pitrou
---
 .../arrow/compute/kernels/codegen_internal.h  |  19 +-
 .../compute/kernels/scalar_cast_boolean.cc    |   6 +
 .../compute/kernels/scalar_cast_internal.cc   |   7 +-
 .../compute/kernels/scalar_cast_numeric.cc    |  24 +-
 .../compute/kernels/scalar_cast_string.cc     | 289 +++++++++++++++++-
 .../arrow/compute/kernels/scalar_cast_test.cc | 146 ++++++---
 cpp/src/arrow/type.cc                         |  12 +-
 cpp/src/arrow/type.h                          |   3 +
 cpp/src/arrow/type_test.cc                    |   2 +
 cpp/src/arrow/type_traits.h                   |  25 ++
 cpp/src/arrow/util/binary_view_util.h         |  13 +
 cpp/src/arrow/visit_data_inline.h             |   3 +-
 12 files changed, 473 insertions(+), 76 deletions(-)

diff --git a/cpp/src/arrow/compute/kernels/codegen_internal.h b/cpp/src/arrow/compute/kernels/codegen_internal.h
index 9e46a21887f8c..7f9be92f3a14b 100644
--- a/cpp/src/arrow/compute/kernels/codegen_internal.h
+++ b/cpp/src/arrow/compute/kernels/codegen_internal.h
@@ -133,7 +133,8 @@ struct GetViewType> {

 template <typename Type>
 struct GetViewType<Type, enable_if_t<is_base_binary_type<Type>::value ||
-                                     is_fixed_size_binary_type<Type>::value>> {
+                                     is_fixed_size_binary_type<Type>::value ||
+                                     is_binary_view_like_type<Type>::value>> {
   using T = std::string_view;
   using PhysicalType = T;

@@ -1265,6 +1266,22 @@ ArrayKernelExec GenerateVarBinary(detail::GetTypeId get_id) {
   }
 }

+// Generate a kernel given a templated functor for binary-view types. Generates a
+// single kernel for binary/string-view.
+//
+// See "Numeric" above for description of the generator functor
+template