diff --git a/ci/scripts/PKGBUILD b/ci/scripts/PKGBUILD index 95029d98f7a01..2cdd1d42634bf 100644 --- a/ci/scripts/PKGBUILD +++ b/ci/scripts/PKGBUILD @@ -18,7 +18,7 @@ _realname=arrow pkgbase=mingw-w64-${_realname} pkgname="${MINGW_PACKAGE_PREFIX}-${_realname}" -pkgver=14.0.0.9000 +pkgver=14.0.1.9000 pkgrel=8000 pkgdesc="Apache Arrow is a cross-language development platform for in-memory data (mingw-w64)" arch=("any") diff --git a/cpp/src/parquet/statistics.cc b/cpp/src/parquet/statistics.cc index 37b245e0dd6c2..e54b94f1a861a 100644 --- a/cpp/src/parquet/statistics.cc +++ b/cpp/src/parquet/statistics.cc @@ -438,9 +438,9 @@ class TypedComparatorImpl return Helper::Compare(type_length_, a, b); } - bool Compare(const T& a, const T& b) override { return CompareInline(a, b); } + bool Compare(const T& a, const T& b) const override { return CompareInline(a, b); } - std::pair<T, T> GetMinMax(const T* values, int64_t length) override { + std::pair<T, T> GetMinMax(const T* values, int64_t length) const override { DCHECK_GT(length, 0); T min = Helper::DefaultMin(); @@ -457,7 +457,7 @@ class TypedComparatorImpl std::pair<T, T> GetMinMaxSpaced(const T* values, int64_t length, const uint8_t* valid_bits, - int64_t valid_bits_offset) override { + int64_t valid_bits_offset) const override { DCHECK_GT(length, 0); T min = Helper::DefaultMin(); @@ -477,7 +477,7 @@ class TypedComparatorImpl return {min, max}; } - std::pair<T, T> GetMinMax(const ::arrow::Array& values) override { + std::pair<T, T> GetMinMax(const ::arrow::Array& values) const override { ParquetException::NYI(values.type()->ToString()); } @@ -491,7 +491,7 @@ class TypedComparatorImpl template <> std::pair<int32_t, int32_t> TypedComparatorImpl::GetMinMax(const int32_t* values, - int64_t length) { + int64_t length) const { DCHECK_GT(length, 0); const uint32_t* unsigned_values = reinterpret_cast<const uint32_t*>(values); @@ -537,13 +537,13 @@ std::pair<ByteArray, ByteArray> GetMinMaxBinaryHelper( template <> std::pair<ByteArray, ByteArray> TypedComparatorImpl::GetMinMax( - const ::arrow::Array& values) { + const ::arrow::Array& values) const { return GetMinMaxBinaryHelper(*this, values); } template <> std::pair<ByteArray, ByteArray> TypedComparatorImpl::GetMinMax( - const ::arrow::Array& values) { + const ::arrow::Array& values) const { return GetMinMaxBinaryHelper(*this, values); } diff --git a/cpp/src/parquet/statistics.h b/cpp/src/parquet/statistics.h index ae6c1ca29b2f6..6730e6bcdc1e0 100644 --- a/cpp/src/parquet/statistics.h +++ b/cpp/src/parquet/statistics.h @@ -73,16 +73,16 @@ class TypedComparator : public Comparator { /// \brief Scalar comparison of two elements, return true if first /// is strictly less than the second - virtual bool Compare(const T& a, const T& b) = 0; + virtual bool Compare(const T& a, const T& b) const = 0; /// \brief Compute maximum and minimum elements in a batch of /// elements without any nulls - virtual std::pair<T, T> GetMinMax(const T* values, int64_t length) = 0; + virtual std::pair<T, T> GetMinMax(const T* values, int64_t length) const = 0; /// \brief Compute minimum and maximum elements from an Arrow array.
Only /// valid for certain Parquet Type / Arrow Type combinations, like BYTE_ARRAY /// / arrow::BinaryArray - virtual std::pair<T, T> GetMinMax(const ::arrow::Array& values) = 0; + virtual std::pair<T, T> GetMinMax(const ::arrow::Array& values) const = 0; /// \brief Compute maximum and minimum elements in a batch of /// elements with accompanying bitmap indicating which elements are @@ -96,7 +96,7 @@ class TypedComparator : public Comparator { /// the first element in the sequence virtual std::pair<T, T> GetMinMaxSpaced(const T* values, int64_t length, const uint8_t* valid_bits, - int64_t valid_bits_offset) = 0; + int64_t valid_bits_offset) const = 0; }; /// \brief Typed version of Comparator::Make diff --git a/dev/tasks/linux-packages/apache-arrow-apt-source/debian/changelog b/dev/tasks/linux-packages/apache-arrow-apt-source/debian/changelog index 221fb0caa8952..83a388c93051d 100644 --- a/dev/tasks/linux-packages/apache-arrow-apt-source/debian/changelog +++ b/dev/tasks/linux-packages/apache-arrow-apt-source/debian/changelog @@ -1,3 +1,9 @@ +apache-arrow-apt-source (14.0.1-1) unstable; urgency=low + + * New upstream release. + + -- Raúl Cumplido Mon, 06 Nov 2023 22:23:27 -0000 + apache-arrow-apt-source (14.0.0-1) unstable; urgency=low * New upstream release. diff --git a/dev/tasks/linux-packages/apache-arrow-release/yum/apache-arrow-release.spec.in b/dev/tasks/linux-packages/apache-arrow-release/yum/apache-arrow-release.spec.in index 273bf32a2a8e4..245e8afeaeb1d 100644 --- a/dev/tasks/linux-packages/apache-arrow-release/yum/apache-arrow-release.spec.in +++ b/dev/tasks/linux-packages/apache-arrow-release/yum/apache-arrow-release.spec.in @@ -102,6 +102,9 @@ else fi %changelog +* Mon Nov 06 2023 Raúl Cumplido - 14.0.1-1 +- New upstream release. + * Thu Oct 19 2023 Raúl Cumplido - 14.0.0-1 - New upstream release. diff --git a/dev/tasks/linux-packages/apache-arrow/debian/changelog b/dev/tasks/linux-packages/apache-arrow/debian/changelog index 5e01d962c44d4..1f3f1bd5abd07 100644 --- a/dev/tasks/linux-packages/apache-arrow/debian/changelog +++ b/dev/tasks/linux-packages/apache-arrow/debian/changelog @@ -1,3 +1,9 @@ +apache-arrow (14.0.1-1) unstable; urgency=low + + * New upstream release. + + -- Raúl Cumplido Mon, 06 Nov 2023 22:23:27 -0000 + apache-arrow (14.0.0-1) unstable; urgency=low * New upstream release. diff --git a/dev/tasks/linux-packages/apache-arrow/yum/arrow.spec.in b/dev/tasks/linux-packages/apache-arrow/yum/arrow.spec.in index f61d47db2edd7..87e05558e8cda 100644 --- a/dev/tasks/linux-packages/apache-arrow/yum/arrow.spec.in +++ b/dev/tasks/linux-packages/apache-arrow/yum/arrow.spec.in @@ -864,6 +864,9 @@ Documentation for Apache Parquet GLib. %{_datadir}/gtk-doc/html/parquet-glib/ %changelog +* Mon Nov 06 2023 Raúl Cumplido - 14.0.1-1 +- New upstream release. + * Thu Oct 19 2023 Raúl Cumplido - 14.0.0-1 - New upstream release.
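Note on the Parquet statistics change above: because ``Compare()``, ``GetMinMax()`` and ``GetMinMaxSpaced()`` are now const-qualified, a ``parquet::TypedComparator`` can be used through a const reference or pointer. Below is a minimal sketch of what that enables; the helper function name and its usage are illustrative assumptions, not part of this patch.

.. code-block:: cpp

   #include <cstdint>
   #include <utility>

   #include "parquet/statistics.h"  // parquet::TypedComparator
   #include "parquet/types.h"       // parquet::Int32Type

   // Hypothetical helper: with the const-qualified methods, a comparator can be
   // accepted by const reference from code that only reads statistics.
   std::pair<int32_t, int32_t> MinMaxOf(
       const parquet::TypedComparator<parquet::Int32Type>& comparator,
       const int32_t* values, int64_t length) {
     // GetMinMax() is const after this change, so this call compiles against a
     // const reference; previously a non-const comparator was required even
     // though nothing is mutated.
     return comparator.GetMinMax(values, length);
   }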
diff --git a/docs/source/_static/versions.json b/docs/source/_static/versions.json index 10e179420b803..56411de862096 100644 --- a/docs/source/_static/versions.json +++ b/docs/source/_static/versions.json @@ -10,6 +10,11 @@ "url": "https://arrow.apache.org/docs/", "preferred": true }, + { + "name": "14.0", + "version": "14.0/", + "url": "https://arrow.apache.org/docs/14.0/" + }, { "name": "13.0", "version": "13.0/", diff --git a/docs/source/developers/release.rst b/docs/source/developers/release.rst index 6924c2d714e8b..5b7726f58d2d0 100644 --- a/docs/source/developers/release.rst +++ b/docs/source/developers/release.rst @@ -567,6 +567,9 @@ Be sure to go through on the following checklist: .. code-block:: Bash + # You can run the script with BUMP_TAG=0 and BUMP_PUSH=0 + # to avoid the default behavior of pushing to main and pushing the tag, + # but then you will need to push manually after reviewing the commits. # dev/release/post-11-bump-versions.sh 10.0.0 11.0.0 dev/release/post-11-bump-versions.sh X.Y.Z NEXT_X.NEXT_Y.NEXT_Z diff --git a/docs/source/format/CDataInterface.rst b/docs/source/format/CDataInterface.rst index e2022171214b7..36952ad99e777 100644 --- a/docs/source/format/CDataInterface.rst +++ b/docs/source/format/CDataInterface.rst @@ -207,9 +207,9 @@ names and types of child fields are read from the child arrays. +------------------------+---------------------------------------------------+------------+ | ``+L`` | large list | | +------------------------+---------------------------------------------------+------------+ -| ``+lv`` | list-view | | +| ``+vl`` | list-view | | +------------------------+---------------------------------------------------+------------+ -| ``+Lv`` | large list-view | | +| ``+vL`` | large list-view | | +------------------------+---------------------------------------------------+------------+ | ``+w:123`` | fixed-sized list [123 items] | | +------------------------+---------------------------------------------------+------------+ diff --git a/docs/source/format/Columnar.rst b/docs/source/format/Columnar.rst index 3f8cd946292ea..9bdee37d18048 100644 --- a/docs/source/format/Columnar.rst +++ b/docs/source/format/Columnar.rst @@ -715,7 +715,7 @@ A struct array has its own validity bitmap that is independent of its child arrays' validity bitmaps. The validity bitmap for the struct array might indicate a null when one or more of its child arrays has a non-null value in its corresponding slot; or conversely, a child -array might have a null in its validity bitmap while the struct array's +array might indicate a null in its validity bitmap while the struct array's validity bitmap shows a non-null value. Therefore, to know whether a particular child entry is valid, one must diff --git a/go/arrow/array.go b/go/arrow/array.go index e07fa478aae57..eed859cf46649 100644 --- a/go/arrow/array.go +++ b/go/arrow/array.go @@ -81,6 +81,8 @@ type ArrayData interface { // Dictionary returns the ArrayData object for the dictionary if this is a // dictionary array, otherwise it will be nil. Dictionary() ArrayData + // SizeInBytes returns the size of the ArrayData buffers and any children and/or dictionary in bytes. + SizeInBytes() uint64 } // Array represents an immutable sequence of values using the Arrow in-memory format.
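For reference, the corrected C Data Interface format strings above (``+vl`` for list-view, ``+vL`` for large list-view) can be decoded with a small lookup. The sketch below only uses the strings shown in the table; the function name and the fallback message are assumptions for illustration.

.. code-block:: cpp

   #include <string>

   // Hypothetical helper mapping the nested-type format strings from the
   // C Data Interface table to human-readable names.
   std::string DescribeNestedFormat(const std::string& format) {
     if (format == "+l") return "list";
     if (format == "+L") return "large list";
     if (format == "+vl") return "list-view";        // corrected from "+lv"
     if (format == "+vL") return "large list-view";  // corrected from "+Lv"
     if (format.rfind("+w:", 0) == 0) {              // e.g. "+w:123"
       return "fixed-size list [" + format.substr(3) + " items]";
     }
     return "unrecognized nested format";
   }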
diff --git a/go/arrow/array/data.go b/go/arrow/array/data.go index 8cce49182b879..3c859ec30bc78 100644 --- a/go/arrow/array/data.go +++ b/go/arrow/array/data.go @@ -190,9 +190,34 @@ func (d *Data) SetDictionary(dict arrow.ArrayData) { } } +// SizeInBytes returns the size of the Data and any children and/or dictionary in bytes by +// recursively examining the nested structures of children and/or dictionary. +// The value returned is an upper-bound since offset is not taken into account. +func (d *Data) SizeInBytes() uint64 { + var size uint64 + + if d == nil { + return 0 + } + + for _, b := range d.Buffers() { + size += uint64(b.Len()) + } + for _, c := range d.Children() { + size += c.SizeInBytes() + } + if dict := d.Dictionary(); dict != nil { + size += dict.SizeInBytes() + } + + return size +} + // NewSliceData returns a new slice that shares backing data with the input. // The returned Data slice starts at i and extends j-i elements, such as: -// slice := data[i:j] +// +// slice := data[i:j] +// // The returned value must be Release'd after use. // // NewSliceData panics if the slice is outside the valid range of the input Data. diff --git a/go/arrow/array/data_test.go b/go/arrow/array/data_test.go index b7b0f396470d7..dd4793a7cdbfa 100644 --- a/go/arrow/array/data_test.go +++ b/go/arrow/array/data_test.go @@ -49,3 +49,78 @@ func TestDataReset(t *testing.T) { data.Reset(&arrow.Int64Type{}, 5, data.Buffers(), nil, 1, 2) } } + +func TestSizeInBytes(t *testing.T) { + var buffers1 = make([]*memory.Buffer, 0, 3) + + for i := 0; i < cap(buffers1); i++ { + buffers1 = append(buffers1, memory.NewBufferBytes([]byte("15-bytes-buffer"))) + } + data := NewData(&arrow.StringType{}, 10, buffers1, nil, 0, 0) + var arrayData arrow.ArrayData = data + dataWithChild := NewData(&arrow.StringType{}, 10, buffers1, []arrow.ArrayData{arrayData}, 0, 0) + + t.Run("buffers only", func(t *testing.T) { + expectedSize := uint64(45) + if actualSize := data.SizeInBytes(); actualSize != expectedSize { + t.Errorf("expected size %d, got %d", expectedSize, actualSize) + } + }) + + t.Run("buffers and child data", func(t *testing.T) { + // 45 bytes in buffers, 45 bytes in child data + expectedSize := uint64(90) + if actualSize := dataWithChild.SizeInBytes(); actualSize != expectedSize { + t.Errorf("expected size %d, got %d", expectedSize, actualSize) + } + }) + + t.Run("buffers and nested child data", func(t *testing.T) { + var dataWithChildArrayData arrow.ArrayData = dataWithChild + var dataWithNestedChild arrow.ArrayData = NewData(&arrow.StringType{}, 10, buffers1, []arrow.ArrayData{dataWithChildArrayData}, 0, 0) + // 45 bytes in buffers, 90 bytes in nested child data + expectedSize := uint64(135) + if actualSize := dataWithNestedChild.SizeInBytes(); actualSize != expectedSize { + t.Errorf("expected size %d, got %d", expectedSize, actualSize) + } + }) + + t.Run("buffers and dictionary", func(t *testing.T) { + dictData := data + dataWithDict := NewDataWithDictionary(&arrow.StringType{}, 10, buffers1, 0, 0, dictData) + // 45 bytes in buffers, 45 bytes in dictionary + expectedSize := uint64(90) + if actualSize := dataWithDict.SizeInBytes(); actualSize != expectedSize { + t.Errorf("expected size %d, got %d", expectedSize, actualSize) + } + }) + + t.Run("sliced data", func(t *testing.T) { + sliceData := NewSliceData(arrayData, 3, 5) + // offset is not taken into account in SizeInBytes() + expectedSize := uint64(45) + if actualSize := sliceData.SizeInBytes(); actualSize != expectedSize { + t.Errorf("expected size %d, got %d", 
expectedSize, actualSize) + } + }) + + t.Run("sliced data with children", func(t *testing.T) { + var dataWithChildArrayData arrow.ArrayData = dataWithChild + sliceData := NewSliceData(dataWithChildArrayData, 3, 5) + // offset is not taken into account in SizeInBytes() + expectedSize := uint64(90) + if actualSize := sliceData.SizeInBytes(); actualSize != expectedSize { + t.Errorf("expected size %d, got %d", expectedSize, actualSize) + } + }) + + t.Run("buffers with children which are sliced data", func(t *testing.T) { + sliceData := NewSliceData(arrayData, 3, 5) + dataWithSlicedChildren := NewData(&arrow.StringType{}, 10, buffers1, []arrow.ArrayData{sliceData}, 0, 0) + // offset is not taken into account in SizeInBytes() + expectedSize := uint64(90) + if actualSize := dataWithSlicedChildren.SizeInBytes(); actualSize != expectedSize { + t.Errorf("expected size %d, got %d", expectedSize, actualSize) + } + }) +} diff --git a/js/src/enum.ts b/js/src/enum.ts index 4e207dd37cec1..2a82dd4235c51 100644 --- a/js/src/enum.ts +++ b/js/src/enum.ts @@ -21,7 +21,7 @@ // v4 doesn't seem to be able to tree-shake the rest of those exports. // // We will have to keep these enums in sync when we re-generate the flatbuffers -// code from the shchemas. See js/DEVELOP.md for info on how to run flatbuffers +// code from the schemas. See js/DEVELOP.md for info on how to run flatbuffers // code generation. // //// @@ -174,7 +174,7 @@ export enum Type { FixedSizeBinary = 15, /** Fixed-size binary. Each value occupies the same number of bytes */ FixedSizeList = 16, /** Fixed-size list. Each value occupies the same number of bytes */ Map = 17, /** Map of named logical types */ - Duration = 18, /** Measure of elapsed time in either seconds, miliseconds, microseconds or nanoseconds. */ + Duration = 18, /** Measure of elapsed time in either seconds, milliseconds, microseconds or nanoseconds. */ Dictionary = -1, /** Dictionary aka Category type */ Int8 = -2, @@ -215,7 +215,7 @@ export enum BufferType { OFFSET = 0, /** - * actual data, either wixed width primitive types in slots or variable width delimited by an OFFSET vector + * actual data, either fixed width primitive types in slots or variable width delimited by an OFFSET vector */ DATA = 1, diff --git a/js/src/fb/timestamp.ts b/js/src/fb/timestamp.ts index 9c391802e89c9..636a83882a6d9 100644 --- a/js/src/fb/timestamp.ts +++ b/js/src/fb/timestamp.ts @@ -105,7 +105,7 @@ import { TimeUnit } from './time-unit.js'; * no indication of how to map this information to a physical point in time. * Naive date-times must be handled with care because of this missing * information, and also because daylight saving time (DST) may make - * some values ambiguous or non-existent. A naive date-time may be + * some values ambiguous or nonexistent. A naive date-time may be * stored as a struct with Date and Time fields. However, it may also be * encoded into a Timestamp column with an empty timezone. The timestamp * values should be computed "as if" the timezone of the date-time values diff --git a/js/src/ipc/reader.ts b/js/src/ipc/reader.ts index b1ad5248d6158..e4dac0606aa47 100644 --- a/js/src/ipc/reader.ts +++ b/js/src/ipc/reader.ts @@ -185,7 +185,7 @@ export class RecordBatchReader extends ReadableInterop< // // Since TS is a structural type system, we define the following subclass stubs -// so that concrete types exist to associate with with the interfaces below. +// so that concrete types exist to associate with the interfaces below. 
// // The implementation for each RecordBatchReader is hidden away in the set of // `RecordBatchReaderImpl` classes in the second half of this file. This allows diff --git a/js/src/vector.ts b/js/src/vector.ts index 318ce06e5c3c0..8c9a3da66c92c 100644 --- a/js/src/vector.ts +++ b/js/src/vector.ts @@ -302,8 +302,8 @@ export class Vector { * values. * * Memoization is very useful when decoding a value is expensive such as - * Uft8. The memoization creates a cache of the size of the Vector and - * therfore increases memory usage. + * Utf8. The memoization creates a cache of the size of the Vector and + * therefore increases memory usage. * * @returns A new vector that memoizes calls to {@link get}. */ diff --git a/js/src/visitor/builderctor.ts b/js/src/visitor/builderctor.ts index 2d20f2a8efd5c..54b5610a50eed 100644 --- a/js/src/visitor/builderctor.ts +++ b/js/src/visitor/builderctor.ts @@ -96,7 +96,7 @@ export class GetBuilderCtor extends Visitor { public visitDurationSecond() { return DurationSecondBuilder; } public visitDurationMillisecond() { return DurationMillisecondBuilder; } public visitDurationMicrosecond() { return DurationMicrosecondBuilder; } - public visistDurationNanosecond() { return DurationNanosecondBuilder; } + public visitDurationNanosecond() { return DurationNanosecondBuilder; } public visitFixedSizeList() { return FixedSizeListBuilder; } public visitMap() { return MapBuilder; } } diff --git a/js/src/visitor/indexof.ts b/js/src/visitor/indexof.ts index 28dcff20d3bd3..4cf0076b3c8e2 100644 --- a/js/src/visitor/indexof.ts +++ b/js/src/visitor/indexof.ts @@ -144,7 +144,7 @@ function indexOfValue(data: Data, searchElement?: T['TVal function indexOfUnion(data: Data, searchElement?: T['TValue'] | null, fromIndex?: number): number { // Unions are special -- they do have a nullBitmap, but so can their children. // If the searchElement is null, we don't know whether it came from the Union's - // bitmap or one of its childrens'. So we don't interrogate the Union's bitmap, + // bitmap or one of its children's. So we don't interrogate the Union's bitmap, // since that will report the wrong index if a child has a null before the Union. const get = getVisitor.getVisitFn(data); const compare = createElementComparator(searchElement); diff --git a/js/src/visitor/jsonvectorassembler.ts b/js/src/visitor/jsonvectorassembler.ts index 55a6b4e2ea390..0af954e4adacc 100644 --- a/js/src/visitor/jsonvectorassembler.ts +++ b/js/src/visitor/jsonvectorassembler.ts @@ -62,9 +62,9 @@ export class JSONVectorAssembler extends Visitor { /** @nocollapse */ public static assemble(...batches: T[]) { - const assemlber = new JSONVectorAssembler(); + const assembler = new JSONVectorAssembler(); return batches.map(({ schema, data }) => { - return assemlber.visitMany(schema.fields, data.children); + return assembler.visitMany(schema.fields, data.children); }); } diff --git a/r/DESCRIPTION b/r/DESCRIPTION index 3a36a808ab1d0..1bf25e57a3cce 100644 --- a/r/DESCRIPTION +++ b/r/DESCRIPTION @@ -1,6 +1,6 @@ Package: arrow Title: Integration to 'Apache' 'Arrow' -Version: 14.0.0.9000 +Version: 14.0.1.9000 Authors@R: c( person("Neal", "Richardson", email = "neal.p.richardson@gmail.com", role = c("aut")), person("Ian", "Cook", email = "ianmcook@gmail.com", role = c("aut")), diff --git a/r/NEWS.md b/r/NEWS.md index e00c6b51b597d..c337e907a4213 100644 --- a/r/NEWS.md +++ b/r/NEWS.md @@ -17,7 +17,17 @@ under the License. 
--> -# arrow 14.0.0.9000 +# arrow 14.0.1.9000 + +# arrow 14.0.1 + +# arrow 14.0.0.1 + +## Minor improvements and fixes + +* Add more debug output for build failures (#38819) +* Increase timeout during static library download (#38767) +* Fix bug where rosetta detection was causing installation failure (#38754) # arrow 14.0.0 diff --git a/r/pkgdown/assets/versions.json b/r/pkgdown/assets/versions.json index 615a84511fca9..424a5fbc5174a 100644 --- a/r/pkgdown/assets/versions.json +++ b/r/pkgdown/assets/versions.json @@ -1,12 +1,16 @@ [ { - "name": "14.0.0.9000 (dev)", + "name": "14.0.1.9000 (dev)", "version": "dev/" }, { - "name": "14.0.0 (release)", + "name": "14.0.1 (release)", "version": "" }, + { + "name": "14.0.0", + "version": "14.0/" + }, { "name": "13.0.0.1", "version": "13.0/" diff --git a/r/src/altrep.cpp b/r/src/altrep.cpp index ae435d54d6cbb..9bacf07d1840e 100644 --- a/r/src/altrep.cpp +++ b/r/src/altrep.cpp @@ -152,12 +152,15 @@ struct AltrepVectorBase { const char* class_name = CHAR(PRINTNAME(data_class_sym)); if (IsMaterialized(alt)) { - Rprintf("materialized %s len=%d\n", class_name, Rf_xlength(Representation(alt))); + Rprintf("materialized %s len=%ld\n", class_name, + static_cast<long>(Rf_xlength(Representation(alt)))); // NOLINT: runtime/int } else { const auto& chunked_array = GetChunkedArray(alt); - Rprintf("%s<%p, %s, %d chunks, %d nulls> len=%d\n", class_name, chunked_array.get(), + Rprintf("%s<%p, %s, %d chunks, %ld nulls> len=%ld\n", class_name, + reinterpret_cast<void*>(chunked_array.get()), chunked_array->type()->ToString().c_str(), chunked_array->num_chunks(), - chunked_array->null_count(), chunked_array->length()); + static_cast<long>(chunked_array->null_count()), // NOLINT: runtime/int + static_cast<long>(chunked_array->length())); // NOLINT: runtime/int } return TRUE; @@ -819,7 +822,7 @@ struct AltrepVectorString : public AltrepVectorBase> { "'; to strip nuls when converting from Arrow to R, set options(arrow.skip_nul " "= TRUE)"; - Rf_error(stripped_string_.c_str()); + Rf_error("%s", stripped_string_.c_str()); } void SetArray(const std::shared_ptr& array) {
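The ``altrep.cpp`` changes above apply two standard printf-family rules: never pass externally derived text as the format string itself (hence ``Rf_error("%s", ...)``), and give ``%ld`` an argument that is actually a ``long`` (hence the explicit casts). A standalone sketch of the same pattern using only the C standard library follows; the message text and length value are made up for illustration.

.. code-block:: cpp

   #include <cstdint>
   #include <cstdio>
   #include <string>

   int main() {
     // A message that happens to contain '%', as user-provided text easily can.
     std::string message = "invalid value '100%' in column 'rate'";

     // Unsafe pattern (what the old Rf_error call did): the message would be
     // interpreted as a format string, so '%' directives read garbage arguments.
     // std::printf(message.c_str());

     // Safe pattern, mirroring Rf_error("%s", ...): the message is plain data.
     std::printf("%s\n", message.c_str());

     // Matching "%ld": 64-bit counts are cast explicitly so the argument type
     // agrees with the format directive, as in the Rprintf calls above.
     int64_t length = 123456789;
     std::printf("len=%ld\n", static_cast<long>(length));
     return 0;
   }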