From 9cece9dc3e63956a8cbfc125026a17eb3a7ae3dc Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Mon, 27 Nov 2023 17:21:59 -0400 Subject: [PATCH 01/10] GH-38893: [R] Fix printf syntax in altrep.cpp (#38894) ### Rationale for this change We have CI errors and CRAN check errors on R-devel, where the appropriate attribute for printf format checking was just added. ### What changes are included in this PR? The appropriate types are now used for printf parameters. ### Are these changes tested? Covered by existing tests ### Are there any user-facing changes? No * Closes: #38893 Authored-by: Dewey Dunnington Signed-off-by: Jacob Wujciak-Jens --- r/src/altrep.cpp | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/r/src/altrep.cpp b/r/src/altrep.cpp index ae435d54d6cbb..9bacf07d1840e 100644 --- a/r/src/altrep.cpp +++ b/r/src/altrep.cpp @@ -152,12 +152,15 @@ struct AltrepVectorBase { const char* class_name = CHAR(PRINTNAME(data_class_sym)); if (IsMaterialized(alt)) { - Rprintf("materialized %s len=%d\n", class_name, Rf_xlength(Representation(alt))); + Rprintf("materialized %s len=%ld\n", class_name, + static_cast(Rf_xlength(Representation(alt)))); // NOLINT: runtime/int } else { const auto& chunked_array = GetChunkedArray(alt); - Rprintf("%s<%p, %s, %d chunks, %d nulls> len=%d\n", class_name, chunked_array.get(), + Rprintf("%s<%p, %s, %d chunks, %ld nulls> len=%ld\n", class_name, + reinterpret_cast(chunked_array.get()), chunked_array->type()->ToString().c_str(), chunked_array->num_chunks(), - chunked_array->null_count(), chunked_array->length()); + static_cast(chunked_array->null_count()), // NOLINT: runtime/int + static_cast(chunked_array->length())); // NOLINT: runtime/int } return TRUE; @@ -819,7 +822,7 @@ struct AltrepVectorString : public AltrepVectorBase> { "'; to strip nuls when converting from Arrow to R, set options(arrow.skip_nul " "= TRUE)"; - Rf_error(stripped_string_.c_str()); + Rf_error("%s", stripped_string_.c_str()); } void 
SetArray(const std::shared_ptr& array) { From bf79d6e19b63f9144a13e7fe4911e97dab99a566 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Mon, 27 Nov 2023 22:00:12 +0000 Subject: [PATCH 02/10] GH-38864: [R] Update NEWS.md for 14.0.0.1 (#38866) ### What changes are included in this PR? Update NEWS file in R package for 14.0.0.1 ### Are these changes tested? No ### Are there any user-facing changes? No * Closes: #38864 Authored-by: Nic Crane Signed-off-by: Nic Crane --- r/NEWS.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/r/NEWS.md b/r/NEWS.md index e00c6b51b597d..7dbf8a7767c2d 100644 --- a/r/NEWS.md +++ b/r/NEWS.md @@ -19,6 +19,14 @@ # arrow 14.0.0.9000 +# arrow 14.0.0.1 + +## Minor improvements and fixes + +* Add more debug output for build failures (#38819) +* Increase timeout during static library download (#38767) +* Fix bug where rosetta detection was causing installation failure (#38754) + # arrow 14.0.0 ## New features From 63353baf1cda1d1fc7bb614ce01558c12990e073 Mon Sep 17 00:00:00 2001 From: Josh Soref <2119212+jsoref@users.noreply.github.com> Date: Mon, 27 Nov 2023 17:45:22 -0500 Subject: [PATCH 03/10] GH-38900:[JS] Fix spelling (#38901) # ### Rationale for this change ### What changes are included in this PR? Spelling fixes to js/ ### Are these changes tested? ### Are there any user-facing changes? * ~Closes~: #38900 * Closes: #38900 --------- Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com> --- js/src/enum.ts | 6 +++--- js/src/fb/timestamp.ts | 2 +- js/src/ipc/reader.ts | 2 +- js/src/vector.ts | 4 ++-- js/src/visitor/builderctor.ts | 2 +- js/src/visitor/indexof.ts | 2 +- js/src/visitor/jsonvectorassembler.ts | 4 ++-- 7 files changed, 11 insertions(+), 11 deletions(-) diff --git a/js/src/enum.ts b/js/src/enum.ts index 4e207dd37cec1..2a82dd4235c51 100644 --- a/js/src/enum.ts +++ b/js/src/enum.ts @@ -21,7 +21,7 @@ // v4 doesn't seem to be able to tree-shake the rest of those exports. 
// // We will have to keep these enums in sync when we re-generate the flatbuffers -// code from the shchemas. See js/DEVELOP.md for info on how to run flatbuffers +// code from the schemas. See js/DEVELOP.md for info on how to run flatbuffers // code generation. // //// @@ -174,7 +174,7 @@ export enum Type { FixedSizeBinary = 15, /** Fixed-size binary. Each value occupies the same number of bytes */ FixedSizeList = 16, /** Fixed-size list. Each value occupies the same number of bytes */ Map = 17, /** Map of named logical types */ - Duration = 18, /** Measure of elapsed time in either seconds, miliseconds, microseconds or nanoseconds. */ + Duration = 18, /** Measure of elapsed time in either seconds, milliseconds, microseconds or nanoseconds. */ Dictionary = -1, /** Dictionary aka Category type */ Int8 = -2, @@ -215,7 +215,7 @@ export enum BufferType { OFFSET = 0, /** - * actual data, either wixed width primitive types in slots or variable width delimited by an OFFSET vector + * actual data, either fixed width primitive types in slots or variable width delimited by an OFFSET vector */ DATA = 1, diff --git a/js/src/fb/timestamp.ts b/js/src/fb/timestamp.ts index 9c391802e89c9..636a83882a6d9 100644 --- a/js/src/fb/timestamp.ts +++ b/js/src/fb/timestamp.ts @@ -105,7 +105,7 @@ import { TimeUnit } from './time-unit.js'; * no indication of how to map this information to a physical point in time. * Naive date-times must be handled with care because of this missing * information, and also because daylight saving time (DST) may make - * some values ambiguous or non-existent. A naive date-time may be + * some values ambiguous or nonexistent. A naive date-time may be * stored as a struct with Date and Time fields. However, it may also be * encoded into a Timestamp column with an empty timezone. 
The timestamp * values should be computed "as if" the timezone of the date-time values diff --git a/js/src/ipc/reader.ts b/js/src/ipc/reader.ts index b1ad5248d6158..e4dac0606aa47 100644 --- a/js/src/ipc/reader.ts +++ b/js/src/ipc/reader.ts @@ -185,7 +185,7 @@ export class RecordBatchReader extends ReadableInterop< // // Since TS is a structural type system, we define the following subclass stubs -// so that concrete types exist to associate with with the interfaces below. +// so that concrete types exist to associate with the interfaces below. // // The implementation for each RecordBatchReader is hidden away in the set of // `RecordBatchReaderImpl` classes in the second half of this file. This allows diff --git a/js/src/vector.ts b/js/src/vector.ts index 318ce06e5c3c0..8c9a3da66c92c 100644 --- a/js/src/vector.ts +++ b/js/src/vector.ts @@ -302,8 +302,8 @@ export class Vector { * values. * * Memoization is very useful when decoding a value is expensive such as - * Uft8. The memoization creates a cache of the size of the Vector and - * therfore increases memory usage. + * Utf8. The memoization creates a cache of the size of the Vector and + * therefore increases memory usage. * * @returns A new vector that memoizes calls to {@link get}. 
*/ diff --git a/js/src/visitor/builderctor.ts b/js/src/visitor/builderctor.ts index 2d20f2a8efd5c..54b5610a50eed 100644 --- a/js/src/visitor/builderctor.ts +++ b/js/src/visitor/builderctor.ts @@ -96,7 +96,7 @@ export class GetBuilderCtor extends Visitor { public visitDurationSecond() { return DurationSecondBuilder; } public visitDurationMillisecond() { return DurationMillisecondBuilder; } public visitDurationMicrosecond() { return DurationMicrosecondBuilder; } - public visistDurationNanosecond() { return DurationNanosecondBuilder; } + public visitDurationNanosecond() { return DurationNanosecondBuilder; } public visitFixedSizeList() { return FixedSizeListBuilder; } public visitMap() { return MapBuilder; } } diff --git a/js/src/visitor/indexof.ts b/js/src/visitor/indexof.ts index 28dcff20d3bd3..4cf0076b3c8e2 100644 --- a/js/src/visitor/indexof.ts +++ b/js/src/visitor/indexof.ts @@ -144,7 +144,7 @@ function indexOfValue(data: Data, searchElement?: T['TVal function indexOfUnion(data: Data, searchElement?: T['TValue'] | null, fromIndex?: number): number { // Unions are special -- they do have a nullBitmap, but so can their children. // If the searchElement is null, we don't know whether it came from the Union's - // bitmap or one of its childrens'. So we don't interrogate the Union's bitmap, + // bitmap or one of its children's. So we don't interrogate the Union's bitmap, // since that will report the wrong index if a child has a null before the Union. 
const get = getVisitor.getVisitFn(data); const compare = createElementComparator(searchElement); diff --git a/js/src/visitor/jsonvectorassembler.ts b/js/src/visitor/jsonvectorassembler.ts index 55a6b4e2ea390..0af954e4adacc 100644 --- a/js/src/visitor/jsonvectorassembler.ts +++ b/js/src/visitor/jsonvectorassembler.ts @@ -62,9 +62,9 @@ export class JSONVectorAssembler extends Visitor { /** @nocollapse */ public static assemble(...batches: T[]) { - const assemlber = new JSONVectorAssembler(); + const assembler = new JSONVectorAssembler(); return batches.map(({ schema, data }) => { - return assemlber.visitMany(schema.fields, data.children); + return assembler.visitMany(schema.fields, data.children); }); } From 7cdb768bd2506c54a5befbdd92609aaa3208f4c9 Mon Sep 17 00:00:00 2001 From: Stas Stepanov <78556261+stfdxv@users.noreply.github.com> Date: Tue, 28 Nov 2023 10:35:23 +0300 Subject: [PATCH 04/10] MINOR: [Docs] Replace "have" with "indicate" in the "Struct validity" section of the docs (#38895) ### Rationale for this change This documentation [section](https://arrow.apache.org/docs/format/Columnar.html#struct-validity) says: `...a child array might have a null in its validity bitmap...` But an array can't have a null in its validity bitmap. It can only indicate a null with a bit equal to 0 in its validity bitmap. ### What changes are included in this PR? Rewording ### Are these changes tested? Renders normally ### Are there any user-facing changes? Changes documentation only. 
Authored-by: Stas Stepanov <78556261+stfdxv@users.noreply.github.com> Signed-off-by: Sutou Kouhei --- docs/source/format/Columnar.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/format/Columnar.rst b/docs/source/format/Columnar.rst index 3f8cd946292ea..9bdee37d18048 100644 --- a/docs/source/format/Columnar.rst +++ b/docs/source/format/Columnar.rst @@ -715,7 +715,7 @@ A struct array has its own validity bitmap that is independent of its child arrays' validity bitmaps. The validity bitmap for the struct array might indicate a null when one or more of its child arrays has a non-null value in its corresponding slot; or conversely, a child -array might have a null in its validity bitmap while the struct array's +array might indicate a null in its validity bitmap while the struct array's validity bitmap shows a non-null value. Therefore, to know whether a particular child entry is valid, one must From c614014db805caf3bbc4d5270bd990c095bf8725 Mon Sep 17 00:00:00 2001 From: mwish Date: Tue, 28 Nov 2023 18:26:23 +0800 Subject: [PATCH 05/10] GH-38874: [C++][Parquet] Minor: making parquet TypedComparator operation as const method (#38875) ### Rationale for this change The methods of `parquet::TypedComparator` are not const, but they should be ### What changes are included in this PR? Change `Compare`, `GetMinMax`, `GetMinMaxSpaced` to const ### Are these changes tested? No ### Are there any user-facing changes? 
No * Closes: #38874 Authored-by: mwish Signed-off-by: mwish --- cpp/src/parquet/statistics.cc | 14 +++++++------- cpp/src/parquet/statistics.h | 8 ++++---- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/cpp/src/parquet/statistics.cc b/cpp/src/parquet/statistics.cc index 37b245e0dd6c2..e54b94f1a861a 100644 --- a/cpp/src/parquet/statistics.cc +++ b/cpp/src/parquet/statistics.cc @@ -438,9 +438,9 @@ class TypedComparatorImpl return Helper::Compare(type_length_, a, b); } - bool Compare(const T& a, const T& b) override { return CompareInline(a, b); } + bool Compare(const T& a, const T& b) const override { return CompareInline(a, b); } - std::pair GetMinMax(const T* values, int64_t length) override { + std::pair GetMinMax(const T* values, int64_t length) const override { DCHECK_GT(length, 0); T min = Helper::DefaultMin(); @@ -457,7 +457,7 @@ class TypedComparatorImpl std::pair GetMinMaxSpaced(const T* values, int64_t length, const uint8_t* valid_bits, - int64_t valid_bits_offset) override { + int64_t valid_bits_offset) const override { DCHECK_GT(length, 0); T min = Helper::DefaultMin(); @@ -477,7 +477,7 @@ class TypedComparatorImpl return {min, max}; } - std::pair GetMinMax(const ::arrow::Array& values) override { + std::pair GetMinMax(const ::arrow::Array& values) const override { ParquetException::NYI(values.type()->ToString()); } @@ -491,7 +491,7 @@ class TypedComparatorImpl template <> std::pair TypedComparatorImpl::GetMinMax(const int32_t* values, - int64_t length) { + int64_t length) const { DCHECK_GT(length, 0); const uint32_t* unsigned_values = reinterpret_cast(values); @@ -537,13 +537,13 @@ std::pair GetMinMaxBinaryHelper( template <> std::pair TypedComparatorImpl::GetMinMax( - const ::arrow::Array& values) { + const ::arrow::Array& values) const { return GetMinMaxBinaryHelper(*this, values); } template <> std::pair TypedComparatorImpl::GetMinMax( - const ::arrow::Array& values) { + const ::arrow::Array& values) const { return 
GetMinMaxBinaryHelper(*this, values); } diff --git a/cpp/src/parquet/statistics.h b/cpp/src/parquet/statistics.h index ae6c1ca29b2f6..6730e6bcdc1e0 100644 --- a/cpp/src/parquet/statistics.h +++ b/cpp/src/parquet/statistics.h @@ -73,16 +73,16 @@ class TypedComparator : public Comparator { /// \brief Scalar comparison of two elements, return true if first /// is strictly less than the second - virtual bool Compare(const T& a, const T& b) = 0; + virtual bool Compare(const T& a, const T& b) const = 0; /// \brief Compute maximum and minimum elements in a batch of /// elements without any nulls - virtual std::pair GetMinMax(const T* values, int64_t length) = 0; + virtual std::pair GetMinMax(const T* values, int64_t length) const = 0; /// \brief Compute minimum and maximum elements from an Arrow array. Only /// valid for certain Parquet Type / Arrow Type combinations, like BYTE_ARRAY /// / arrow::BinaryArray - virtual std::pair GetMinMax(const ::arrow::Array& values) = 0; + virtual std::pair GetMinMax(const ::arrow::Array& values) const = 0; /// \brief Compute maximum and minimum elements in a batch of /// elements with accompanying bitmap indicating which elements are @@ -96,7 +96,7 @@ class TypedComparator : public Comparator { /// the first element in the sequence virtual std::pair GetMinMaxSpaced(const T* values, int64_t length, const uint8_t* valid_bits, - int64_t valid_bits_offset) = 0; + int64_t valid_bits_offset) const = 0; }; /// \brief Typed version of Comparator::Make From a9c99cfe362fc7d450711fed82f74e565f381337 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Cumplido?= Date: Tue, 28 Nov 2023 15:06:42 +0100 Subject: [PATCH 06/10] MINOR: [Release] Update versions for 15.0.0-SNAPSHOT --- ci/scripts/PKGBUILD | 2 +- docs/source/_static/versions.json | 5 +++++ r/DESCRIPTION | 2 +- r/NEWS.md | 4 +++- r/pkgdown/assets/versions.json | 8 ++++++-- 5 files changed, 16 insertions(+), 5 deletions(-) diff --git a/ci/scripts/PKGBUILD b/ci/scripts/PKGBUILD index 
95029d98f7a01..2cdd1d42634bf 100644 --- a/ci/scripts/PKGBUILD +++ b/ci/scripts/PKGBUILD @@ -18,7 +18,7 @@ _realname=arrow pkgbase=mingw-w64-${_realname} pkgname="${MINGW_PACKAGE_PREFIX}-${_realname}" -pkgver=14.0.0.9000 +pkgver=14.0.1.9000 pkgrel=8000 pkgdesc="Apache Arrow is a cross-language development platform for in-memory data (mingw-w64)" arch=("any") diff --git a/docs/source/_static/versions.json b/docs/source/_static/versions.json index 10e179420b803..56411de862096 100644 --- a/docs/source/_static/versions.json +++ b/docs/source/_static/versions.json @@ -10,6 +10,11 @@ "url": "https://arrow.apache.org/docs/", "preferred": true }, + { + "name": "14.0", + "version": "14.0/", + "url": "https://arrow.apache.org/docs/14.0/" + }, { "name": "13.0", "version": "13.0/", diff --git a/r/DESCRIPTION b/r/DESCRIPTION index 3a36a808ab1d0..1bf25e57a3cce 100644 --- a/r/DESCRIPTION +++ b/r/DESCRIPTION @@ -1,6 +1,6 @@ Package: arrow Title: Integration to 'Apache' 'Arrow' -Version: 14.0.0.9000 +Version: 14.0.1.9000 Authors@R: c( person("Neal", "Richardson", email = "neal.p.richardson@gmail.com", role = c("aut")), person("Ian", "Cook", email = "ianmcook@gmail.com", role = c("aut")), diff --git a/r/NEWS.md b/r/NEWS.md index 7dbf8a7767c2d..c337e907a4213 100644 --- a/r/NEWS.md +++ b/r/NEWS.md @@ -17,7 +17,9 @@ under the License. 
--> -# arrow 14.0.0.9000 +# arrow 14.0.1.9000 + +# arrow 14.0.1 # arrow 14.0.0.1 diff --git a/r/pkgdown/assets/versions.json b/r/pkgdown/assets/versions.json index 615a84511fca9..424a5fbc5174a 100644 --- a/r/pkgdown/assets/versions.json +++ b/r/pkgdown/assets/versions.json @@ -1,12 +1,16 @@ [ { - "name": "14.0.0.9000 (dev)", + "name": "14.0.1.9000 (dev)", "version": "dev/" }, { - "name": "14.0.0 (release)", + "name": "14.0.1 (release)", "version": "" }, + { + "name": "14.0.0", + "version": "14.0/" + }, { "name": "13.0.0.1", "version": "13.0/" From 8b7897a0f05321c97de13c830975e85e387b6315 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Cumplido?= Date: Tue, 28 Nov 2023 15:06:43 +0100 Subject: [PATCH 07/10] MINOR: [Release] Update .deb/.rpm changelogs for 14.0.1 --- .../linux-packages/apache-arrow-apt-source/debian/changelog | 6 ++++++ .../apache-arrow-release/yum/apache-arrow-release.spec.in | 3 +++ dev/tasks/linux-packages/apache-arrow/debian/changelog | 6 ++++++ dev/tasks/linux-packages/apache-arrow/yum/arrow.spec.in | 3 +++ 4 files changed, 18 insertions(+) diff --git a/dev/tasks/linux-packages/apache-arrow-apt-source/debian/changelog b/dev/tasks/linux-packages/apache-arrow-apt-source/debian/changelog index 221fb0caa8952..83a388c93051d 100644 --- a/dev/tasks/linux-packages/apache-arrow-apt-source/debian/changelog +++ b/dev/tasks/linux-packages/apache-arrow-apt-source/debian/changelog @@ -1,3 +1,9 @@ +apache-arrow-apt-source (14.0.1-1) unstable; urgency=low + + * New upstream release. + + -- Raúl Cumplido Mon, 06 Nov 2023 22:23:27 -0000 + apache-arrow-apt-source (14.0.0-1) unstable; urgency=low * New upstream release. 
diff --git a/dev/tasks/linux-packages/apache-arrow-release/yum/apache-arrow-release.spec.in b/dev/tasks/linux-packages/apache-arrow-release/yum/apache-arrow-release.spec.in index 273bf32a2a8e4..245e8afeaeb1d 100644 --- a/dev/tasks/linux-packages/apache-arrow-release/yum/apache-arrow-release.spec.in +++ b/dev/tasks/linux-packages/apache-arrow-release/yum/apache-arrow-release.spec.in @@ -102,6 +102,9 @@ else fi %changelog +* Mon Nov 06 2023 Raúl Cumplido - 14.0.1-1 +- New upstream release. + * Thu Oct 19 2023 Raúl Cumplido - 14.0.0-1 - New upstream release. diff --git a/dev/tasks/linux-packages/apache-arrow/debian/changelog b/dev/tasks/linux-packages/apache-arrow/debian/changelog index 5e01d962c44d4..1f3f1bd5abd07 100644 --- a/dev/tasks/linux-packages/apache-arrow/debian/changelog +++ b/dev/tasks/linux-packages/apache-arrow/debian/changelog @@ -1,3 +1,9 @@ +apache-arrow (14.0.1-1) unstable; urgency=low + + * New upstream release. + + -- Raúl Cumplido Mon, 06 Nov 2023 22:23:27 -0000 + apache-arrow (14.0.0-1) unstable; urgency=low * New upstream release. diff --git a/dev/tasks/linux-packages/apache-arrow/yum/arrow.spec.in b/dev/tasks/linux-packages/apache-arrow/yum/arrow.spec.in index f61d47db2edd7..87e05558e8cda 100644 --- a/dev/tasks/linux-packages/apache-arrow/yum/arrow.spec.in +++ b/dev/tasks/linux-packages/apache-arrow/yum/arrow.spec.in @@ -864,6 +864,9 @@ Documentation for Apache Parquet GLib. %{_datadir}/gtk-doc/html/parquet-glib/ %changelog +* Mon Nov 06 2023 Raúl Cumplido - 14.0.1-1 +- New upstream release. + * Thu Oct 19 2023 Raúl Cumplido - 14.0.0-1 - New upstream release. 
From 427112a4936e8cb27f45a683b7ae17812ddaf0c1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Cumplido?= Date: Tue, 28 Nov 2023 08:11:17 -0600 Subject: [PATCH 08/10] MINOR: [Release][Docs] Document how to avoid default pushing to main when running bump-versions (#38672) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### Rationale for this change This is the second time I've had to revert some commits after bumping versions. Usually happened on patch versions. This documentation is a reminder to the release manager to avoid default pushing so we can review changes locally before. ### What changes are included in this PR? Just a minor note on the documentation ### Are these changes tested? Does not apply. ### Are there any user-facing changes? No Authored-by: Raúl Cumplido Signed-off-by: Raúl Cumplido --- docs/source/developers/release.rst | 3 +++ 1 file changed, 3 insertions(+) diff --git a/docs/source/developers/release.rst b/docs/source/developers/release.rst index 6924c2d714e8b..5b7726f58d2d0 100644 --- a/docs/source/developers/release.rst +++ b/docs/source/developers/release.rst @@ -567,6 +567,9 @@ Be sure to go through on the following checklist: .. code-block:: Bash + # You can run the script with BUMP_TAG=0 and BUMP_PUSH=0 + # this will avoid default pushing to main and pushing the tag + # but you will require to push manually after reviewing the commits. # dev/release/post-11-bump-versions.sh 10.0.0 11.0.0 dev/release/post-11-bump-versions.sh X.Y.Z NEXT_X.NEXT_Y.NEXT_Z From 83b2c5f903943a4bf646d35c07038c679e657886 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Tue, 28 Nov 2023 12:53:59 -0300 Subject: [PATCH 09/10] MINOR: [Documentation] Fix the documented format strings for list-views (#38899) ### Rationale for this change The code has the correct format string, but documentation was stale with initially proposed values instead of the ones we ended up with. 
### What changes are included in this PR? Documentation change. ### Are these changes tested? N/A Authored-by: Felipe Oliveira Carvalho Signed-off-by: Benjamin Kietzman --- docs/source/format/CDataInterface.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/format/CDataInterface.rst b/docs/source/format/CDataInterface.rst index e2022171214b7..36952ad99e777 100644 --- a/docs/source/format/CDataInterface.rst +++ b/docs/source/format/CDataInterface.rst @@ -207,9 +207,9 @@ names and types of child fields are read from the child arrays. +------------------------+---------------------------------------------------+------------+ | ``+L`` | large list | | +------------------------+---------------------------------------------------+------------+ -| ``+lv`` | list-view | | +| ``+vl`` | list-view | | +------------------------+---------------------------------------------------+------------+ -| ``+Lv`` | large list-view | | +| ``+vL`` | large list-view | | +------------------------+---------------------------------------------------+------------+ | ``+w:123`` | fixed-sized list [123 items] | | +------------------------+---------------------------------------------------+------------+ From 143b475f94dad840be2eb109ff27f3181791ad9f Mon Sep 17 00:00:00 2001 From: Yifeng-Sigma Date: Tue, 28 Nov 2023 11:09:24 -0600 Subject: [PATCH 10/10] GH-38836:[Go] Add Size() for ArrayData (#38839) ### Rationale for this change Address https://github.com/apache/arrow/issues/38836 ### What changes are included in this PR? Add a new function SizeInBytes() to calculate the size of ArrayData. ### Are these changes tested? ### Are there any user-facing changes? 
No * Closes: #38836 Lead-authored-by: Yifeng Wu Co-authored-by: Matt Topol Co-authored-by: Yifeng-Sigma Signed-off-by: Matt Topol --- go/arrow/array.go | 2 + go/arrow/array/data.go | 27 ++++++++++++- go/arrow/array/data_test.go | 75 +++++++++++++++++++++++++++++++++++++ 3 files changed, 103 insertions(+), 1 deletion(-) diff --git a/go/arrow/array.go b/go/arrow/array.go index e07fa478aae57..eed859cf46649 100644 --- a/go/arrow/array.go +++ b/go/arrow/array.go @@ -81,6 +81,8 @@ type ArrayData interface { // Dictionary returns the ArrayData object for the dictionary if this is a // dictionary array, otherwise it will be nil. Dictionary() ArrayData + // SizeInBytes returns the size of the ArrayData buffers and any children and/or dictionary in bytes. + SizeInBytes() uint64 } // Array represents an immutable sequence of values using the Arrow in-memory format. diff --git a/go/arrow/array/data.go b/go/arrow/array/data.go index 8cce49182b879..3c859ec30bc78 100644 --- a/go/arrow/array/data.go +++ b/go/arrow/array/data.go @@ -190,9 +190,34 @@ func (d *Data) SetDictionary(dict arrow.ArrayData) { } } +// SizeInBytes returns the size of the Data and any children and/or dictionary in bytes by +// recursively examining the nested structures of children and/or dictionary. +// The value returned is an upper-bound since offset is not taken into account. +func (d *Data) SizeInBytes() uint64 { + var size uint64 + + if d == nil { + return 0 + } + + for _, b := range d.Buffers() { + size += uint64(b.Len()) + } + for _, c := range d.Children() { + size += c.SizeInBytes() + } + if dict := d.Dictionary(); dict != nil { + size += dict.SizeInBytes() + } + + return size +} + // NewSliceData returns a new slice that shares backing data with the input. // The returned Data slice starts at i and extends j-i elements, such as: -// slice := data[i:j] +// +// slice := data[i:j] +// // The returned value must be Release'd after use. 
// // NewSliceData panics if the slice is outside the valid range of the input Data. diff --git a/go/arrow/array/data_test.go b/go/arrow/array/data_test.go index b7b0f396470d7..dd4793a7cdbfa 100644 --- a/go/arrow/array/data_test.go +++ b/go/arrow/array/data_test.go @@ -49,3 +49,78 @@ func TestDataReset(t *testing.T) { data.Reset(&arrow.Int64Type{}, 5, data.Buffers(), nil, 1, 2) } } + +func TestSizeInBytes(t *testing.T) { + var buffers1 = make([]*memory.Buffer, 0, 3) + + for i := 0; i < cap(buffers1); i++ { + buffers1 = append(buffers1, memory.NewBufferBytes([]byte("15-bytes-buffer"))) + } + data := NewData(&arrow.StringType{}, 10, buffers1, nil, 0, 0) + var arrayData arrow.ArrayData = data + dataWithChild := NewData(&arrow.StringType{}, 10, buffers1, []arrow.ArrayData{arrayData}, 0, 0) + + t.Run("buffers only", func(t *testing.T) { + expectedSize := uint64(45) + if actualSize := data.SizeInBytes(); actualSize != expectedSize { + t.Errorf("expected size %d, got %d", expectedSize, actualSize) + } + }) + + t.Run("buffers and child data", func(t *testing.T) { + // 45 bytes in buffers, 45 bytes in child data + expectedSize := uint64(90) + if actualSize := dataWithChild.SizeInBytes(); actualSize != expectedSize { + t.Errorf("expected size %d, got %d", expectedSize, actualSize) + } + }) + + t.Run("buffers and nested child data", func(t *testing.T) { + var dataWithChildArrayData arrow.ArrayData = dataWithChild + var dataWithNestedChild arrow.ArrayData = NewData(&arrow.StringType{}, 10, buffers1, []arrow.ArrayData{dataWithChildArrayData}, 0, 0) + // 45 bytes in buffers, 90 bytes in nested child data + expectedSize := uint64(135) + if actualSize := dataWithNestedChild.SizeInBytes(); actualSize != expectedSize { + t.Errorf("expected size %d, got %d", expectedSize, actualSize) + } + }) + + t.Run("buffers and dictionary", func(t *testing.T) { + dictData := data + dataWithDict := NewDataWithDictionary(&arrow.StringType{}, 10, buffers1, 0, 0, dictData) + // 45 bytes in buffers, 
45 bytes in dictionary + expectedSize := uint64(90) + if actualSize := dataWithDict.SizeInBytes(); actualSize != expectedSize { + t.Errorf("expected size %d, got %d", expectedSize, actualSize) + } + }) + + t.Run("sliced data", func(t *testing.T) { + sliceData := NewSliceData(arrayData, 3, 5) + // offset is not taken into account in SizeInBytes() + expectedSize := uint64(45) + if actualSize := sliceData.SizeInBytes(); actualSize != expectedSize { + t.Errorf("expected size %d, got %d", expectedSize, actualSize) + } + }) + + t.Run("sliced data with children", func(t *testing.T) { + var dataWithChildArrayData arrow.ArrayData = dataWithChild + sliceData := NewSliceData(dataWithChildArrayData, 3, 5) + // offset is not taken into account in SizeInBytes() + expectedSize := uint64(90) + if actualSize := sliceData.SizeInBytes(); actualSize != expectedSize { + t.Errorf("expected size %d, got %d", expectedSize, actualSize) + } + }) + + t.Run("buffers with children which are sliced data", func(t *testing.T) { + sliceData := NewSliceData(arrayData, 3, 5) + dataWithSlicedChildren := NewData(&arrow.StringType{}, 10, buffers1, []arrow.ArrayData{sliceData}, 0, 0) + // offset is not taken into account in SizeInBytes() + expectedSize := uint64(90) + if actualSize := dataWithSlicedChildren.SizeInBytes(); actualSize != expectedSize { + t.Errorf("expected size %d, got %d", expectedSize, actualSize) + } + }) +}