GH-40592: [C++][Parquet] Implement SizeStatistics #48595
Workflow file for this run
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
name: R

# Triggers. Both events are filtered by the same path list, so the workflow
# only runs when a listed file or directory changes.
on:
  # Pushes to any branch except dependabot branches, and to any tag.
  push:
    branches:
      - "**"
      - "!dependabot/**"
    tags:
      - "**"
    paths:
      - ".dockerignore"
      - ".github/workflows/r.yml"
      - "ci/docker/**"
      - "ci/etc/rprofile"
      - "ci/scripts/PKGBUILD"
      - "ci/scripts/cpp_*.sh"
      - "ci/scripts/install_minio.sh"
      - "ci/scripts/r_*.sh"
      - "cpp/**"
      - "docker-compose.yml"
      - "r/**"
  # Pull requests use the same path filter as pushes (keep the lists in sync).
  pull_request:
    paths:
      - ".dockerignore"
      - ".github/workflows/r.yml"
      - "ci/docker/**"
      - "ci/etc/rprofile"
      - "ci/scripts/PKGBUILD"
      - "ci/scripts/cpp_*.sh"
      - "ci/scripts/install_minio.sh"
      - "ci/scripts/r_*.sh"
      - "cpp/**"
      - "docker-compose.yml"
      - "r/**"
# One concurrency group per repository + PR head ref (or commit SHA when there
# is no head ref) + workflow; a newer run in the same group cancels the older.
concurrency:
  group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }}
  cancel-in-progress: true

# The workflow token only gets read access to repository contents.
permissions:
  contents: read

# Environment shared by every job in this workflow.
env:
  # NOTE(review): presumably turns on verbose output in archery -- confirm.
  ARCHERY_DEBUG: 1
  # Same directory that the "Cache Docker Volumes" step saves (`path: .docker`).
  DOCKER_VOLUME_PREFIX: ".docker/"
jobs:
  # Checks the R package against the oldest Arrow C++ release listed in the
  # matrix, installed from the Apache Arrow APT repository.
  ubuntu-minimum-cpp-version:
    name: Check minimum supported Arrow C++ Version (${{ matrix.cpp_version }})
    # We don't provide Apache Arrow C++ 15.0.2 deb packages for Ubuntu 24.04.
    # So we use ubuntu-22.04 here.
    runs-on: ubuntu-22.04
    strategy:
      matrix:
        include:
          - cpp_version: "15.0.2"
    steps:
      - name: Checkout Arrow
        uses: actions/checkout@3df4ab11eba7bda6032a0b82a6bb43b11571feac  # v4.0.0
        with:
          path: src
          submodules: recursive
      - name: Install Arrow C++ (${{ matrix.cpp_version }})
        run: |
          sudo apt update
          sudo apt install -y -V ca-certificates lsb-release wget
          wget https://apache.jfrog.io/artifactory/arrow/$(lsb_release --id --short | tr 'A-Z' 'a-z')/apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb
          sudo apt install -y -V ./apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb
          sudo apt update
          # We have to list all packages to avoid version conflicts.
          sudo apt install -y -V libarrow-dev=${{ matrix.cpp_version }}-1 \
            libarrow-acero-dev=${{ matrix.cpp_version }}-1 \
            libparquet-dev=${{ matrix.cpp_version }}-1 \
            libarrow-dataset-dev=${{ matrix.cpp_version }}-1
      - name: Install checkbashisms
        # -y keeps the install non-interactive regardless of how the runner
        # image configures apt.
        run: |
          sudo apt-get install -y devscripts
      - uses: r-lib/actions/setup-r@v2
        with:
          use-public-rspm: true
          install-r: false
      - uses: r-lib/actions/setup-r-dependencies@v2
        with:
          extra-packages: any::rcmdcheck
          needs: check
          working-directory: src/r
      - uses: r-lib/actions/check-r-package@v2
        with:
          working-directory: src/r
        env:
          LIBARROW_BINARY: "false"
          LIBARROW_BUILD: "false"
          ARROW_R_VERBOSE_TEST: "true"
          ARROW_R_ALLOW_CPP_VERSION_MISMATCH: "true"
      - name: Show install output
        if: always()
        run: find src/r/check -name '00install.out*' -exec cat '{}' \; || true
        shell: bash

  # Runs the `ubuntu-r` docker service through archery and, on pushes to
  # apache/arrow main, pushes the resulting image.
  ubuntu:
    name: AMD64 Ubuntu ${{ matrix.ubuntu }} R ${{ matrix.r }} Force-Tests ${{ matrix.force-tests }}
    runs-on: ubuntu-latest
    if: ${{ !contains(github.event.pull_request.title, 'WIP') }}
    timeout-minutes: 75
    strategy:
      fail-fast: false
      matrix:
        # Versions are quoted so YAML never reads them as floats
        # (e.g. 22.10 would otherwise become 22.1).
        r: ["4.4"]
        ubuntu: ["24.04"]
        force-tests: ["true"]
    env:
      R: ${{ matrix.r }}
      UBUNTU: ${{ matrix.ubuntu }}
    steps:
      - name: Checkout Arrow
        uses: actions/checkout@3df4ab11eba7bda6032a0b82a6bb43b11571feac  # v4.0.0
        with:
          fetch-depth: 0
          submodules: recursive
      - name: Free up disk space
        run: |
          ci/scripts/util_free_space.sh
      - name: Cache Docker Volumes
        uses: actions/cache@13aacd865c20de90d75de3b17ebe84f7a17d57d2  # v4.0.0
        with:
          path: .docker
          # As this key is identical on both matrix builds only one will be able to successfully cache,
          # this is fine as there are no differences in the build
          #
          # Each hashFiles() pattern must be closed by its own quote *before*
          # the closing parenthesis; otherwise the ")" becomes part of the
          # glob and the C++ headers are silently left out of the hash.
          key: ubuntu-${{ matrix.ubuntu }}-r-${{ matrix.r }}-${{ hashFiles('cpp/src/**/*.cc','cpp/src/**/*.h') }}-${{ github.run_id }}
          restore-keys: |
            ubuntu-${{ matrix.ubuntu }}-r-${{ matrix.r }}-${{ hashFiles('cpp/src/**/*.cc','cpp/src/**/*.h') }}-
            ubuntu-${{ matrix.ubuntu }}-r-${{ matrix.r }}-
      - name: Setup Python
        uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b  # v5.3.0
        with:
          # Quoted so the version is a string, not a float.
          python-version: "3.12"
      - name: Setup Archery
        run: pip install -e dev/archery[docker]
      - name: Execute Docker Build
        env:
          ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }}
          ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }}
        run: |
          source ci/scripts/util_enable_core_dumps.sh
          # Setting a non-default and non-probable Marquesas French Polynesia time
          # it has both with a .45 offset and very very few people who live there.
          archery docker run -e TZ=MART -e ARROW_R_FORCE_TESTS=${{ matrix.force-tests }} ubuntu-r
      - name: Dump install logs
        run: cat r/check/arrow.Rcheck/00install.out
        if: always()
      - name: Dump test logs
        run: cat r/check/arrow.Rcheck/tests/testthat.Rout*
        if: always()
      - name: Save the test output
        if: always()
        uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882  # v4.4.3
        with:
          name: test-output-${{ matrix.ubuntu }}-${{ matrix.r }}
          path: r/check/arrow.Rcheck/tests/testthat.Rout*
      - name: Docker Push
        if: >-
          success() &&
          github.event_name == 'push' &&
          github.repository == 'apache/arrow' &&
          github.ref_name == 'main'
        env:
          ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }}
          ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }}
        continue-on-error: true
        run: archery docker push ubuntu-r

  # Runs the `r` docker service through archery on the image selected by
  # R_ORG / R_IMAGE / R_TAG, with TZ deliberately left empty.
  bundled:
    name: "${{ matrix.config.org }}/${{ matrix.config.image }}:${{ matrix.config.tag }}"
    runs-on: ubuntu-latest
    if: ${{ !contains(github.event.pull_request.title, 'WIP') }}
    timeout-minutes: 60
    strategy:
      fail-fast: false
      matrix:
        config:
          - { org: "rhub", image: "ubuntu-gcc12", tag: "latest" }
    env:
      R_ORG: ${{ matrix.config.org }}
      R_IMAGE: ${{ matrix.config.image }}
      R_TAG: ${{ matrix.config.tag }}
    steps:
      - name: Checkout Arrow
        uses: actions/checkout@3df4ab11eba7bda6032a0b82a6bb43b11571feac  # v4.0.0
        with:
          fetch-depth: 0
          submodules: recursive
      - name: Setup Python
        uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b  # v5.3.0
        with:
          # Quoted so the version is a string, not a float.
          python-version: "3.12"
      - name: Setup Archery
        run: pip install -e dev/archery[docker]
      - name: Execute Docker Build
        env:
          ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }}
          ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }}
        run: |
          source ci/scripts/util_enable_core_dumps.sh
          # Don't set a TZ here to test that case. These builds will have the following warning in them:
          # System has not been booted with systemd as init system (PID 1). Can't operate.
          # Failed to connect to bus: Host is down
          archery docker run -e TZ="" r
      - name: Dump install logs
        run: cat r/check/arrow.Rcheck/00install.out
        if: always()
      - name: Dump test logs
        run: cat r/check/arrow.Rcheck/tests/testthat.Rout*
        if: always()
      - name: Save the test output
        if: always()
        uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882  # v4.4.3
        with:
          name: test-output-bundled
          path: r/check/arrow.Rcheck/tests/testthat.Rout*
      - name: Docker Push
        if: >-
          success() &&
          github.event_name == 'push' &&
          github.repository == 'apache/arrow' &&
          github.ref_name == 'main'
        env:
          ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }}
          ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }}
        continue-on-error: true
        run: archery docker push r

  # Builds Arrow C++ with RTools on Windows and uploads the result as the
  # libarrow-rtools<rtools>-<arch>.zip artifact consumed by `windows-r`.
  windows-cpp:
    name: AMD64 Windows C++ RTools ${{ matrix.config.rtools }} ${{ matrix.config.arch }}
    runs-on: windows-2019
    if: ${{ !contains(github.event.pull_request.title, 'WIP') }}
    timeout-minutes: 90
    strategy:
      fail-fast: false
      matrix:
        config:
          - { rtools: 40, arch: 'ucrt64' }
    steps:
      - run: git config --global core.autocrlf false
      - name: Checkout Arrow
        uses: actions/checkout@v4
        with:
          fetch-depth: 0
      - name: Setup ccache
        shell: bash
        run: |
          ci/scripts/ccache_setup.sh
          echo "CCACHE_DIR=$(cygpath --absolute --windows ccache)" >> $GITHUB_ENV
      - name: Cache ccache
        uses: actions/cache@v4
        with:
          path: ccache
          # Each hashFiles() pattern must be closed by its own quote *before*
          # the closing parenthesis; otherwise the ")" becomes part of the
          # glob and the C++ headers are silently left out of the hash.
          key: r-${{ matrix.config.rtools }}-ccache-mingw-${{ matrix.config.arch }}-${{ hashFiles('cpp/src/**/*.cc','cpp/src/**/*.h') }}-${{ github.run_id }}
          restore-keys: |
            r-${{ matrix.config.rtools }}-ccache-mingw-${{ matrix.config.arch }}-${{ hashFiles('cpp/src/**/*.cc','cpp/src/**/*.h') }}-
            r-${{ matrix.config.rtools }}-ccache-mingw-${{ matrix.config.arch }}-
      - uses: r-lib/actions/setup-r@v2
        with:
          # Note: RTools must be 40 here because RTools40 + ucrt is how we build the Arrow C++
          # static library. The R is not used here but R 4.1 was the last R to use
          # Rtools40.
          r-version: "4.1"
          rtools-version: 40
          Ncpus: 2
      - name: Build Arrow C++
        shell: bash
        env:
          MINGW_ARCH: ${{ matrix.config.arch }}
        run: ci/scripts/r_windows_build.sh
      - name: Rename libarrow.zip
        # So that they're unique when multiple are downloaded in the next step
        shell: bash
        run: mv libarrow.zip libarrow-rtools${{ matrix.config.rtools }}-${{ matrix.config.arch }}.zip
      - uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882  # v4.4.3
        with:
          name: libarrow-rtools${{ matrix.config.rtools }}-${{ matrix.config.arch }}.zip
          path: libarrow-rtools${{ matrix.config.rtools }}-${{ matrix.config.arch }}.zip

  # Runs R CMD check (via rcmdcheck) and lintr on Windows against the libarrow
  # artifact produced by `windows-cpp`.
  windows-r:
    needs: [windows-cpp]
    name: AMD64 Windows R ${{ matrix.config.rversion }}
    runs-on: windows-2019
    if: ${{ !contains(github.event.pull_request.title, 'WIP') }}
    timeout-minutes: 75
    strategy:
      fail-fast: false
      matrix:
        config:
          - { rversion: "release" }
    env:
      ARROW_R_CXXFLAGS: "-Werror"
      _R_CHECK_TESTS_NLINES_: 0
    steps:
      - run: git config --global core.autocrlf false
      - name: Checkout Arrow
        uses: actions/checkout@v4
        with:
          fetch-depth: 0
      # This must be done before r-lib/actions/setup-r because curl in
      # Rtools doesn't work on non Rtools' MSYS2 environment. If we
      # use "shell: bash" after r-lib/actions/setup-r, bash in Rtools
      # is used on non Rtools' MSYS2 environment.
      - name: Install MinIO
        shell: bash
        run: |
          mkdir -p "$HOME/.local/bin"
          ci/scripts/install_minio.sh latest "$HOME/.local"
          echo "$HOME/.local/bin" >> $GITHUB_PATH
      - run: mkdir r/windows
      - name: Download artifacts
        uses: actions/download-artifact@v4.1.8
        with:
          # Must match the artifact name uploaded by windows-cpp for
          # rtools 40 / ucrt64.
          name: libarrow-rtools40-ucrt64.zip
          path: r/windows
      - name: Unzip and rezip libarrows
        shell: bash
        run: |
          cd r/windows
          ls *.zip | xargs -n 1 unzip -uo
          rm -rf *.zip
      - uses: r-lib/actions/setup-r@v2
        with:
          r-version: ${{ matrix.config.rversion }}
          Ncpus: 2
      - uses: r-lib/actions/setup-r-dependencies@v2
        env:
          GITHUB_PAT: "${{ github.token }}"
        with:
          # For some arcane reason caching does not work on the windows runners
          # most likely due to https://github.com/actions/cache/issues/815
          cache: false
          working-directory: 'r'
          extra-packages: |
            any::rcmdcheck
      # TODO(ARROW-17149): figure out why the GCS tests are hanging on Windows
      # - name: Install Google Cloud Storage Testbench
      #   shell: bash
      #   run: ci/scripts/install_gcs_testbench.sh default
      - name: Check
        shell: Rscript {0}
        run: |
          # Because we do R CMD build and r/windows is in .Rbuildignore,
          # assemble the libarrow.zip file and pass it as an env var
          setwd("r/windows")
          zip("libarrow.zip", ".")
          setwd("..")
          Sys.setenv(
            RWINLIB_LOCAL = file.path(Sys.getenv("GITHUB_WORKSPACE"), "r", "windows", "libarrow.zip"),
            MAKEFLAGS = paste0("-j", parallel::detectCores()),
            ARROW_R_DEV = TRUE,
            "_R_CHECK_FORCE_SUGGESTS_" = FALSE,
            "_R_CHECK_STOP_ON_INVALID_NUMERIC_VERSION_INPUTS_" = TRUE,
            "_R_CHECK_DONTTEST_EXAMPLES_" = TRUE
          )
          rcmdcheck::rcmdcheck(".",
            build_args = '--no-build-vignettes',
            args = c('--no-manual', '--as-cran', '--ignore-vignettes'),
            error_on = 'warning',
            check_dir = 'check',
            timeout = 3600
          )
      - name: Run lintr
        if: ${{ matrix.config.rversion == 'release' }}
        env:
          NOT_CRAN: "true"
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        shell: Rscript {0}
        working-directory: r
        run: |
          Sys.setenv(
            RWINLIB_LOCAL = file.path(Sys.getenv("GITHUB_WORKSPACE"), "r", "windows", "libarrow.zip"),
            MAKEFLAGS = paste0("-j", parallel::detectCores()),
            ARROW_R_DEV = TRUE,
            "_R_CHECK_FORCE_SUGGESTS_" = FALSE
          )
          # we use pak for package installation since it is faster, safer and more convenient
          pak::local_install()
          pak::pak("lintr")
          lintr::expect_lint_free()
      - name: Dump install logs
        shell: cmd
        run: cat r/check/arrow.Rcheck/00install.out
        if: always()
      - name: Dump test logs
        shell: bash
        run: find r/check -name 'testthat.Rout*' -exec cat '{}' \; || true
        if: always()