Skip to content

GH-40592: [C++][Parquet] Implement SizeStatistics #48595

GH-40592: [C++][Parquet] Implement SizeStatistics

GH-40592: [C++][Parquet] Implement SizeStatistics #48595

Workflow file for this run

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
name: R
on:
push:
branches:
- '**'
- '!dependabot/**'
tags:
- '**'
paths:
- '.dockerignore'
- ".github/workflows/r.yml"
- "ci/docker/**"
- "ci/etc/rprofile"
- "ci/scripts/PKGBUILD"
- "ci/scripts/cpp_*.sh"
- "ci/scripts/install_minio.sh"
- "ci/scripts/r_*.sh"
- "cpp/**"
- "docker-compose.yml"
- "r/**"
pull_request:
paths:
- '.dockerignore'
- ".github/workflows/r.yml"
- "ci/docker/**"
- "ci/etc/rprofile"
- "ci/scripts/PKGBUILD"
- "ci/scripts/cpp_*.sh"
- "ci/scripts/install_minio.sh"
- "ci/scripts/r_*.sh"
- "cpp/**"
- "docker-compose.yml"
- "r/**"
concurrency:
group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }}
cancel-in-progress: true
permissions:
contents: read
env:
ARCHERY_DEBUG: 1
DOCKER_VOLUME_PREFIX: ".docker/"
jobs:
ubuntu-minimum-cpp-version:
name: Check minimum supported Arrow C++ Version (${{ matrix.cpp_version }})
# We don't provide Apache Arrow C++ 15.0.2 deb packages for Ubuntu 24.04.
# So we use ubuntu-22.04 here.
runs-on: ubuntu-22.04
strategy:
matrix:
include:
- cpp_version: "15.0.2"
steps:
- name: Checkout Arrow
uses: actions/checkout@3df4ab11eba7bda6032a0b82a6bb43b11571feac # v4.0.0
with:
path: src
submodules: recursive
- name: Install Arrow C++ (${{ matrix.cpp_version }})
run: |
sudo apt update
sudo apt install -y -V ca-certificates lsb-release wget
wget https://apache.jfrog.io/artifactory/arrow/$(lsb_release --id --short | tr 'A-Z' 'a-z')/apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb
sudo apt install -y -V ./apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb
sudo apt update
# We have to list all packages to avoid version conflicts.
sudo apt install -y -V libarrow-dev=${{ matrix.cpp_version }}-1 \
libarrow-acero-dev=${{ matrix.cpp_version }}-1 \
libparquet-dev=${{ matrix.cpp_version }}-1 \
libarrow-dataset-dev=${{ matrix.cpp_version }}-1
- name: Install checkbashisms
run: |
sudo apt-get install devscripts
- uses: r-lib/actions/setup-r@v2
with:
use-public-rspm: true
install-r: false
- uses: r-lib/actions/setup-r-dependencies@v2
with:
extra-packages: any::rcmdcheck
needs: check
working-directory: src/r
- uses: r-lib/actions/check-r-package@v2
with:
working-directory: src/r
env:
LIBARROW_BINARY: "false"
LIBARROW_BUILD: "false"
ARROW_R_VERBOSE_TEST: "true"
ARROW_R_ALLOW_CPP_VERSION_MISMATCH: "true"
- name: Show install output
if: always()
run: find src/r/check -name '00install.out*' -exec cat '{}' \; || true
shell: bash
ubuntu:
name: AMD64 Ubuntu ${{ matrix.ubuntu }} R ${{ matrix.r }} Force-Tests ${{ matrix.force-tests }}
runs-on: ubuntu-latest
if: ${{ !contains(github.event.pull_request.title, 'WIP') }}
timeout-minutes: 75
strategy:
fail-fast: false
matrix:
r: ["4.4"]
ubuntu: [24.04]
force-tests: ["true"]
env:
R: ${{ matrix.r }}
UBUNTU: ${{ matrix.ubuntu }}
steps:
- name: Checkout Arrow
uses: actions/checkout@3df4ab11eba7bda6032a0b82a6bb43b11571feac # v4.0.0
with:
fetch-depth: 0
submodules: recursive
- name: Free up disk space
run: |
ci/scripts/util_free_space.sh
- name: Cache Docker Volumes
uses: actions/cache@13aacd865c20de90d75de3b17ebe84f7a17d57d2 # v4.0.0
with:
path: .docker
# As this key is identical on both matrix builds only one will be able to successfully cache,
# this is fine as there are no differences in the build
key: ubuntu-${{ matrix.ubuntu }}-r-${{ matrix.r }}-${{ hashFiles('cpp/src/**/*.cc','cpp/src/**/*.h)') }}-${{ github.run_id }}
restore-keys: |
ubuntu-${{ matrix.ubuntu }}-r-${{ matrix.r }}-${{ hashFiles('cpp/src/**/*.cc','cpp/src/**/*.h)') }}-
ubuntu-${{ matrix.ubuntu }}-r-${{ matrix.r }}-
- name: Setup Python
uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
with:
python-version: 3.12
- name: Setup Archery
run: pip install -e dev/archery[docker]
- name: Execute Docker Build
env:
ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }}
ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }}
run: |
source ci/scripts/util_enable_core_dumps.sh
# Setting a non-default and non-probable Marquesas French Polynesia time
# it has both with a .45 offset and very very few people who live there.
archery docker run -e TZ=MART -e ARROW_R_FORCE_TESTS=${{ matrix.force-tests }} ubuntu-r
- name: Dump install logs
run: cat r/check/arrow.Rcheck/00install.out
if: always()
- name: Dump test logs
run: cat r/check/arrow.Rcheck/tests/testthat.Rout*
if: always()
- name: Save the test output
if: always()
uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3
with:
name: test-output-${{ matrix.ubuntu }}-${{ matrix.r }}
path: r/check/arrow.Rcheck/tests/testthat.Rout*
- name: Docker Push
if: >-
success() &&
github.event_name == 'push' &&
github.repository == 'apache/arrow' &&
github.ref_name == 'main'
env:
ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }}
ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }}
continue-on-error: true
run: archery docker push ubuntu-r
bundled:
name: "${{ matrix.config.org }}/${{ matrix.config.image }}:${{ matrix.config.tag }}"
runs-on: ubuntu-latest
if: ${{ !contains(github.event.pull_request.title, 'WIP') }}
timeout-minutes: 60
strategy:
fail-fast: false
matrix:
config:
- { org: "rhub", image: "ubuntu-gcc12", tag: "latest" }
env:
R_ORG: ${{ matrix.config.org }}
R_IMAGE: ${{ matrix.config.image }}
R_TAG: ${{ matrix.config.tag }}
steps:
- name: Checkout Arrow
uses: actions/checkout@3df4ab11eba7bda6032a0b82a6bb43b11571feac # v4.0.0
with:
fetch-depth: 0
submodules: recursive
- name: Setup Python
uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
with:
python-version: 3.12
- name: Setup Archery
run: pip install -e dev/archery[docker]
- name: Execute Docker Build
env:
ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }}
ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }}
run: |
source ci/scripts/util_enable_core_dumps.sh
# Don't set a TZ here to test that case. These builds will have the following warning in them:
# System has not been booted with systemd as init system (PID 1). Can't operate.
# Failed to connect to bus: Host is down
archery docker run -e TZ="" r
- name: Dump install logs
run: cat r/check/arrow.Rcheck/00install.out
if: always()
- name: Dump test logs
run: cat r/check/arrow.Rcheck/tests/testthat.Rout*
if: always()
- name: Save the test output
if: always()
uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3
with:
name: test-output-bundled
path: r/check/arrow.Rcheck/tests/testthat.Rout*
- name: Docker Push
if: >-
success() &&
github.event_name == 'push' &&
github.repository == 'apache/arrow' &&
github.ref_name == 'main'
env:
ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }}
ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }}
continue-on-error: true
run: archery docker push r
windows-cpp:
name: AMD64 Windows C++ RTools ${{ matrix.config.rtools }} ${{ matrix.config.arch }}
runs-on: windows-2019
if: ${{ !contains(github.event.pull_request.title, 'WIP') }}
timeout-minutes: 90
strategy:
fail-fast: false
matrix:
config:
- { rtools: 40, arch: 'ucrt64' }
steps:
- run: git config --global core.autocrlf false
- name: Checkout Arrow
uses: actions/checkout@v4
with:
fetch-depth: 0
- name: Setup ccache
shell: bash
run: |
ci/scripts/ccache_setup.sh
echo "CCACHE_DIR=$(cygpath --absolute --windows ccache)" >> $GITHUB_ENV
- name: Cache ccache
uses: actions/cache@v4
with:
path: ccache
key: r-${{ matrix.config.rtools }}-ccache-mingw-${{ matrix.config.arch }}-${{ hashFiles('cpp/src/**/*.cc','cpp/src/**/*.h)') }}-${{ github.run_id }}
restore-keys: |
r-${{ matrix.config.rtools }}-ccache-mingw-${{ matrix.config.arch }}-${{ hashFiles('cpp/src/**/*.cc','cpp/src/**/*.h)') }}-
r-${{ matrix.config.rtools }}-ccache-mingw-${{ matrix.config.arch }}-
- uses: r-lib/actions/setup-r@v2
with:
# Note: RTools must be 40 here because RTools40 + ucrt is how we build the Arrow C++
# static library. The R is not used here but R 4.1 was the last R to use
# Rtools40.
r-version: "4.1"
rtools-version: 40
Ncpus: 2
- name: Build Arrow C++
shell: bash
env:
MINGW_ARCH: ${{ matrix.config.arch }}
run: ci/scripts/r_windows_build.sh
- name: Rename libarrow.zip
# So that they're unique when multiple are downloaded in the next step
shell: bash
run: mv libarrow.zip libarrow-rtools${{ matrix.config.rtools }}-${{ matrix.config.arch }}.zip
- uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3
with:
name: libarrow-rtools${{ matrix.config.rtools }}-${{ matrix.config.arch }}.zip
path: libarrow-rtools${{ matrix.config.rtools }}-${{ matrix.config.arch }}.zip
windows-r:
needs: [windows-cpp]
name: AMD64 Windows R ${{ matrix.config.rversion }}
runs-on: windows-2019
if: ${{ !contains(github.event.pull_request.title, 'WIP') }}
timeout-minutes: 75
strategy:
fail-fast: false
matrix:
config:
- { rversion: "release" }
env:
ARROW_R_CXXFLAGS: "-Werror"
_R_CHECK_TESTS_NLINES_: 0
steps:
- run: git config --global core.autocrlf false
- name: Checkout Arrow
uses: actions/checkout@v4
with:
fetch-depth: 0
# This must be done before r-lib/actions/setup-r because curl in
# Rtools doesn't work on non Rtools' MSYS2 environment. If we
# use "shell: bash" after r-lib/actions/setup-r, bash in Rtools
# is used on non Rtools' MSYS2 environment.
- name: Install MinIO
shell: bash
run: |
mkdir -p "$HOME/.local/bin"
ci/scripts/install_minio.sh latest "$HOME/.local"
echo "$HOME/.local/bin" >> $GITHUB_PATH
- run: mkdir r/windows
- name: Download artifacts
uses: actions/download-artifact@v4.1.8
with:
name: libarrow-rtools40-ucrt64.zip
path: r/windows
- name: Unzip and rezip libarrows
shell: bash
run: |
cd r/windows
ls *.zip | xargs -n 1 unzip -uo
rm -rf *.zip
- uses: r-lib/actions/setup-r@v2
with:
r-version: ${{ matrix.config.rversion }}
Ncpus: 2
- uses: r-lib/actions/setup-r-dependencies@v2
env:
GITHUB_PAT: "${{ github.token }}"
with:
# For some arcane reason caching does not work on the windows runners
# most likely due to https://github.com/actions/cache/issues/815
cache: false
working-directory: 'r'
extra-packages: |
any::rcmdcheck
# TODO(ARROW-17149): figure out why the GCS tests are hanging on Windows
# - name: Install Google Cloud Storage Testbench
# shell: bash
# run: ci/scripts/install_gcs_testbench.sh default
- name: Check
shell: Rscript {0}
run: |
# Because we do R CMD build and r/windows is in .Rbuildignore,
# assemble the libarrow.zip file and pass it as an env var
setwd("r/windows")
zip("libarrow.zip", ".")
setwd("..")
Sys.setenv(
RWINLIB_LOCAL = file.path(Sys.getenv("GITHUB_WORKSPACE"), "r", "windows", "libarrow.zip"),
MAKEFLAGS = paste0("-j", parallel::detectCores()),
ARROW_R_DEV = TRUE,
"_R_CHECK_FORCE_SUGGESTS_" = FALSE,
"_R_CHECK_STOP_ON_INVALID_NUMERIC_VERSION_INPUTS_" = TRUE,
"_R_CHECK_DONTTEST_EXAMPLES_" = TRUE
)
rcmdcheck::rcmdcheck(".",
build_args = '--no-build-vignettes',
args = c('--no-manual', '--as-cran', '--ignore-vignettes'),
error_on = 'warning',
check_dir = 'check',
timeout = 3600
)
- name: Run lintr
if: ${{ matrix.config.rversion == 'release' }}
env:
NOT_CRAN: "true"
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
shell: Rscript {0}
working-directory: r
run: |
Sys.setenv(
RWINLIB_LOCAL = file.path(Sys.getenv("GITHUB_WORKSPACE"), "r", "windows", "libarrow.zip"),
MAKEFLAGS = paste0("-j", parallel::detectCores()),
ARROW_R_DEV = TRUE,
"_R_CHECK_FORCE_SUGGESTS_" = FALSE
)
# we use pak for package installation since it is faster, safer and more convenient
pak::local_install()
pak::pak("lintr")
lintr::expect_lint_free()
- name: Dump install logs
shell: cmd
run: cat r/check/arrow.Rcheck/00install.out
if: always()
- name: Dump test logs
shell: bash
run: find r/check -name 'testthat.Rout*' -exec cat '{}' \; || true
if: always()