Update pdf2parquet to Docling v2 #260
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# | |
# DO NOT EDIT THIS FILE: it is generated from test-transform.template, Edit there and run make to change these files | |
# | |
name: Test - transforms/universal/tokenization | |
on: | |
workflow_dispatch: | |
push: | |
branches: | |
- "dev" | |
- "releases/**" | |
tags: | |
- "*" | |
paths: | |
- ".make.*" | |
- "transforms/.make.transforms" | |
- "transforms/universal/tokenization/**" | |
- "data-processing-lib/**" | |
- "!transforms/universal/tokenization/**/kfp_ray/**" # This is/will be tested in separate workflow | |
- "!data-processing-lib/**/test/**" | |
- "!data-processing-lib/**/test-data/**" | |
- "!**.md" | |
- "!**/doc/**" | |
- "!**/images/**" | |
- "!**.gitignore" | |
pull_request: | |
branches: | |
- "dev" | |
- "releases/**" | |
paths: | |
- ".make.*" | |
- "transforms/.make.transforms" | |
- "transforms/universal/tokenization/**" | |
- "data-processing-lib/**" | |
- "!transforms/universal/tokenization/**/kfp_ray/**" # This is/will be tested in separate workflow | |
- "!data-processing-lib/**/test/**" | |
- "!data-processing-lib/**/test-data/**" | |
- "!**.md" | |
- "!**/doc/**" | |
- "!**/images/**" | |
- "!**.gitignore" | |
# Taken from https://stackoverflow.com/questions/66335225/how-to-cancel-previous-runs-in-the-pr-when-you-push-new-commitsupdate-the-curre | |
concurrency: | |
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} | |
cancel-in-progress: true | |
jobs: | |
check_if_push_image: | |
# check whether the Docker images should be pushed to the remote repository | |
# The images are pushed if it is a merge to dev branch or a new tag is created. | |
# The latter being part of the release process. | |
# The images tag is derived from the value of the DOCKER_IMAGE_VERSION variable set in the .make.versions file. | |
runs-on: ubuntu-22.04 | |
outputs: | |
publish_images: ${{ steps.version.outputs.publish_images }} | |
steps: | |
- id: version | |
run: | | |
publish_images='false' | |
if [[ ${GITHUB_REF} == refs/heads/dev && ${GITHUB_EVENT_NAME} != 'pull_request' && ${GITHUB_REPOSITORY} == IBM/data-prep-kit ]] ; | |
then | |
publish_images='true' | |
fi | |
if [[ ${GITHUB_REF} == refs/tags/* && ${GITHUB_REPOSITORY} == IBM/data-prep-kit ]] ; | |
then | |
publish_images='true' | |
fi | |
echo "publish_images=$publish_images" >> "$GITHUB_OUTPUT" | |
test-src: | |
runs-on: ubuntu-22.04 | |
steps: | |
- name: Checkout | |
uses: actions/checkout@v4 | |
- name: Free up space in github runner | |
# Free space as indicated here : https://github.com/actions/runner-images/issues/2840#issuecomment-790492173 | |
run: | | |
df -h | |
sudo rm -rf "/usr/local/share/boost" | |
sudo rm -rf "$AGENT_TOOLSDIRECTORY" | |
sudo rm -rf /usr/share/dotnet /opt/ghc /usr/local/lib/android /usr/local/share/powershell /usr/share/swift /usr/local/.ghcup | |
sudo docker rmi $(docker image ls -aq) >/dev/null 2>&1 || true | |
df -h | |
- name: Test transform source in transforms/universal/tokenization | |
run: | | |
if [ -e "transforms/universal/tokenization/Makefile" ]; then | |
make -C transforms/universal/tokenization DOCKER=docker test-src | |
else | |
echo "transforms/universal/tokenization/Makefile not found - source testing disabled for this transform." | |
fi | |
test-image: | |
needs: [check_if_push_image] | |
runs-on: ubuntu-22.04 | |
timeout-minutes: 120 | |
env: | |
DOCKER_REGISTRY_USER: ${{ secrets.DOCKER_REGISTRY_USER }} | |
DOCKER_REGISTRY_KEY: ${{ secrets.DOCKER_REGISTRY_KEY }} | |
steps: | |
- name: Checkout | |
uses: actions/checkout@v4 | |
- name: Free up space in github runner | |
# Free space as indicated here : https://github.com/actions/runner-images/issues/2840#issuecomment-790492173 | |
run: | | |
df -h | |
sudo rm -rf /opt/ghc | |
sudo rm -rf "/usr/local/share/boost" | |
sudo rm -rf "$AGENT_TOOLSDIRECTORY" | |
sudo rm -rf /usr/share/dotnet /opt/ghc /usr/local/lib/android /usr/local/share/powershell /usr/share/swift /usr/lib/jvm /usr/local/.ghcup | |
sudo docker rmi $(docker image ls -aq) >/dev/null 2>&1 || true | |
df -h | |
- name: Test transform image in transforms/universal/tokenization | |
run: | | |
if [ -e "transforms/universal/tokenization/Makefile" ]; then | |
if [ -d "transforms/universal/tokenization/spark" ]; then | |
make -C data-processing-lib/spark DOCKER=docker image | |
fi | |
make -C transforms/universal/tokenization DOCKER=docker test-image | |
else | |
echo "transforms/universal/tokenization/Makefile not found - testing disabled for this transform." | |
fi | |
- name: Print space | |
# Free space as indicated here : https://github.com/actions/runner-images/issues/2840#issuecomment-790492173 | |
run: | | |
df -h | |
docker images | |
- name: Publish images | |
if: needs.check_if_push_image.outputs.publish_images == 'true' | |
run: | | |
if [ -e "transforms/universal/tokenization/Makefile" ]; then | |
make -C transforms/universal/tokenization publish | |
else | |
echo "transforms/universal/tokenization/Makefile not found - publishing disabled for this transform." | |
fi |