Skip to content

Regenerate DB

Regenerate DB #131

Workflow file for this run

# For every language in the matrix: builds db_gen, streams the latest Wikimedia
# articles dump through it, compresses the result to <lang>.bin, copies that
# file to the `wiki-dbs` bucket and attaches it to a GitHub release tagged with
# the dump date.
name: Regenerate DB

on:
  workflow_dispatch:
  schedule:
    # Run on days 5/24 of each month (dumps start on days 1/20 and take a few
    # days to complete and get mirrored).
    - cron: '0 4 5,24 * *'

# The release step must be able to create/update releases; checkout only needs
# read access, which `contents: write` already includes.
permissions:
  contents: write

jobs:
  create-updated-wiki-dump:
    name: ${{ matrix.lang }}wiki Dump Gen
    runs-on: ubuntu-latest
    strategy:
      # One language failing must not cancel the others.
      fail-fast: false
      matrix:
        lang: [en, fr, de, ja, it, ru, es, zh, pl, nl, pt, ar, fi, hu, th, he, tr, sv]
    steps:
      - uses: actions/checkout@v4

      # Installs build/runtime dependencies and exposes the dump date as the
      # `date` step output (used below as the release tag).
      - name: Prerequisites
        id: setup
        run: |
          # Refresh the package index first; a stale index on the runner image
          # can make the install fail with 404s.
          sudo apt-get update
          sudo apt-get install -y jq xz-utils libxml2-dev libgc-dev
          # Date part (first space-separated token) of the pagelinkstable job's
          # `updated` timestamp for this wiki.
          date=$(curl -L https://dumps.wikimedia.org/index.json | jq --raw-output '.wikis.${{ matrix.lang }}wiki.jobs.pagelinkstable.updated | split(" ") | .[0]')
          if [ -z "$date" ]; then
            echo "Error: date is empty"
            exit 1
          fi
          echo "date=$date" >> "$GITHUB_OUTPUT"

      # Adds 8G of swap on top of the runner's memory before the generation run.
      - name: Initialize swapfile
        run: |
          sudo fallocate -l 8G /swapfile
          sudo chmod 600 /swapfile
          sudo mkswap /swapfile
          sudo swapon /swapfile
          sudo swapon --show

      - name: Build
        run: |
          # The default `bash -e` shell ignores failures of every pipeline stage
          # except the last; without pipefail a failed download or a crashing
          # ./gen would still leave a (truncated) output file behind.
          set -o pipefail
          cd db_gen
          clang++ -Ofast -flto -march=native -mtune=native -I/usr/include/libxml2 db_gen.cc -o ../gen -std=c++17 -lxml2 -lgc
          cd ..
          # Prefer the recombined articles dump when the wiki has one, otherwise
          # the plain articles dump; take the URL of its first listed file.
          dump_url=$(curl -L https://dumps.wikimedia.org/index.json | jq --raw-output '.wikis.${{ matrix.lang }}wiki.jobs | if has("articlesdumprecombine") then .articlesdumprecombine else .articlesdump end | .files | to_entries[0].value.url')
          if [ -z "$dump_url" ] || [ "$dump_url" = "null" ]; then
            echo "Error: could not determine dump URL"
            exit 1
          fi
          curl -L "https://dumps.wikimedia.org/$dump_url" | bzcat | ./gen | xz -C crc32 --lzma2=preset=9e,lc=4,pb=2 -f - > "${{ matrix.lang }}.bin"
          # if size of file is less than 1KB, something went wrong
          if [ "$(stat -c %s "${{ matrix.lang }}.bin")" -lt 1024 ]; then
            echo "File is too small, something went wrong"
            exit 1
          fi

      # Copies the generated file to the bucket. Credentials are handed to the
      # AWS CLI through its standard environment variables instead of being
      # expanded into the script text or written to a config file, and are only
      # visible to this step.
      - name: Upload to bucket
        env:
          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          AWS_DEFAULT_REGION: auto
          AWS_DEFAULT_OUTPUT: json
          R2_AWS_ENDPOINT_URL: ${{ secrets.R2_AWS_ENDPOINT_URL }}
        run: |
          aws s3 cp "${{ matrix.lang }}.bin" "s3://wiki-dbs/${{ matrix.lang }}.bin" --endpoint-url "$R2_AWS_ENDPOINT_URL"

      # Attaches the file to the release whose tag is the dump date.
      - name: Upload
        uses: softprops/action-gh-release@v2
        with:
          files: ${{ matrix.lang }}.bin
          tag_name: ${{ steps.setup.outputs.date }}