From 836a6723718a774a2a7e374f5f65a7fb8c742622 Mon Sep 17 00:00:00 2001 From: Tony Hutter Date: Fri, 2 Aug 2024 11:23:35 -0700 Subject: [PATCH] Testing raidz_expand_001_pos failure --- .github/workflows/scripts/qemu-6-tests.sh | 8 ++- .github/workflows/scripts/qemu-7-reports.sh | 14 ++++- .github/workflows/zfs-qemu.yml | 61 ++++++++++++++----- .../cli_root/zfs_copies/zfs_copies.kshlib | 11 ++++ .../zpool_status/zpool_status_008_pos.ksh | 7 ++- .../functional/crtime/crtime_001_pos.ksh | 5 +- .../functional/raidz/raidz_expand_001_pos.ksh | 21 ++++++- .../functional/raidz/raidz_expand_002_pos.ksh | 4 ++ 8 files changed, 107 insertions(+), 24 deletions(-) diff --git a/.github/workflows/scripts/qemu-6-tests.sh b/.github/workflows/scripts/qemu-6-tests.sh index 488dee801d85..7bbcc8e680ae 100755 --- a/.github/workflows/scripts/qemu-6-tests.sh +++ b/.github/workflows/scripts/qemu-6-tests.sh @@ -71,8 +71,14 @@ esac cd /var/tmp uname -a > uname.txt +# ONLY FOR TESTING DO NOT COMMIT +# +TAGS=$2/$3 + +# TAGS=raidz + # run functional testings -$TDIR/zfs-tests.sh -vK -s 3G -T $2/$3 +$TDIR/zfs-tests.sh -vK -s 3G -T $TAGS RV=$? 
# we wont fail here, this will be done later diff --git a/.github/workflows/scripts/qemu-7-reports.sh b/.github/workflows/scripts/qemu-7-reports.sh index 3338f491166a..108774a79e96 100755 --- a/.github/workflows/scripts/qemu-7-reports.sh +++ b/.github/workflows/scripts/qemu-7-reports.sh @@ -52,16 +52,26 @@ for i in `seq 1 $VMs`; do echo "##[endgroup]" done +RESPATH="/var/tmp/test_results" + # all tests without grouping: MERGE="$BASE/.github/workflows/scripts/merge_summary.awk" -$MERGE vm*log.txt | $BASE/scripts/zfs-tests-color.sh +$MERGE vm*log.txt | $BASE/scripts/zfs-tests-color.sh | tee $RESPATH/summary.txt -RESPATH="/var/tmp/test_results" for i in `seq 1 $VMs`; do rsync -arL zfs@192.168.122.1$i:$RESPATH/current $RESPATH/vm$i || true scp zfs@192.168.122.1$i:"/var/tmp/*.txt" $RESPATH/vm$i || true done cp -f /var/tmp/*.txt $RESPATH || true + + +# Save a list of all failed test logs for easy access +awk '/\[FAIL\]|\[KILLED\]/{ show=1; print; next; }; /\[SKIP\]|\[PASS\]/{ show=0; } show' \ + $RESPATH/vm*/current/log >> $RESPATH/summary-failure-logs.txt + +cp $RESPATH/summary.txt $RESPATH/summary-with-logs.txt +cat $RESPATH/summary-failure-logs.txt >> $RESPATH/summary-with-logs.txt + tar cf /tmp/qemu-$OS.tar -C $RESPATH -h . 
|| true echo "********************************************************************" diff --git a/.github/workflows/zfs-qemu.yml b/.github/workflows/zfs-qemu.yml index bfcbb468f59d..320a7e820f3f 100644 --- a/.github/workflows/zfs-qemu.yml +++ b/.github/workflows/zfs-qemu.yml @@ -2,7 +2,7 @@ name: zfs-qemu on: push: - pull_request: +# pull_request: jobs: @@ -13,10 +13,14 @@ jobs: matrix: # all: # os: [almalinux8, almalinux9, archlinux, centos-stream9, fedora39, fedora40, debian11, debian12, freebsd13, freebsd13r, freebsd14, freebsd14r, freebsd15, ubuntu20, ubuntu22, ubuntu24] - os: [almalinux8, almalinux9, centos-stream9, fedora39, fedora40, debian11, debian12, freebsd13, freebsd13r, freebsd14, freebsd14r, freebsd15, ubuntu20, ubuntu22, ubuntu24] + # os: [almalinux8, almalinux9, centos-stream9, fedora39, fedora40, debian11, debian12, freebsd13, freebsd13r, freebsd14, freebsd14r, freebsd15, ubuntu20, ubuntu22, ubuntu24] # openzfs: # os: [almalinux8, almalinux9, centos-stream9, fedora39, fedora40, freebsd13, freebsd13r, freebsd14, freebsd14r, freebsd15, ubuntu20, ubuntu22, ubuntu24] # freebsd: + # os: [almalinux8, almalinux9, fedora39, fedora40, freebsd13, freebsd14, freebsd15, ubuntu20, ubuntu22, ubuntu24] + # os: [almalinux8, almalinux9, fedora39, fedora40, ubuntu22, ubuntu24] + os: [almalinux8] + # os: [freebsd13, freebsd13r, freebsd14, freebsd14r, freebsd15] runs-on: ubuntu-24.04 steps: @@ -24,18 +28,18 @@ jobs: with: ref: ${{ github.event.pull_request.head.sha }} - - name: Setup SSH - run: | - mkdir -p $HOME/.ssh - echo "ConnectTimeout 1" >> $HOME/.ssh/config - echo "StrictHostKeyChecking no" >> $HOME/.ssh/config - echo "${{ secrets.AUTHORIZED_KEYS }}" >> $HOME/.ssh/authorized_keys - echo "${{ secrets.SSH_KEY }}" > $HOME/.ssh/id_ed25519 - echo "${{ secrets.KNOWN_HOSTS }}" >> $HOME/.ssh/known_hosts - chmod 600 $HOME/.ssh/id_ed25519 - R=`shuf -n 1 -i 10000-60000` - echo "Port $R" - ssh -x -N -C -f -R $R:127.0.0.1:22 mcmilk@${{ secrets.SOME_HOST }} +# - name: Setup 
SSH +# run: | +# mkdir -p $HOME/.ssh +# echo "ConnectTimeout 1" >> $HOME/.ssh/config +# echo "StrictHostKeyChecking no" >> $HOME/.ssh/config +# echo "${{ secrets.AUTHORIZED_KEYS }}" >> $HOME/.ssh/authorized_keys +# echo "${{ secrets.SSH_KEY }}" > $HOME/.ssh/id_ed25519 +# echo "${{ secrets.KNOWN_HOSTS }}" >> $HOME/.ssh/known_hosts +# chmod 600 $HOME/.ssh/id_ed25519 +# R=`shuf -n 1 -i 10000-60000` +# echo "Port $R" +# ssh -x -N -C -f -R $R:127.0.0.1:22 mcmilk@${{ secrets.SOME_HOST }} - name: Setup QEMU timeout-minutes: 10 @@ -87,12 +91,41 @@ jobs: run: .github/workflows/scripts/qemu-7-reports.sh - uses: actions/upload-artifact@v4 + id: artifact-upload-step if: always() with: name: Logs-functional-${{ matrix.os }} path: /tmp/qemu-${{ matrix.os }}.tar if-no-files-found: ignore + - name: Test Summary + if: success() || failure() + run: | + cat /var/tmp/test_results/summary.txt + echo "" + echo "Full logs for download:" + echo '${{ steps.artifact-upload-step.outputs.artifact-url }}' + echo "File listing:" + ls -l /var/tmp/test_results/ + + # Did we have a test failure? + if grep -vq 0 /var/tmp/test_results/vm*/exitcode.txt ; then + echo "One or more tests failed" + cat /var/tmp/test_results/summary-failure-logs.txt + # On test failures, the github test results page will autoscroll to the + # bottom of summary-with-logs.txt. The first thing we want to see + # is the summary page of failures so print another copy of it at + # the bottom of the "page" for easy access. 
+ echo "" + cat /var/tmp/test_results/summary.txt + echo "" + echo "Full logs for download:" + echo '${{ steps.artifact-upload-step.outputs.artifact-url }}' + false + else + true + fi + cleanup: if: always() name: Cleanup diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_copies/zfs_copies.kshlib b/tests/zfs-tests/tests/functional/cli_root/zfs_copies/zfs_copies.kshlib index a7a93a3046d2..2456fbabc20d 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_copies/zfs_copies.kshlib +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_copies/zfs_copies.kshlib @@ -155,5 +155,16 @@ function do_vol_test # Ubuntu 20.04 wants a sync here log_must sync + + # Not sure if this 'udevadm settle' is needed, but we once saw this + # failure and want to attempt to mitigate it: + # + # ERROR: zfs destroy testpool/testvol1 exited 1 + # cannot destroy 'testpool/testvol1': dataset is busy + # + if is_linux; then + udevadm settle + fi + log_must zfs destroy $vol } diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_008_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_008_pos.ksh index 6be2ad5a7410..8a35f1ae51ca 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_008_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_008_pos.ksh @@ -69,12 +69,12 @@ for raid_type in "draid2:3d:6c:1s" "raidz2"; do log_mustnot eval "zpool status -e $TESTPOOL2 | grep ONLINE" # Check no ONLINE slow vdevs are show. Then mark IOs greater than - # 10ms slow, delay IOs 20ms to vdev6, check slow IOs. + # 160ms slow, delay IOs 320ms to vdev6, check slow IOs. 
log_must check_vdev_state $TESTPOOL2 $TESTDIR/vdev6 "ONLINE" log_mustnot eval "zpool status -es $TESTPOOL2 | grep ONLINE" - log_must set_tunable64 ZIO_SLOW_IO_MS 10 - log_must zinject -d $TESTDIR/vdev6 -D20:100 $TESTPOOL2 + log_must set_tunable64 ZIO_SLOW_IO_MS 160 + log_must zinject -d $TESTDIR/vdev6 -D320:100 $TESTPOOL2 log_must mkfile 1048576 /$TESTPOOL2/testfile sync_pool $TESTPOOL2 log_must set_tunable64 ZIO_SLOW_IO_MS $OLD_SLOW_IO @@ -91,6 +91,7 @@ for raid_type in "draid2:3d:6c:1s" "raidz2"; do log_must check_vdev_state $TESTPOOL2 $TESTDIR/vdev1 "ONLINE" log_must check_vdev_state $TESTPOOL2 $TESTDIR/vdev2 "ONLINE" log_must check_vdev_state $TESTPOOL2 $TESTDIR/vdev3 "ONLINE" + log_mustnot eval "zpool status -es $TESTPOOL2 | grep $TESTDIR/vdev1 | grep ONLINE" log_mustnot eval "zpool status -es $TESTPOOL2 | grep $TESTDIR/vdev2 | grep ONLINE" log_mustnot eval "zpool status -es $TESTPOOL2 | grep $TESTDIR/vdev3 | grep ONLINE" diff --git a/tests/zfs-tests/tests/functional/crtime/crtime_001_pos.ksh b/tests/zfs-tests/tests/functional/crtime/crtime_001_pos.ksh index b2d0e7378652..1d16a8f83866 100755 --- a/tests/zfs-tests/tests/functional/crtime/crtime_001_pos.ksh +++ b/tests/zfs-tests/tests/functional/crtime/crtime_001_pos.ksh @@ -63,7 +63,10 @@ for arg in ${args[*]}; do fi log_must touch $TESTFILE typeset -i crtime1=$(stat_crtime $TESTFILE) - if (( crtime1 != crtime )); then + + # On slow test machines, there's a slight chance crtime1 rolls over + # to the next second. If that happens count it as ok. + if ! 
within_tolerance $crtime1 $crtime 1 ; then log_fail "touch modified crtime ($crtime1 != $crtime)" fi done diff --git a/tests/zfs-tests/tests/functional/raidz/raidz_expand_001_pos.ksh b/tests/zfs-tests/tests/functional/raidz/raidz_expand_001_pos.ksh index 063d7fa735df..1ff390366401 100755 --- a/tests/zfs-tests/tests/functional/raidz/raidz_expand_001_pos.ksh +++ b/tests/zfs-tests/tests/functional/raidz/raidz_expand_001_pos.ksh @@ -137,7 +137,7 @@ function test_scrub # typeset nparity=$2 typeset dir=$3 typeset combrec=$4 - + log_note "Begin: $1, $2, $3, $4" reflow_size=$(get_pool_prop allocated $pool) randbyte=$(( ((RANDOM<<15) + RANDOM) % $reflow_size )) log_must set_tunable64 RAIDZ_EXPAND_MAX_REFLOW_BYTES $randbyte @@ -151,10 +151,18 @@ function test_scrub # dd conv=notrunc if=/dev/zero of=$dir/dev-$i \ bs=1M seek=4 count=$(($dev_size_mb-4)) done + log_must sync log_must zpool import -o cachefile=none -d $dir $pool - + if is_pool_scrubbing $pool ; then + wait_scrubbed $pool + fi + log_note "=== About to scrub1 ===" + log_note "$(zpool status -vs)" log_must zpool scrub -w $pool + log_note "=== After scrub1 ===" + log_note "$(zpool status -vs)" + log_must zpool clear $pool log_must zpool export $pool @@ -163,10 +171,17 @@ function test_scrub # dd conv=notrunc if=/dev/zero of=$dir/dev-$i \ bs=1M seek=4 count=$(($dev_size_mb-4)) done + log_must sync log_must zpool import -o cachefile=none -d $dir $pool - + if is_pool_scrubbing $pool ; then + wait_scrubbed $pool + fi + log_note "=== About to scrub2 ===" + log_note "$(zpool status -vs)" log_must zpool scrub -w $pool + log_note "=== After scrub2 ===" + log_note "$(zpool status -vs)" log_must check_pool_status $pool "errors" "No known data errors" diff --git a/tests/zfs-tests/tests/functional/raidz/raidz_expand_002_pos.ksh b/tests/zfs-tests/tests/functional/raidz/raidz_expand_002_pos.ksh index 004f3d1f9255..e416926d1a06 100755 --- a/tests/zfs-tests/tests/functional/raidz/raidz_expand_002_pos.ksh +++ 
b/tests/zfs-tests/tests/functional/raidz/raidz_expand_002_pos.ksh @@ -105,6 +105,10 @@ for disk in ${disks[$(($nparity+2))..$devs]}; do log_fail "pool $pool not expanded" fi + # It's possible the pool could be auto scrubbing here. If so, wait. + if is_pool_scrubbing $pool ; then + wait_scrubbed $pool + fi verify_pool $pool pool_size=$expand_size