From 836a6723718a774a2a7e374f5f65a7fb8c742622 Mon Sep 17 00:00:00 2001 From: Tony Hutter Date: Fri, 2 Aug 2024 11:23:35 -0700 Subject: [PATCH] Testing raidz_expand_001_pos failure --- .github/workflows/scripts/qemu-6-tests.sh | 8 ++- .github/workflows/scripts/qemu-7-reports.sh | 14 ++++- .github/workflows/zfs-qemu.yml | 61 ++++++++++++++----- .../cli_root/zfs_copies/zfs_copies.kshlib | 11 ++++ .../zpool_status/zpool_status_008_pos.ksh | 7 ++- .../functional/crtime/crtime_001_pos.ksh | 5 +- .../functional/raidz/raidz_expand_001_pos.ksh | 21 ++++++- .../functional/raidz/raidz_expand_002_pos.ksh | 4 ++ 8 files changed, 107 insertions(+), 24 deletions(-) diff --git a/.github/workflows/scripts/qemu-6-tests.sh b/.github/workflows/scripts/qemu-6-tests.sh index 488dee801d85..7bbcc8e680ae 100755 --- a/.github/workflows/scripts/qemu-6-tests.sh +++ b/.github/workflows/scripts/qemu-6-tests.sh @@ -71,8 +71,14 @@ esac cd /var/tmp uname -a > uname.txt +# ONLY FOR TESTING DO NOT COMMIT +# +TAGS=$2/$3 + +# TAGS=raidz + # run functional testings -$TDIR/zfs-tests.sh -vK -s 3G -T $2/$3 +$TDIR/zfs-tests.sh -vK -s 3G -T $TAGS RV=$? 
# we wont fail here, this will be done later diff --git a/.github/workflows/scripts/qemu-7-reports.sh b/.github/workflows/scripts/qemu-7-reports.sh index 3338f491166a..108774a79e96 100755 --- a/.github/workflows/scripts/qemu-7-reports.sh +++ b/.github/workflows/scripts/qemu-7-reports.sh @@ -52,16 +52,26 @@ for i in `seq 1 $VMs`; do echo "##[endgroup]" done +RESPATH="/var/tmp/test_results" + # all tests without grouping: MERGE="$BASE/.github/workflows/scripts/merge_summary.awk" -$MERGE vm*log.txt | $BASE/scripts/zfs-tests-color.sh +$MERGE vm*log.txt | $BASE/scripts/zfs-tests-color.sh | tee $RESPATH/summary.txt -RESPATH="/var/tmp/test_results" for i in `seq 1 $VMs`; do rsync -arL zfs@192.168.122.1$i:$RESPATH/current $RESPATH/vm$i || true scp zfs@192.168.122.1$i:"/var/tmp/*.txt" $RESPATH/vm$i || true done cp -f /var/tmp/*.txt $RESPATH || true + + +# Save a list of all failed test logs for easy access +awk '/\[FAIL\]|\[KILLED\]/{ show=1; print; next; }; /\[SKIP\]|\[PASS\]/{ show=0; } show' \ + $RESPATH/vm*/current/log >> $RESPATH/summary-failure-logs.txt + +cp $RESPATH/summary.txt $RESPATH/summary-with-logs.txt +cat $RESPATH/summary-failure-logs.txt >> $RESPATH/summary-with-logs.txt + tar cf /tmp/qemu-$OS.tar -C $RESPATH -h . 
|| true echo "********************************************************************" diff --git a/.github/workflows/zfs-qemu.yml b/.github/workflows/zfs-qemu.yml index bfcbb468f59d..320a7e820f3f 100644 --- a/.github/workflows/zfs-qemu.yml +++ b/.github/workflows/zfs-qemu.yml @@ -2,7 +2,7 @@ name: zfs-qemu on: push: - pull_request: +# pull_request: jobs: @@ -13,10 +13,14 @@ jobs: matrix: # all: # os: [almalinux8, almalinux9, archlinux, centos-stream9, fedora39, fedora40, debian11, debian12, freebsd13, freebsd13r, freebsd14, freebsd14r, freebsd15, ubuntu20, ubuntu22, ubuntu24] - os: [almalinux8, almalinux9, centos-stream9, fedora39, fedora40, debian11, debian12, freebsd13, freebsd13r, freebsd14, freebsd14r, freebsd15, ubuntu20, ubuntu22, ubuntu24] + # os: [almalinux8, almalinux9, centos-stream9, fedora39, fedora40, debian11, debian12, freebsd13, freebsd13r, freebsd14, freebsd14r, freebsd15, ubuntu20, ubuntu22, ubuntu24] # openzfs: # os: [almalinux8, almalinux9, centos-stream9, fedora39, fedora40, freebsd13, freebsd13r, freebsd14, freebsd14r, freebsd15, ubuntu20, ubuntu22, ubuntu24] # freebsd: + # os: [almalinux8, almalinux9, fedora39, fedora40, freebsd13, freebsd14, freebsd15, ubuntu20, ubuntu22, ubuntu24] + # os: [almalinux8, almalinux9, fedora39, fedora40, ubuntu22, ubuntu24] + os: [almalinux8] + # os: [freebsd13, freebsd13r, freebsd14, freebsd14r, freebsd15] runs-on: ubuntu-24.04 steps: @@ -24,18 +28,18 @@ jobs: with: ref: ${{ github.event.pull_request.head.sha }} - - name: Setup SSH - run: | - mkdir -p $HOME/.ssh - echo "ConnectTimeout 1" >> $HOME/.ssh/config - echo "StrictHostKeyChecking no" >> $HOME/.ssh/config - echo "${{ secrets.AUTHORIZED_KEYS }}" >> $HOME/.ssh/authorized_keys - echo "${{ secrets.SSH_KEY }}" > $HOME/.ssh/id_ed25519 - echo "${{ secrets.KNOWN_HOSTS }}" >> $HOME/.ssh/known_hosts - chmod 600 $HOME/.ssh/id_ed25519 - R=`shuf -n 1 -i 10000-60000` - echo "Port $R" - ssh -x -N -C -f -R $R:127.0.0.1:22 mcmilk@${{ secrets.SOME_HOST }} +# - name: Setup 
SSH +# run: | +# mkdir -p $HOME/.ssh +# echo "ConnectTimeout 1" >> $HOME/.ssh/config +# echo "StrictHostKeyChecking no" >> $HOME/.ssh/config +# echo "${{ secrets.AUTHORIZED_KEYS }}" >> $HOME/.ssh/authorized_keys +# echo "${{ secrets.SSH_KEY }}" > $HOME/.ssh/id_ed25519 +# echo "${{ secrets.KNOWN_HOSTS }}" >> $HOME/.ssh/known_hosts +# chmod 600 $HOME/.ssh/id_ed25519 +# R=`shuf -n 1 -i 10000-60000` +# echo "Port $R" +# ssh -x -N -C -f -R $R:127.0.0.1:22 mcmilk@${{ secrets.SOME_HOST }} - name: Setup QEMU timeout-minutes: 10 @@ -87,12 +91,41 @@ jobs: run: .github/workflows/scripts/qemu-7-reports.sh - uses: actions/upload-artifact@v4 + id: artifact-upload-step if: always() with: name: Logs-functional-${{ matrix.os }} path: /tmp/qemu-${{ matrix.os }}.tar if-no-files-found: ignore + - name: Test Summary + if: success() || failure() + run: | + cat /var/tmp/test_results/summary.txt + echo "" + echo "Full logs for download:" + echo '${{ steps.artifact-upload-step.outputs.artifact-url }}' + echo "File listing:" + ls -l /var/tmp/test_results/ + + # Did we have a test failure? + if grep -vq 0 /var/tmp/test_results/vm*/exitcode.txt ; then + echo "One or more tests failed" + cat /var/tmp/test_results/summary-failure-logs.txt + # On test failures, the github test results page will autoscroll to the + # bottom of summary-with-logs.txt. The first thing we want to see + # is the summary page of failures so print another copy of it at + # the bottom of the "page" for easy access. 
+ echo "" + cat /var/tmp/test_results/summary.txt + echo "" + echo "Full logs for download:" + echo '${{ steps.artifact-upload-step.outputs.artifact-url }}' + false + else + true + fi + cleanup: if: always() name: Cleanup diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_copies/zfs_copies.kshlib b/tests/zfs-tests/tests/functional/cli_root/zfs_copies/zfs_copies.kshlib index a7a93a3046d2..2456fbabc20d 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_copies/zfs_copies.kshlib +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_copies/zfs_copies.kshlib @@ -155,5 +155,16 @@ function do_vol_test # Ubuntu 20.04 wants a sync here log_must sync + + # Not sure if this 'udevadm settle' is needed, but we once saw this + # failure and want to attempt to mitigate it: + # + # ERROR: zfs destroy testpool/testvol1 exited 1 + # cannot destroy 'testpool/testvol1': dataset is busy + # + if is_linux; then + udevadm settle + fi + log_must zfs destroy $vol } diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_008_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_008_pos.ksh index 6be2ad5a7410..8a35f1ae51ca 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_008_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_008_pos.ksh @@ -69,12 +69,12 @@ for raid_type in "draid2:3d:6c:1s" "raidz2"; do log_mustnot eval "zpool status -e $TESTPOOL2 | grep ONLINE" # Check no ONLINE slow vdevs are show. Then mark IOs greater than - # 10ms slow, delay IOs 20ms to vdev6, check slow IOs. + # 160ms slow, delay IOs 320ms to vdev6, check slow IOs. 
log_must check_vdev_state $TESTPOOL2 $TESTDIR/vdev6 "ONLINE" log_mustnot eval "zpool status -es $TESTPOOL2 | grep ONLINE" - log_must set_tunable64 ZIO_SLOW_IO_MS 10 - log_must zinject -d $TESTDIR/vdev6 -D20:100 $TESTPOOL2 + log_must set_tunable64 ZIO_SLOW_IO_MS 160 + log_must zinject -d $TESTDIR/vdev6 -D320:100 $TESTPOOL2 log_must mkfile 1048576 /$TESTPOOL2/testfile sync_pool $TESTPOOL2 log_must set_tunable64 ZIO_SLOW_IO_MS $OLD_SLOW_IO @@ -91,6 +91,7 @@ for raid_type in "draid2:3d:6c:1s" "raidz2"; do log_must check_vdev_state $TESTPOOL2 $TESTDIR/vdev1 "ONLINE" log_must check_vdev_state $TESTPOOL2 $TESTDIR/vdev2 "ONLINE" log_must check_vdev_state $TESTPOOL2 $TESTDIR/vdev3 "ONLINE" + log_mustnot eval "zpool status -es $TESTPOOL2 | grep $TESTDIR/vdev1 | grep ONLINE" log_mustnot eval "zpool status -es $TESTPOOL2 | grep $TESTDIR/vdev2 | grep ONLINE" log_mustnot eval "zpool status -es $TESTPOOL2 | grep $TESTDIR/vdev3 | grep ONLINE" diff --git a/tests/zfs-tests/tests/functional/crtime/crtime_001_pos.ksh b/tests/zfs-tests/tests/functional/crtime/crtime_001_pos.ksh index b2d0e7378652..1d16a8f83866 100755 --- a/tests/zfs-tests/tests/functional/crtime/crtime_001_pos.ksh +++ b/tests/zfs-tests/tests/functional/crtime/crtime_001_pos.ksh @@ -63,7 +63,10 @@ for arg in ${args[*]}; do fi log_must touch $TESTFILE typeset -i crtime1=$(stat_crtime $TESTFILE) - if (( crtime1 != crtime )); then + + # On slow test machines, there's a slight chance crtime1 rolls over + # to the next second. If that happens count it as ok. + if ! 
within_tolerance $crtime1 $crtime 1 ; then log_fail "touch modified crtime ($crtime1 != $crtime)" fi done diff --git a/tests/zfs-tests/tests/functional/raidz/raidz_expand_001_pos.ksh b/tests/zfs-tests/tests/functional/raidz/raidz_expand_001_pos.ksh index 063d7fa735df..1ff390366401 100755 --- a/tests/zfs-tests/tests/functional/raidz/raidz_expand_001_pos.ksh +++ b/tests/zfs-tests/tests/functional/raidz/raidz_expand_001_pos.ksh @@ -137,7 +137,7 @@ function test_scrub # typeset nparity=$2 typeset dir=$3 typeset combrec=$4 - + log_note "Begin: $1, $2, $3, $4" reflow_size=$(get_pool_prop allocated $pool) randbyte=$(( ((RANDOM<<15) + RANDOM) % $reflow_size )) log_must set_tunable64 RAIDZ_EXPAND_MAX_REFLOW_BYTES $randbyte @@ -151,10 +151,18 @@ function test_scrub # dd conv=notrunc if=/dev/zero of=$dir/dev-$i \ bs=1M seek=4 count=$(($dev_size_mb-4)) done + log_must sync log_must zpool import -o cachefile=none -d $dir $pool - + if is_pool_scrubbing $pool ; then + wait_scrubbed $pool + fi + log_note "=== About to scrub1 ===" + log_note "$(zpool status -vs)" log_must zpool scrub -w $pool + log_note "=== After scrub1 ===" + log_note "$(zpool status -vs)" + log_must zpool clear $pool log_must zpool export $pool @@ -163,10 +171,17 @@ function test_scrub # dd conv=notrunc if=/dev/zero of=$dir/dev-$i \ bs=1M seek=4 count=$(($dev_size_mb-4)) done + log_must sync log_must zpool import -o cachefile=none -d $dir $pool - + if is_pool_scrubbing $pool ; then + wait_scrubbed $pool + fi + log_note "=== About to scrub2 ===" + log_note "$(zpool status -vs)" log_must zpool scrub -w $pool + log_note "=== After scrub2 ===" + log_note "$(zpool status -vs)" log_must check_pool_status $pool "errors" "No known data errors" diff --git a/tests/zfs-tests/tests/functional/raidz/raidz_expand_002_pos.ksh b/tests/zfs-tests/tests/functional/raidz/raidz_expand_002_pos.ksh index 004f3d1f9255..e416926d1a06 100755 --- a/tests/zfs-tests/tests/functional/raidz/raidz_expand_002_pos.ksh +++ 
b/tests/zfs-tests/tests/functional/raidz/raidz_expand_002_pos.ksh @@ -105,6 +105,10 @@ for disk in ${disks[$(($nparity+2))..$devs]}; do log_fail "pool $pool not expanded" fi + # It's possible the pool could be auto scrubbing here. If so, wait. + if is_pool_scrubbing $pool ; then + wait_scrubbed $pool + fi verify_pool $pool pool_size=$expand_size