Fix and improve the reporting of errors from timeouts and crashes (#82)

* Test we retry on timeouts * Refactor test-error handling to always record the error * Record more informative error: include timeout/abort info * Update reference tests * Test case where we timeout then pass on retry * fixup! Test case where we timeout then pass on retry * fixup! Test case where we timeout then pass on retry * Reduce time of test so worker=0 case isn't so slow * Handle test failure case differently to worker timeout or crash * Bump version * fixup! Handle test failure case differently to worker timeout or crash * fixup! fixup! Handle test failure case differently to worker timeout or crash * Update test/integrationtests.jl - typo * Don't use `tempdir()`
JuliaTesting · Jun 21, 2023 · 0859990 · 0859990 · nickrobinson251 · Jun 21, 2023
1 parent 4355418
commit 0859990
Show file tree

Hide file tree

Showing 8 changed files with 704 additions and 292 deletions.
diff --git a/Project.toml b/Project.toml
@@ -1,6 +1,6 @@
 name = "ReTestItems"
 uuid = "817f1d60-ba6b-4fd5-9520-3cf149f6a823"
-version = "1.8.1"
+version = "1.9.0"
 
 [deps]
 Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
@@ -23,8 +23,9 @@ DeepDiffs = "ab62b9b5-e342-54a8-a765-a90f495de1a6"
 IOCapture = "b5f81e59-6552-4d32-b1f0-c071b021bf89"
 Logging = "56ddb016-857b-54e1-b83d-db4d58db5568"
 Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
+Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 XMLDict = "228000da-037f-5747-90a9-8195ccbf91a5"
 
 [targets]
-test = ["AutoHashEquals", "DeepDiffs", "IOCapture", "Logging", "Pkg", "Test", "XMLDict"]
+test = ["AutoHashEquals", "DeepDiffs", "IOCapture", "Logging", "Pkg", "Random", "Test", "XMLDict"]
diff --git a/src/ReTestItems.jl b/src/ReTestItems.jl
@@ -354,16 +354,31 @@ function start_worker(proj_name, nworker_threads, worker_init_expr, ntestitems)
     return w
 end
 
+function record_timeout!(testitem, run_number::Int, timeout_limit::Real)
+    timeout_s = round(Int, timeout_limit)
+    time_str = if timeout_s < 60
+        string(timeout_s, "s")
+    else
+        mins, secs = divrem(timeout_s, 60)
+        if iszero(secs)
+            string(mins, "m")
+        else
+            string(mins, "m", lpad(secs, 2, "0"), "s")
+        end
+    end
+    msg = "Timed out after $time_str evaluating test item $(repr(testitem.name)) (run=$run_number)"
+    record_test_error!(testitem, msg)
+end
 
-# This is only used to signal that we need to retry. We don't use Test.TestSetException as
-# that requires information that we are not going to use anyway.
-struct TestSetFailure <: Exception end
-throw_if_failed(ts) = ts.anynonpass ? throw(TestSetFailure()) : nothing
+function record_worker_terminated!(testitem, run_number::Int)
+    msg = "Worker aborted evaluating test item $(repr(testitem.name)) (run=$run_number)"
+    record_test_error!(testitem, msg)
+end
 
-function record_test_error!(testitem, ntries)
+function record_test_error!(testitem, msg)
     Test.TESTSET_PRINT_ENABLE[] = false
     ts = DefaultTestSet(testitem.name)
-    err = ErrorException("test item $(repr(testitem.name)) didn't succeed after $ntries tries")
+    err = ErrorException(msg)
     Test.record(ts, Test.Error(:nontest_error, Test.Expr(:tuple), err,
         Base.ExceptionStack([(exception=err, backtrace=Union{Ptr{Nothing}, Base.InterpreterIP}[])]),
         LineNumberNode(testitem.line, testitem.file)))
@@ -384,7 +399,7 @@ function start_and_manage_worker(
 )
     ntestitems = length(testitems.testitems)
     worker = start_worker(proj_name, nworker_threads, worker_init_expr, ntestitems)
-    nretries = 0
+    run_number = 1
     while testitem !== nothing
         ch = Channel{TestItemResult}(1)
         testitem.workerid[] = worker.pid
@@ -415,70 +430,55 @@ function start_and_manage_worker(
                 ts = testitem_result.testset
                 push!(testitem.testsets, ts)
                 push!(testitem.stats, testitem_result.stats)
-                print_errors_and_captured_logs(testitem, nretries + 1; logs)
+                print_errors_and_captured_logs(testitem, run_number; logs)
                 report_empty_testsets(testitem, ts)
                 # Run GC to free memory on the worker before next testitem.
                 @debugv 2 "Running GC on $worker"
                 remote_fetch(worker, :(GC.gc(true); GC.gc(false)))
-                # If the result isn't a pass, we throw to go to the outer try-catch
-                throw_if_failed(ts)
-                testitem = next_testitem(testitems, testitem.id[])
-                nretries = 0
+                if ts.anynonpass && (run_number != (1 + retry_limit))
+                    run_number += 1
+                    @info "Retrying $(repr(testitem.name)) on $worker. Run=$run_number."
+                else
+                    testitem = next_testitem(testitems, testitem.id[])
+                    run_number = 1
+                end
             finally
                 close(timer)
             end
         catch e
             @debugv 2 "Error" exception=e
-            if !(e isa WorkerTerminatedException || e isa TimeoutException || e isa TestSetFailure)
-                # we don't expect any other kind of error, so rethrow, which will propagate
-                # back up to the main coordinator task and throw to the user
-                rethrow()
-            end
-
-            if !(e isa TestSetFailure)
-                println(DEFAULT_STDOUT[])
-                # Explicitly show captured logs or say there weren't any in case we're about
-                # to terminte the worker
-                _print_captured_logs(DEFAULT_STDOUT[], testitem, nretries + 1)
-            end
-
+            println(DEFAULT_STDOUT[])
+            _print_captured_logs(DEFAULT_STDOUT[], testitem, run_number)
+            # Handle the exception
             if e isa TimeoutException
                 @debugv 1 "Test item $(repr(testitem.name)) timed out. Terminating worker $worker"
                 terminate!(worker)
                 wait(worker)
+                @warn "$worker timed out evaluating test item $(repr(testitem.name)) afer $timeout seconds. \
+                    Recording test error."
+                record_timeout!(testitem, run_number, timeout)
+            elseif e isa WorkerTerminatedException
+                @warn "$worker died evaluating test item $(repr(testitem.name)). \
+                    Recording test error."
+                record_worker_terminated!(testitem, run_number)
+            else
+                # We don't expect any other kind of error, so rethrow, which will propagate
+                # back up to the main coordinator task and throw to the user
+                rethrow()
             end
-            if nretries == retry_limit
-                if e isa TimeoutException
-                    @warn "$worker timed out evaluating test item $(repr(testitem.name)) afer $timeout seconds. \
-                        Recording test error, and starting a new worker."
-                    record_test_error!(testitem, nretries + 1)
-                elseif e isa WorkerTerminatedException
-                    @warn "$worker died evaluating test item $(repr(testitem.name)). \
-                        Recording test error, and starting a new worker."
-                    record_test_error!(testitem, nretries + 1)
-                else
-                    @assert e isa TestSetFailure
-                    # We already printed the error and recorded the testset.
-                end
+            # Handle retries
+            if run_number == (1 + retry_limit)
                 testitem = next_testitem(testitems, testitem.id[])
-                nretries = 0
+                run_number = 1
             else
-                nretries += 1
-                if e isa TimeoutException
-                    @warn "$worker timed out evaluating test item $(repr(testitem.name)) afer $timeout seconds. \
-                        Starting a new worker and retrying. Retry=$nretries."
-                elseif e isa WorkerTerminatedException
-                    @warn "$worker died evaluating test item $(repr(testitem.name)). \
-                        Starting a new worker and retrying. Retry=$nretries."
-                else
-                    @assert e isa TestSetFailure
-                    @warn "Test item $(repr(testitem.name)) failed. Retrying on $worker. Retry=$nretries."
-                end
+                run_number += 1
+                @info "Retrying $(repr(testitem.name)) on a new worker. Run=$run_number."
             end
-            if (e isa TimeoutException || e isa WorkerTerminatedException)
+            # The worker was terminated, so replace it unless there are no more testitems to run
+            if testitem !== nothing
                 worker = start_worker(proj_name, nworker_threads, worker_init_expr, ntestitems)
             end
-            # now we loop back around to reschedule the testitem
+            # Now loop back around to reschedule the testitem
             continue
         end
     end

diff --git a/test/integrationtests.jl b/test/integrationtests.jl
@@ -604,29 +604,45 @@ end
     # Test the error is as expected
     err = only(non_passes(results))
     @test err.test_type == :nontest_error
-    @test err.value == string(ErrorException("test item \"Abort\" didn't succeed after 1 tries"))
+    @test err.value == string(ErrorException("Worker aborted evaluating test item \"Abort\" (run=1)"))
 end
 
 @testset "test retrying failing testitem" begin
     file = joinpath(TEST_FILES_DIR, "_retry_tests.jl")
-    results = encased_testset(()->runtests(file; nworkers=1, retries=2))
+    # This directory must match what's set in `_retry_tests`
+    tmpdir = joinpath("/tmp", "JL_RETESTITEMS_TEST_TMPDIR")
+    # must run with `testitem_timeout < 20` for test to timeout as expected.
+    # and must run with `nworkers > 0` for retries to be supported.
+    results = encased_testset(()->runtests(file; nworkers=1, retries=2, testitem_timeout=3))
     # Test we _ran_ each test-item the expected number of times
     read_count(x) = parse(Int, read(x, String))
     # Passes on second attempt, so only need to retry once.
-    @test read_count(joinpath(tempdir(), "num_runs_1")) == 2
+    @test read_count(joinpath(tmpdir, "num_runs_1")) == 2
     # Doesn't pass til third attempt, so needs both retries.
-    @test read_count(joinpath(tempdir(), "num_runs_2")) == 3
+    @test read_count(joinpath(tmpdir, "num_runs_2")) == 3
     # Doesn't pass ever, so need all retries. This testitem set `retries=4` which is greater
     # than the `retries=2` that `runtests` set, so we should retry 4 times.
-    @test read_count(joinpath(tempdir(), "num_runs_3")) == 5
+    @test read_count(joinpath(tmpdir, "num_runs_3")) == 5
     # Doesn't pass ever, so need all retries. This testitem set `retries=1` which is less
     # than the `retries=2` that `runtests` set, so we should retry 2 times.
-    @test read_count(joinpath(tempdir(), "num_runs_4")) == 3
+    @test read_count(joinpath(tmpdir, "num_runs_4")) == 3
+    # Times out always, so should retry as many times as allowed.
+    # Since it will be a new worker for each retry, we write one file for each.
+    @test count(contains("num_runs_5"), readdir(tmpdir)) == 3
+    # Times out on first run, then passes on second attempt.
+    @test count(contains("num_runs_6"), readdir(tmpdir)) == 2
 
     # Test we _report_ the expected number of test-items
-    @test n_tests(results) == 4
-    # The first two testitems pass on retry.
-    @test n_passed(results) == 2
+    @test n_tests(results) == 6
+    # Testitems 1, 2, and 6 pass on retry.
+    @test n_passed(results) == 3
+
+    # Clear out any files created by this testset
+    foreach(readdir(tmpdir; join=true)) do tmp
+        # `force` in case it gets cleaned up between `readdir` and `rm`
+        contains(tmp, "num_runs") && rm(tmp; force=true)
+    end
+    rm(tmpdir; force=true)
 end
 
 @testset "testitem timeout" begin
@@ -641,7 +657,7 @@ end
     # Test the error is as expected
     err = only(non_passes(results))
     @test err.test_type == :nontest_error
-    @test err.value == string(ErrorException("test item \"Test item takes 60 seconds\" didn't succeed after 1 tries"))
+    @test err.value == string(ErrorException("Timed out after 4s evaluating test item \"Test item takes 60 seconds\" (run=1)"))
 end
 
 @testset "Error outside `@testitem`" begin

diff --git a/test/junit_xml.jl b/test/junit_xml.jl
@@ -100,11 +100,25 @@ end
         mktempdir() do dir
             withenv("RETESTITEMS_REPORT_LOCATION" => dir, "RETESTITEMS_NWORKERS" => nworkers) do
                 try # Ignore the fact that the `_retry_tests.jl` testset has failures/errors.
-                    run(`$(Base.julia_cmd()) --project -e 'using ReTestItems; runtests("testfiles/_retry_tests.jl"; report=true, retries=2)'`)
+                    run(`$(Base.julia_cmd()) --project -e 'using ReTestItems; runtests("testfiles/_retry_tests.jl"; testitem_timeout=5, report=true, retries=2)'`)
                 catch
+                finally
+                    # running `_retry_tests` creates files which we need to clean up before
+                    # we run `_retry_tests` again, because these files store state that is
+                    # used to control the behaviour of the tests.
+                    foreach(readdir(joinpath("/tmp", "JL_RETESTITEMS_TEST_TMPDIR"); join=true)) do tmp
+                        rm(tmp; force=true)
+                    end
                 end
                 report = only(filter(endswith("xml"), readdir(dir, join=true)))
-                test_reference(joinpath(REF_DIR, "retry_tests_report.xml"), report)
+                # timeouts are not supported when running without workers, so
+                # with 0 workers the "retry tests" that are expected to timeout will pass.
+                ref = if nworkers == 0
+                    joinpath(REF_DIR, "retry_tests_report_0worker.xml")
+                else
+                    joinpath(REF_DIR, "retry_tests_report_1worker.xml")
+                end
+                test_reference(ref, report)
             end
         end
     end