Skip to content

Commit

Permalink
Fix and improve the reporting of errors from timeouts and crashes (#82)
Browse files Browse the repository at this point in the history
* Test we retry on timeouts

* Refactor test-error handling to always record the error

* Record more informative error: include timeout/abort info

* Update reference tests

* Test case where we timeout then pass on retry

* fixup! Test case where we timeout then pass on retry

* fixup! Test case where we timeout then pass on retry

* Reduce time of test so worker=0 case isn't so slow

* Handle test failure case differently to worker timeout or crash

* Bump version

* fixup! Handle test failure case differently to worker timeout or crash

* fixup! fixup! Handle test failure case differently to worker timeout or crash

* Update test/integrationtests.jl - typo

* Don't use `tempdir()`
  • Loading branch information
nickrobinson251 authored Jun 21, 2023
1 parent 4355418 commit 0859990
Show file tree
Hide file tree
Showing 8 changed files with 704 additions and 292 deletions.
5 changes: 3 additions & 2 deletions Project.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
name = "ReTestItems"
uuid = "817f1d60-ba6b-4fd5-9520-3cf149f6a823"
version = "1.8.1"
version = "1.9.0"

[deps]
Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
Expand All @@ -23,8 +23,9 @@ DeepDiffs = "ab62b9b5-e342-54a8-a765-a90f495de1a6"
IOCapture = "b5f81e59-6552-4d32-b1f0-c071b021bf89"
Logging = "56ddb016-857b-54e1-b83d-db4d58db5568"
Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
XMLDict = "228000da-037f-5747-90a9-8195ccbf91a5"

[targets]
test = ["AutoHashEquals", "DeepDiffs", "IOCapture", "Logging", "Pkg", "Test", "XMLDict"]
test = ["AutoHashEquals", "DeepDiffs", "IOCapture", "Logging", "Pkg", "Random", "Test", "XMLDict"]
104 changes: 52 additions & 52 deletions src/ReTestItems.jl
Original file line number Diff line number Diff line change
Expand Up @@ -354,16 +354,31 @@ function start_worker(proj_name, nworker_threads, worker_init_expr, ntestitems)
return w
end

function record_timeout!(testitem, run_number::Int, timeout_limit::Real)
timeout_s = round(Int, timeout_limit)
time_str = if timeout_s < 60
string(timeout_s, "s")
else
mins, secs = divrem(timeout_s, 60)
if iszero(secs)
string(mins, "m")
else
string(mins, "m", lpad(secs, 2, "0"), "s")
end
end
msg = "Timed out after $time_str evaluating test item $(repr(testitem.name)) (run=$run_number)"
record_test_error!(testitem, msg)
end

# This is only used to signal that we need to retry. We don't use Test.TestSetException as
# that requires information that we are not going to use anyway.
struct TestSetFailure <: Exception end
throw_if_failed(ts) = ts.anynonpass ? throw(TestSetFailure()) : nothing
function record_worker_terminated!(testitem, run_number::Int)
msg = "Worker aborted evaluating test item $(repr(testitem.name)) (run=$run_number)"
record_test_error!(testitem, msg)
end

function record_test_error!(testitem, ntries)
function record_test_error!(testitem, msg)
Test.TESTSET_PRINT_ENABLE[] = false
ts = DefaultTestSet(testitem.name)
err = ErrorException("test item $(repr(testitem.name)) didn't succeed after $ntries tries")
err = ErrorException(msg)
Test.record(ts, Test.Error(:nontest_error, Test.Expr(:tuple), err,
Base.ExceptionStack([(exception=err, backtrace=Union{Ptr{Nothing}, Base.InterpreterIP}[])]),
LineNumberNode(testitem.line, testitem.file)))
Expand All @@ -384,7 +399,7 @@ function start_and_manage_worker(
)
ntestitems = length(testitems.testitems)
worker = start_worker(proj_name, nworker_threads, worker_init_expr, ntestitems)
nretries = 0
run_number = 1
while testitem !== nothing
ch = Channel{TestItemResult}(1)
testitem.workerid[] = worker.pid
Expand Down Expand Up @@ -415,70 +430,55 @@ function start_and_manage_worker(
ts = testitem_result.testset
push!(testitem.testsets, ts)
push!(testitem.stats, testitem_result.stats)
print_errors_and_captured_logs(testitem, nretries + 1; logs)
print_errors_and_captured_logs(testitem, run_number; logs)
report_empty_testsets(testitem, ts)
# Run GC to free memory on the worker before next testitem.
@debugv 2 "Running GC on $worker"
remote_fetch(worker, :(GC.gc(true); GC.gc(false)))
# If the result isn't a pass, we throw to go to the outer try-catch
throw_if_failed(ts)
testitem = next_testitem(testitems, testitem.id[])
nretries = 0
if ts.anynonpass && (run_number != (1 + retry_limit))
run_number += 1
@info "Retrying $(repr(testitem.name)) on $worker. Run=$run_number."
else
testitem = next_testitem(testitems, testitem.id[])
run_number = 1
end
finally
close(timer)
end
catch e
@debugv 2 "Error" exception=e
if !(e isa WorkerTerminatedException || e isa TimeoutException || e isa TestSetFailure)
# we don't expect any other kind of error, so rethrow, which will propagate
# back up to the main coordinator task and throw to the user
rethrow()
end

if !(e isa TestSetFailure)
println(DEFAULT_STDOUT[])
# Explicitly show captured logs or say there weren't any in case we're about
# to terminte the worker
_print_captured_logs(DEFAULT_STDOUT[], testitem, nretries + 1)
end

println(DEFAULT_STDOUT[])
_print_captured_logs(DEFAULT_STDOUT[], testitem, run_number)
# Handle the exception
if e isa TimeoutException
@debugv 1 "Test item $(repr(testitem.name)) timed out. Terminating worker $worker"
terminate!(worker)
wait(worker)
@warn "$worker timed out evaluating test item $(repr(testitem.name)) afer $timeout seconds. \
Recording test error."
record_timeout!(testitem, run_number, timeout)
elseif e isa WorkerTerminatedException
@warn "$worker died evaluating test item $(repr(testitem.name)). \
Recording test error."
record_worker_terminated!(testitem, run_number)
else
# We don't expect any other kind of error, so rethrow, which will propagate
# back up to the main coordinator task and throw to the user
rethrow()
end
if nretries == retry_limit
if e isa TimeoutException
@warn "$worker timed out evaluating test item $(repr(testitem.name)) afer $timeout seconds. \
Recording test error, and starting a new worker."
record_test_error!(testitem, nretries + 1)
elseif e isa WorkerTerminatedException
@warn "$worker died evaluating test item $(repr(testitem.name)). \
Recording test error, and starting a new worker."
record_test_error!(testitem, nretries + 1)
else
@assert e isa TestSetFailure
# We already printed the error and recorded the testset.
end
# Handle retries
if run_number == (1 + retry_limit)
testitem = next_testitem(testitems, testitem.id[])
nretries = 0
run_number = 1
else
nretries += 1
if e isa TimeoutException
@warn "$worker timed out evaluating test item $(repr(testitem.name)) afer $timeout seconds. \
Starting a new worker and retrying. Retry=$nretries."
elseif e isa WorkerTerminatedException
@warn "$worker died evaluating test item $(repr(testitem.name)). \
Starting a new worker and retrying. Retry=$nretries."
else
@assert e isa TestSetFailure
@warn "Test item $(repr(testitem.name)) failed. Retrying on $worker. Retry=$nretries."
end
run_number += 1
@info "Retrying $(repr(testitem.name)) on a new worker. Run=$run_number."
end
if (e isa TimeoutException || e isa WorkerTerminatedException)
# The worker was terminated, so replace it unless there are no more testitems to run
if testitem !== nothing
worker = start_worker(proj_name, nworker_threads, worker_init_expr, ntestitems)
end
# now we loop back around to reschedule the testitem
# Now loop back around to reschedule the testitem
continue
end
end
Expand Down
36 changes: 26 additions & 10 deletions test/integrationtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -604,29 +604,45 @@ end
# Test the error is as expected
err = only(non_passes(results))
@test err.test_type == :nontest_error
@test err.value == string(ErrorException("test item \"Abort\" didn't succeed after 1 tries"))
@test err.value == string(ErrorException("Worker aborted evaluating test item \"Abort\" (run=1)"))
end

@testset "test retrying failing testitem" begin
file = joinpath(TEST_FILES_DIR, "_retry_tests.jl")
results = encased_testset(()->runtests(file; nworkers=1, retries=2))
# This directory must match what's set in `_retry_tests`
tmpdir = joinpath("/tmp", "JL_RETESTITEMS_TEST_TMPDIR")
# must run with `testitem_timeout < 20` for test to timeout as expected.
# and must run with `nworkers > 0` for retries to be supported.
results = encased_testset(()->runtests(file; nworkers=1, retries=2, testitem_timeout=3))
# Test we _ran_ each test-item the expected number of times
read_count(x) = parse(Int, read(x, String))
# Passes on second attempt, so only need to retry once.
@test read_count(joinpath(tempdir(), "num_runs_1")) == 2
@test read_count(joinpath(tmpdir, "num_runs_1")) == 2
# Doesn't pass til third attempt, so needs both retries.
@test read_count(joinpath(tempdir(), "num_runs_2")) == 3
@test read_count(joinpath(tmpdir, "num_runs_2")) == 3
# Doesn't pass ever, so need all retries. This testitem set `retries=4` which is greater
# than the `retries=2` that `runtests` set, so we should retry 4 times.
@test read_count(joinpath(tempdir(), "num_runs_3")) == 5
@test read_count(joinpath(tmpdir, "num_runs_3")) == 5
# Doesn't pass ever, so need all retries. This testitem set `retries=1` which is less
# than the `retries=2` that `runtests` set, so we should retry 2 times.
@test read_count(joinpath(tempdir(), "num_runs_4")) == 3
@test read_count(joinpath(tmpdir, "num_runs_4")) == 3
# Times out always, so should retry as many times as allowed.
# Since it will be a new worker for each retry, we write one file for each.
@test count(contains("num_runs_5"), readdir(tmpdir)) == 3
# Times out on first run, then passes on second attempt.
@test count(contains("num_runs_6"), readdir(tmpdir)) == 2

# Test we _report_ the expected number of test-items
@test n_tests(results) == 4
# The first two testitems pass on retry.
@test n_passed(results) == 2
@test n_tests(results) == 6
# Testitems 1, 2, and 6 pass on retry.
@test n_passed(results) == 3

# Clear out any files created by this testset
foreach(readdir(tmpdir; join=true)) do tmp
# `force` in case it gets cleaned up between `readdir` and `rm`
contains(tmp, "num_runs") && rm(tmp; force=true)
end
rm(tmpdir; force=true)
end

@testset "testitem timeout" begin
Expand All @@ -641,7 +657,7 @@ end
# Test the error is as expected
err = only(non_passes(results))
@test err.test_type == :nontest_error
@test err.value == string(ErrorException("test item \"Test item takes 60 seconds\" didn't succeed after 1 tries"))
@test err.value == string(ErrorException("Timed out after 4s evaluating test item \"Test item takes 60 seconds\" (run=1)"))
end

@testset "Error outside `@testitem`" begin
Expand Down
18 changes: 16 additions & 2 deletions test/junit_xml.jl
Original file line number Diff line number Diff line change
Expand Up @@ -100,11 +100,25 @@ end
mktempdir() do dir
withenv("RETESTITEMS_REPORT_LOCATION" => dir, "RETESTITEMS_NWORKERS" => nworkers) do
try # Ignore the fact that the `_retry_tests.jl` testset has failures/errors.
run(`$(Base.julia_cmd()) --project -e 'using ReTestItems; runtests("testfiles/_retry_tests.jl"; report=true, retries=2)'`)
run(`$(Base.julia_cmd()) --project -e 'using ReTestItems; runtests("testfiles/_retry_tests.jl"; testitem_timeout=5, report=true, retries=2)'`)
catch
finally
# running `_retry_tests` creates files which we need to clean up before
# we run `_retry_tests` again, because these files store state that is
# used to control the behaviour of the tests.
foreach(readdir(joinpath("/tmp", "JL_RETESTITEMS_TEST_TMPDIR"); join=true)) do tmp
rm(tmp; force=true)
end
end
report = only(filter(endswith("xml"), readdir(dir, join=true)))
test_reference(joinpath(REF_DIR, "retry_tests_report.xml"), report)
# timeouts are not supported when running without workers, so
# with 0 workers the "retry tests" that are expected to timeout will pass.
ref = if nworkers == 0
joinpath(REF_DIR, "retry_tests_report_0worker.xml")
else
joinpath(REF_DIR, "retry_tests_report_1worker.xml")
end
test_reference(ref, report)
end
end
end
Expand Down
Loading

2 comments on commit 0859990

@nickrobinson251
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@JuliaRegistrator
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Registration pull request created: JuliaRegistries/General/86007

After the above pull request is merged, it is recommended that a tag is created on this repository for the registered package version.

This will be done automatically if the Julia TagBot GitHub Action is installed, or can be done manually through the github interface, or via:

git tag -a v1.9.0 -m "<description of version>" 085999065a4cd98da0b170062c5a37907f43fba5
git push origin v1.9.0

Please sign in to comment.