Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix and improve the reporting of errors from timeouts and crashes #82

Merged
merged 15 commits into from
Jun 21, 2023
Merged
Show file tree
Hide file tree
Changes from 12 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions Project.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
name = "ReTestItems"
uuid = "817f1d60-ba6b-4fd5-9520-3cf149f6a823"
version = "1.8.1"
version = "1.9.0"

[deps]
Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
Expand All @@ -23,8 +23,9 @@ DeepDiffs = "ab62b9b5-e342-54a8-a765-a90f495de1a6"
IOCapture = "b5f81e59-6552-4d32-b1f0-c071b021bf89"
Logging = "56ddb016-857b-54e1-b83d-db4d58db5568"
Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
XMLDict = "228000da-037f-5747-90a9-8195ccbf91a5"

[targets]
test = ["AutoHashEquals", "DeepDiffs", "IOCapture", "Logging", "Pkg", "Test", "XMLDict"]
test = ["AutoHashEquals", "DeepDiffs", "IOCapture", "Logging", "Pkg", "Random", "Test", "XMLDict"]
104 changes: 52 additions & 52 deletions src/ReTestItems.jl
Original file line number Diff line number Diff line change
Expand Up @@ -354,16 +354,31 @@ function start_worker(proj_name, nworker_threads, worker_init_expr, ntestitems)
return w
end

function record_timeout!(testitem, run_number::Int, timeout_limit::Real)
timeout_s = round(Int, timeout_limit)
time_str = if timeout_s < 60
string(timeout_s, "s")
else
mins, secs = divrem(timeout_s, 60)
if iszero(secs)
string(mins, "m")
else
string(mins, "m", lpad(secs, 2, "0"), "s")
end
end
msg = "Timed out after $time_str evaluating test item $(repr(testitem.name)) (run=$run_number)"
record_test_error!(testitem, msg)
end

# This is only used to signal that we need to retry. We don't use Test.TestSetException as
# that requires information that we are not going to use anyway.
struct TestSetFailure <: Exception end
throw_if_failed(ts) = ts.anynonpass ? throw(TestSetFailure()) : nothing
function record_worker_terminated!(testitem, run_number::Int)
msg = "Worker aborted evaluating test item $(repr(testitem.name)) (run=$run_number)"
record_test_error!(testitem, msg)
end

function record_test_error!(testitem, ntries)
function record_test_error!(testitem, msg)
Test.TESTSET_PRINT_ENABLE[] = false
ts = DefaultTestSet(testitem.name)
err = ErrorException("test item $(repr(testitem.name)) didn't succeed after $ntries tries")
err = ErrorException(msg)
Test.record(ts, Test.Error(:nontest_error, Test.Expr(:tuple), err,
Base.ExceptionStack([(exception=err, backtrace=Union{Ptr{Nothing}, Base.InterpreterIP}[])]),
LineNumberNode(testitem.line, testitem.file)))
Expand All @@ -384,7 +399,7 @@ function start_and_manage_worker(
)
ntestitems = length(testitems.testitems)
worker = start_worker(proj_name, nworker_threads, worker_init_expr, ntestitems)
nretries = 0
run_number = 1
while testitem !== nothing
ch = Channel{TestItemResult}(1)
testitem.workerid[] = worker.pid
Expand Down Expand Up @@ -415,70 +430,55 @@ function start_and_manage_worker(
ts = testitem_result.testset
push!(testitem.testsets, ts)
push!(testitem.stats, testitem_result.stats)
print_errors_and_captured_logs(testitem, nretries + 1; logs)
print_errors_and_captured_logs(testitem, run_number; logs)
report_empty_testsets(testitem, ts)
# Run GC to free memory on the worker before next testitem.
@debugv 2 "Running GC on $worker"
remote_fetch(worker, :(GC.gc(true); GC.gc(false)))
# If the result isn't a pass, we throw to go to the outer try-catch
throw_if_failed(ts)
testitem = next_testitem(testitems, testitem.id[])
nretries = 0
if ts.anynonpass && (run_number != (1 + retry_limit))
run_number += 1
@info "Retrying $(repr(testitem.name)) on $worker. Run=$run_number."
else
testitem = next_testitem(testitems, testitem.id[])
run_number = 1
end
finally
close(timer)
end
catch e
@debugv 2 "Error" exception=e
if !(e isa WorkerTerminatedException || e isa TimeoutException || e isa TestSetFailure)
# we don't expect any other kind of error, so rethrow, which will propagate
# back up to the main coordinator task and throw to the user
rethrow()
end

if !(e isa TestSetFailure)
println(DEFAULT_STDOUT[])
# Explicitly show captured logs or say there weren't any in case we're about
# to terminte the worker
_print_captured_logs(DEFAULT_STDOUT[], testitem, nretries + 1)
end

println(DEFAULT_STDOUT[])
_print_captured_logs(DEFAULT_STDOUT[], testitem, run_number)
# Handle the exception
if e isa TimeoutException
@debugv 1 "Test item $(repr(testitem.name)) timed out. Terminating worker $worker"
terminate!(worker)
wait(worker)
@warn "$worker timed out evaluating test item $(repr(testitem.name)) afer $timeout seconds. \
Recording test error."
record_timeout!(testitem, run_number, timeout)
elseif e isa WorkerTerminatedException
@warn "$worker died evaluating test item $(repr(testitem.name)). \
Recording test error."
record_worker_terminated!(testitem, run_number)
else
# We don't expect any other kind of error, so rethrow, which will propagate
# back up to the main coordinator task and throw to the user
rethrow()
end
if nretries == retry_limit
if e isa TimeoutException
@warn "$worker timed out evaluating test item $(repr(testitem.name)) afer $timeout seconds. \
Recording test error, and starting a new worker."
record_test_error!(testitem, nretries + 1)
elseif e isa WorkerTerminatedException
@warn "$worker died evaluating test item $(repr(testitem.name)). \
Recording test error, and starting a new worker."
record_test_error!(testitem, nretries + 1)
else
@assert e isa TestSetFailure
# We already printed the error and recorded the testset.
end
# Handle retries
if run_number == (1 + retry_limit)
testitem = next_testitem(testitems, testitem.id[])
nretries = 0
run_number = 1
else
nretries += 1
if e isa TimeoutException
@warn "$worker timed out evaluating test item $(repr(testitem.name)) afer $timeout seconds. \
Starting a new worker and retrying. Retry=$nretries."
elseif e isa WorkerTerminatedException
@warn "$worker died evaluating test item $(repr(testitem.name)). \
Starting a new worker and retrying. Retry=$nretries."
else
@assert e isa TestSetFailure
@warn "Test item $(repr(testitem.name)) failed. Retrying on $worker. Retry=$nretries."
end
run_number += 1
@info "Retrying $(repr(testitem.name)) on a new worker. Run=$run_number."
end
if (e isa TimeoutException || e isa WorkerTerminatedException)
# The worker was terminated, so replace it unless there are no more testitems to run
if testitem !== nothing
worker = start_worker(proj_name, nworker_threads, worker_init_expr, ntestitems)
end
# now we loop back around to reschedule the testitem
# Now loop back around to reschedule the testitem
continue
end
end
Expand Down
30 changes: 24 additions & 6 deletions test/integrationtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -604,12 +604,14 @@ end
# Test the error is as expected
err = only(non_passes(results))
@test err.test_type == :nontest_error
@test err.value == string(ErrorException("test item \"Abort\" didn't succeed after 1 tries"))
@test err.value == string(ErrorException("Worker aborted evaluating test item \"Abort\" (run=1)"))
end

@testset "test retrying failing testitem" begin
file = joinpath(TEST_FILES_DIR, "_retry_tests.jl")
results = encased_testset(()->runtests(file; nworkers=1, retries=2))
# must run with `testitem_timeout < 20` for test to timeout as expected.
# and must run with `nworkers > 0` for retries to be supported.
results = encased_testset(()->runtests(file; nworkers=1, retries=2, testitem_timeout=3))
# Test we _ran_ each test-item the expected number of times
read_count(x) = parse(Int, read(x, String))
# Passes on second attempt, so only need to retry once.
Expand All @@ -622,11 +624,27 @@ end
# Doesn't pass ever, so need all retries. This testitem set `retries=1` which is less
# than the `retries=2` that `runtests` set, so we should retry 2 times.
@test read_count(joinpath(tempdir(), "num_runs_4")) == 3
# This directory must match what's set in `_retry_tests`
tmpdir = joinpath("/tmp", "JL_RETESTITEMS_TEST_TMPDIR")
# Times out always, so should retry as many times as allowed.
# Since it will be a new worker for each retry, we write one file for each.
@test count(contains("num_runs_5"), readdir(tmpdir)) == 3
# Times out on first run, then passes on second attempt.
@test count(contains("num_runs_6"), readdir(tmpdir)) == 2

# Test we _report_ the expected number of test-items
@test n_tests(results) == 4
# The first two testitems pass on retry.
@test n_passed(results) == 2
@test n_tests(results) == 6
# Testitems 1, 2, and 6 pass on retry.
@test n_passed(results) == 3

# Clear out any files created by this testset
for dir in (tempdir(), tmpdir)
foreach(readdir(dir; join=true)) do tmp
# `force` in case it gets cleaned up between `readder` and `rm`
nickrobinson251 marked this conversation as resolved.
Show resolved Hide resolved
contains(tmp, "num_runs") && rm(tmp; force=true)
end
end
rm(tmpdir; force=true)
end

@testset "testitem timeout" begin
Expand All @@ -641,7 +659,7 @@ end
# Test the error is as expected
err = only(non_passes(results))
@test err.test_type == :nontest_error
@test err.value == string(ErrorException("test item \"Test item takes 60 seconds\" didn't succeed after 1 tries"))
@test err.value == string(ErrorException("Timed out after 4s evaluating test item \"Test item takes 60 seconds\" (run=1)"))
end

@testset "Error outside `@testitem`" begin
Expand Down
18 changes: 16 additions & 2 deletions test/junit_xml.jl
Original file line number Diff line number Diff line change
Expand Up @@ -100,11 +100,25 @@ end
mktempdir() do dir
withenv("RETESTITEMS_REPORT_LOCATION" => dir, "RETESTITEMS_NWORKERS" => nworkers) do
try # Ignore the fact that the `_retry_tests.jl` testset has failures/errors.
run(`$(Base.julia_cmd()) --project -e 'using ReTestItems; runtests("testfiles/_retry_tests.jl"; report=true, retries=2)'`)
run(`$(Base.julia_cmd()) --project -e 'using ReTestItems; runtests("testfiles/_retry_tests.jl"; testitem_timeout=5, report=true, retries=2)'`)
catch
finally
# running `_retry_tests` creates files which we need to clean up before
# we run `_retry_tests` again, because these files store state that is
# used to control the behaviour of the tests.
foreach(readdir(joinpath("/tmp", "JL_RETESTITEMS_TEST_TMPDIR"); join=true)) do tmp
rm(tmp; force=true)
end
end
report = only(filter(endswith("xml"), readdir(dir, join=true)))
test_reference(joinpath(REF_DIR, "retry_tests_report.xml"), report)
# timeouts are not supported when running without workers, so
# with 0 workers the "retry tests" that are expected to timeout will pass.
ref = if nworkers == 0
joinpath(REF_DIR, "retry_tests_report_0worker.xml")
else
joinpath(REF_DIR, "retry_tests_report_1worker.xml")
end
test_reference(ref, report)
end
end
end
Expand Down
Loading