Skip to content

Commit

Permalink
Add poll suite log function to job.sh
Browse files Browse the repository at this point in the history
This is a temporary measure to address the repeated pattern in various test
suites where we have to repeatedly poll the suite log for events.
  • Loading branch information
matthewrmshin committed Sep 4, 2019
1 parent 2b21fd2 commit 58fe9aa
Show file tree
Hide file tree
Showing 44 changed files with 99 additions and 254 deletions.
13 changes: 13 additions & 0 deletions cylc/flow/etc/job.sh
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,19 @@ cylc__job__main() {
exit 0
}

###############################################################################
# Poll existence of pattern from suite log for up to a minute.
cylc__job__poll_grep_suite_log() {
local TIMEOUT="$(($(date +%s) + 60))" # wait 1 minute
while ! grep -s "$@" "${CYLC_SUITE_LOG_DIR}/log"; do
sleep 1
if (($(date +%s) > TIMEOUT)); then
echo "ERROR: poll timed out: grep -s $* ${CYLC_SUITE_LOG_DIR}/log" >&2
exit 1
fi
done
}

###############################################################################
# Run a function in the task job instance file, if possible.
# Arguments:
Expand Down
11 changes: 4 additions & 7 deletions flakytests/cylc-poll/03-poll-all/suite.rc
Original file line number Diff line number Diff line change
Expand Up @@ -41,13 +41,10 @@ exit 0
script = """
cylc poll "${CYLC_SUITE_NAME}"
pat1="[submit_hold.${CYLC_TASK_CYCLE_POINT}] -ready => submitted"
pat2="[run_kill.${CYLC_TASK_CYCLE_POINT}] -suiciding"
log="${CYLC_SUITE_LOG_DIR}/log"
while (($(grep -c -F -e "${pat1}" -e "${pat2}" "${log}") != 2))
do
sleep 1
done
cylc__job__poll_grep_suite_log -F \
"[submit_hold.${CYLC_TASK_CYCLE_POINT}] -ready => submitted"
cylc__job__poll_grep_suite_log -F \
"[run_kill.${CYLC_TASK_CYCLE_POINT}] -suiciding"
st_file="${CYLC_SUITE_RUN_DIR}/log/job/${CYLC_TASK_CYCLE_POINT}/submit_hold/NN/job.status"
pkill -g "$(awk -F= '$1 == "CYLC_BATCH_SYS_JOB_ID" {print $2}' "${st_file}")"
Expand Down
9 changes: 2 additions & 7 deletions flakytests/cylc-reset/02-output-1/suite.rc
Original file line number Diff line number Diff line change
Expand Up @@ -20,15 +20,10 @@ t1:hello & t2:greet => t4
greet = Greet World
[[t3]]
script = """
LOG="${CYLC_SUITE_LOG_DIR}/log"
cylc reset --debug --output=hello "${CYLC_SUITE_NAME}" 't1.1'
while ! grep -qF '[t1.1] -reset output to complete: hello' "${LOG}"; do
sleep 1 # make sure reset completes
done
cylc__job__poll_grep_suite_log -F '[t1.1] -reset output to complete: hello'
cylc reset --debug --output='Greet World' "${CYLC_SUITE_NAME}" 't2.1'
while ! grep -qF '[t2.1] -reset output to complete: Greet World' "${LOG}"; do
sleep 1 # make sure reset completes
done
cylc__job__poll_grep_suite_log -F '[t2.1] -reset output to complete: Greet World'
"""
[[[job]]]
execution time limit = PT30S
Expand Down
9 changes: 2 additions & 7 deletions flakytests/cylc-reset/03-output-2/suite.rc
Original file line number Diff line number Diff line change
Expand Up @@ -15,14 +15,9 @@
greet = Greet World
[[t2]]
script = """
LOG="${CYLC_SUITE_LOG_DIR}/log"
cylc reset --output='!hello' --output='!Greet World' "${CYLC_SUITE_NAME}" 't1.1'
while ! grep -qF -e '[t1.1] -reset output to incomplete: hello' "${LOG}"; do
sleep 1 # make sure reset completes
done
while ! grep -qF '[t1.1] -reset output to incomplete: Greet World' "${LOG}"; do
sleep 1 # make sure reset completes
done
cylc__job__poll_grep_suite_log -F '[t1.1] -reset output to incomplete: hello'
cylc__job__poll_grep_suite_log -F '[t1.1] -reset output to incomplete: Greet World'
cylc show "${CYLC_SUITE_NAME}" 't1.1' >"${CYLC_SUITE_RUN_DIR}/cylc-show.out"
"""
[[[job]]]
Expand Down
7 changes: 1 addition & 6 deletions flakytests/cylc-take-checkpoints/00-basic/suite.rc
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,7 @@ if [[ "${CYLC_TASK_CYCLE_POINT}" == '2017' ]]; then
wait "${CYLC_TASK_MESSAGE_STARTED_PID}" 2>/dev/null || true
sleep 2 # state of current task should be recorded after 2 seconds
cylc checkpoint "${CYLC_SUITE_NAME}" 'snappy'
LOG="${CYLC_SUITE_LOG_DIR}/log"
while ! grep -qF "INFO - Command succeeded: take_checkpoints(snappy)" \
"${LOG}"
do
sleep 1 # make sure take_checkpoints command completes
done
cylc__job__poll_grep_suite_log -F "Command succeeded: take_checkpoints(snappy)"
sleep 2 # checkpoint should be recorded after 2 seconds
fi
"""
Expand Down
7 changes: 3 additions & 4 deletions flakytests/hold-release/13-ready-restart.t
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
# Test restart with a "ready" task. See GitHub #958 (update: and #2610).
. "$(dirname "$0")/test_header"

set_test_number 4
set_test_number 3
install_suite "${TEST_NAME_BASE}" "${TEST_NAME_BASE}"

run_ok "${TEST_NAME_BASE}-validate" cylc validate "${SUITE_NAME}"
Expand All @@ -29,8 +29,7 @@ export PATH="${TEST_DIR}/${SUITE_NAME}/bin:$PATH"
LOG="$(find "${CYLC_SUITE_LOG_DIR}/" -type f -name 'log.*' | sort | head -n 1)"
run_ok "${TEST_NAME_BASE}-restart" timeout 1m my-file-poll "${LOG}"
# foo-1 should run when the suite is released
run_ok "${TEST_NAME_BASE}-foo-1" \
timeout 1m my-log-grepper 'foo-1\.1.*succeeded'
timeout 1m my-log-grepper 'Suite shutting down'
poll_grep_suite_log 'foo-1\.1.*succeeded'
poll_suite_stopped
purge_suite "${SUITE_NAME}"
exit
6 changes: 0 additions & 6 deletions flakytests/hold-release/13-ready-restart/bin/my-log-grepper

This file was deleted.

3 changes: 2 additions & 1 deletion flakytests/hold-release/13-ready-restart/suite.rc
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,8 @@ bar
# Restart the suite on hold.
cylc restart --hold "${CYLC_SUITE_NAME}"
timeout 1m my-log-grepper 'Held on start-up (no tasks will be submitted)'
cylc__job__poll_grep_suite_log \
'Held on start-up (no tasks will be submitted)'
# Modify the job submission command template for "foo-1".
cylc broadcast "${CYLC_SUITE_NAME}" \
Expand Down
15 changes: 5 additions & 10 deletions flakytests/hold-release/14-hold-kill/suite.rc
Original file line number Diff line number Diff line change
Expand Up @@ -9,16 +9,11 @@
script = """
echo '# killing "sleeper"'
cylc kill "${CYLC_SUITE_NAME}" "sleeper.1"
LOG="${CYLC_SUITE_LOG_DIR}/log"
while ! grep -q -F '[sleeper.1] -running => running (held)' "${LOG}"
do
sleep 1
done
while ! grep -q -F '[sleeper.1] -running (held) => retrying (held)' "${LOG}"
do
sleep 1
done
sleep 10 # sleep should still be held after 10 seconds
cylc__job__poll_grep_suite_log -F \
'[sleeper.1] -running => running (held)'
cylc__job__poll_grep_suite_log -F \
-F '[sleeper.1] -running (held) => retrying (held)'
sleep 10 # sleep, should still be held after 10 seconds
cylc dump -s -t "${CYLC_SUITE_NAME}" >'cylc-dump.out'
diff -u 'cylc-dump.out' - <<'__OUT__'
1, killer, running, spawned, unheld
Expand Down
7 changes: 2 additions & 5 deletions flakytests/hold-release/15-hold-after/suite.rc
Original file line number Diff line number Diff line change
Expand Up @@ -22,11 +22,8 @@
script = cylc hold --after '20140101T12' "${CYLC_SUITE_NAME}"
[[stopper]]
script = """
while ! grep -qF '[bar.20140102T0000Z] -waiting => waiting (held)' \
"${CYLC_SUITE_LOG_DIR}/log"
do
sleep 1
done
cylc__job__poll_grep_suite_log -F \
'[bar.20140102T0000Z] -waiting => waiting (held)'
cylc stop "${CYLC_SUITE_NAME}"
"""
[[[job]]]
Expand Down
24 changes: 7 additions & 17 deletions flakytests/hold-release/20-reset-waiting-output/suite.rc
Original file line number Diff line number Diff line change
Expand Up @@ -11,24 +11,14 @@
[[t1]]
script = """
cylc hold "${CYLC_SUITE_NAME}"
LOG="${CYLC_SUITE_LOG_DIR}/log"
while ! grep -qF 'INFO - Command succeeded: hold_suite()' "${LOG}"; do
sleep 1 # make sure hold completes
done
cylc__job__poll_grep_suite_log -F \
'INFO - Command succeeded: hold_suite()'
cylc reset --state='succeeded' "${CYLC_SUITE_NAME}" 't2.1'
while ! grep -q \
"INFO - Command succeeded: reset_task_states(\\['t2.1'\\],.*state=succeeded" \
"${LOG}"
do
sleep 1 # make sure reset succeeded completes
done
cylc__job__poll_grep_suite_log \
"INFO - Command succeeded: reset_task_states(\\['t2.1'\\],.*state=succeeded"
cylc reset --state='waiting' "${CYLC_SUITE_NAME}" 't2.1'
while ! grep -q \
"INFO - Command succeeded: reset_task_states(\\['t2.1'\\],.*state=waiting" \
"${LOG}"
do
sleep 1 # make sure reset waiting completes
done
cylc__job__poll_grep_suite_log \
"INFO - Command succeeded: reset_task_states(\\['t2.1'\\],.*state=waiting"
cylc release "${CYLC_SUITE_NAME}"
"""
[[[job]]]
Expand All @@ -37,7 +27,7 @@
failed handler = cylc release '%(suite)s'

[[t2]]
script = sleep 10; touch "${CYLC_SUITE_RUN_DIR}/t2.done"
script = touch "${CYLC_SUITE_RUN_DIR}/t2.done"
[[t3]]
# This will fail if t3.1 starts together with t2.1
script = test -e "${CYLC_SUITE_RUN_DIR}/t2.done"
17 changes: 6 additions & 11 deletions flakytests/restart/19-checkpoint/suite.rc
Original file line number Diff line number Diff line change
Expand Up @@ -17,22 +17,17 @@
script = """
wait "${CYLC_TASK_MESSAGE_STARTED_PID}" 2>/dev/null || true
if [[ "${CYLC_TASK_CYCLE_POINT}" == '2017' ]]; then
LOG="${CYLC_SUITE_LOG_DIR}/log"
while ! grep -qF '[t1.2017] status=submitted: (received)started' "${LOG}"; do
sleep 1 # make sure started message is recorded in suite
done
sleep 2
cylc__job__poll_grep_suite_log -F \
'[t1.2017] status=submitted: (received)started'
sleep 2 # make sure status change recorded in DB
cylc broadcast "${CYLC_SUITE_NAME}" -p '2017' -n 't1' --set='script=true'
cylc hold "${CYLC_SUITE_NAME}"
while ! grep -qF 'INFO - Command succeeded: hold_suite()' "${LOG}"; do
sleep 1 # make sure hold completes
done
cylc__job__poll_grep_suite_log -F \
'INFO - Command succeeded: hold_suite()'
sleep 2
(cd "${CYLC_SUITE_DEF_PATH}"; cp -p 'suite2.rc' 'suite.rc')
cylc reload "${CYLC_SUITE_NAME}"
while ! grep -q 'Reload completed' "${LOG}"; do
sleep 1 # make sure reload completes
done
cylc__job__poll_grep_suite_log 'Reload completed'
cylc insert "${CYLC_SUITE_NAME}" 't2.2017'
while ! cylc show "${CYLC_SUITE_NAME}" 't2.2017' 1>'/dev/null' 2>&1; do
sleep 1 # make sure insert completes
Expand Down
2 changes: 1 addition & 1 deletion tests/authentication/08-shared-fs.t
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ SUITE_PID="$!"
SUITE_LOG="${SUITE_RUN_DIR}/log/suite/log"
# Note: double poll existence of suite log on suite host and then localhost to
# avoid any issues with unstable mounting of the shared file system.
poll ssh -oBatchMode=yes -n '${CYLC_TEST_HOST}' test -e "${SUITE_LOG}"
poll ssh -oBatchMode=yes -n "${CYLC_TEST_HOST}" test -e "${SUITE_LOG}"
poll_grep_suite_log -F '[t1.19700101T0000Z] -submitted => running'
poll_grep_suite_log -F '[t1.19700101T0000Z] -running => failed'

Expand Down
6 changes: 1 addition & 5 deletions tests/broadcast/07-timeout/suite.rc
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,7 @@
"""
[[timeout]]
script = """
while ! grep -qF "[${CYLC_TASK_ID}] -execution timeout after PT1S" \
"${CYLC_SUITE_LOG_DIR}/log"
do
sleep 1
done
cylc__job__poll_grep_suite_log -F "[${CYLC_TASK_ID}] -execution timeout after PT1S"
"""
[[[events]]]
execution timeout = PT1M
6 changes: 1 addition & 5 deletions tests/cylc-cat-log/05-remote-tail.t
Original file line number Diff line number Diff line change
Expand Up @@ -42,11 +42,7 @@ $SCP "${PWD}/bin/my-tailer.sh" \
# Run detached.
suite_run_ok "${TEST_NAME_BASE}-run" cylc run "${SUITE_NAME}"
#-------------------------------------------------------------------------------
while ! grep -q -F '[foo.1] status=submitted: (received)started' \
"${SUITE_RUN_DIR}/log/suite/log"
do
sleep 1
done
poll_grep_suite_log -F '[foo.1] status=submitted: (received)started'
# cylc cat-log -m 't' tail-follows a file, so needs to be killed.
# Send interrupt signal to tail command after 15 seconds.
TEST_NAME="${TEST_NAME_BASE}-cat-log"
Expand Down
19 changes: 5 additions & 14 deletions tests/cylc-poll/05-poll-multi-messages/suite.rc
Original file line number Diff line number Diff line change
Expand Up @@ -18,13 +18,8 @@ wait
echo "CYLC_MESSAGE=$(date +%FT%H:%M:%SZ)|INFO|hello1"
echo "CYLC_MESSAGE=$(date +%FT%H:%M:%SZ)|INFO|hello2"
} >>"${CYLC_TASK_LOG_ROOT}.status"
LOG="${CYLC_SUITE_LOG_DIR}/log"
while ! grep -qF '[speaker1.1] status=running: (polled)hello1' "${LOG}"; do
sleep 1
done
while ! grep -qF '[speaker1.1] status=running: (polled)hello2' "${LOG}"; do
sleep 1
done
cylc__job__poll_grep_suite_log -F '[speaker1.1] status=running: (polled)hello1'
cylc__job__poll_grep_suite_log -F '[speaker1.1] status=running: (polled)hello2'
"""
[[[outputs]]]
hello1 = "hello1"
Expand All @@ -35,13 +30,9 @@ done
wait
# Simulate "cylc task message", messages written to status file but failed to
# get sent back to the suite
{
echo "CYLC_MESSAGE=$(date +%FT%H:%M:%SZ)|INFO|greet"
} >>"${CYLC_TASK_LOG_ROOT}.status"
LOG="${CYLC_SUITE_LOG_DIR}/log"
while ! grep -qF '[speaker2.1] status=running: (polled)greet' "${LOG}"; do
sleep 1
done
echo "CYLC_MESSAGE=$(date +%FT%H:%M:%SZ)|INFO|greet" \
>>"${CYLC_TASK_LOG_ROOT}.status"
cylc__job__poll_grep_suite_log -F '[speaker2.1] status=running: (polled)greet'
"""
[[[outputs]]]
greet = "greet"
Expand Down
4 changes: 1 addition & 3 deletions tests/database/04-lock-recover/suite.rc
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,5 @@ TRUES:succeed-all => done
inherit = TRUES
[[done]]
script = """
while ! grep -F -q 'db: recovered' "${CYLC_SUITE_LOG_DIR}/log"; do
sleep 1
done
cylc__job__poll_grep_suite_log -F 'db: recovered'
"""
6 changes: 1 addition & 5 deletions tests/hold-release/00-suite/suite.rc
Original file line number Diff line number Diff line change
Expand Up @@ -22,11 +22,7 @@
script = """
wait
cylc hold "${CYLC_SUITE_NAME}"
while ! grep -qF 'INFO - Command succeeded: hold_suite()' \
"${CYLC_SUITE_LOG_DIR}/log"
do
sleep 1
done
cylc__job__poll_grep_suite_log -F 'INFO - Command succeeded: hold_suite()'
cylc release "${CYLC_SUITE_NAME}"
"""
[[foo,bar]]
Expand Down
6 changes: 1 addition & 5 deletions tests/hold-release/02-hold-on-spawn/suite.rc
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,7 @@
script = """
wait
cylc hold "${CYLC_SUITE_NAME}"
while ! grep -qF 'INFO - Command succeeded: hold_suite()' \
"${CYLC_SUITE_LOG_DIR}/log"
do
sleep 1
done
cylc__job__poll_grep_suite_log -F 'INFO - Command succeeded: hold_suite()'
# Release all tasks in the first cycle point.
cylc release "${CYLC_SUITE_NAME}" '*.20141009T0000Z'
"""
Expand Down
6 changes: 0 additions & 6 deletions tests/hold-release/11-retrying/bin/my-log-grepper

This file was deleted.

9 changes: 6 additions & 3 deletions tests/hold-release/11-retrying/suite.rc
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,14 @@ t-retry-able => t-analyse
execution retry delays = PT15S, 2*PT1S
[[t-hold-release]]
script = """
timeout 30s my-log-grepper '[t-retry-able.1] -job(01) failed, retrying in PT15S'
cylc__job__poll_grep_suite_log -F \
'[t-retry-able.1] -job(01) failed, retrying in PT15S'
cylc hold "${CYLC_SUITE_NAME}" 't-retry-able.1'
timeout 30s my-log-grepper '[t-retry-able.1] -retrying => retrying (held)'
cylc__job__poll_grep_suite_log -F \
'[t-retry-able.1] -retrying => retrying (held)'
cylc release "${CYLC_SUITE_NAME}" 't-retry-able.1'
timeout 30s my-log-grepper '[t-retry-able.1] -retrying (held) => retrying'
cylc__job__poll_grep_suite_log -F \
'[t-retry-able.1] -retrying (held) => retrying'
"""
[[t-analyse]]
script = """
Expand Down
6 changes: 0 additions & 6 deletions tests/hold-release/12-hold-then-retry/bin/my-log-grepper

This file was deleted.

12 changes: 7 additions & 5 deletions tests/hold-release/12-hold-then-retry/suite.rc
Original file line number Diff line number Diff line change
Expand Up @@ -19,18 +19,20 @@ t-submit-retry-able:submit => t-hold
script = """
# Hold the suite
cylc hold "${CYLC_SUITE_NAME}"
timeout 15 my-log-grepper 'Command succeeded: hold_suite'
cylc__job__poll_grep_suite_log -F 'Command succeeded: hold_suite'
# Poll t-submit-retry-able, should return submit-fail
cylc poll "${CYLC_SUITE_NAME}" 't-submit-retry-able'
# Allow t-retry-able to continue
rm -f "${CYLC_SUITE_RUN_DIR}/file"
timeout 15 my-log-grepper '[t-retry-able.1] -running (held) => retrying (held)'
timeout 15 my-log-grepper \
cylc__job__poll_grep_suite_log -F \
'[t-retry-able.1] -running (held) => retrying (held)'
cylc__job__poll_grep_suite_log -F \
'[t-submit-retry-able.1] -submitted (held) => submit-retrying (held)'
# Release the suite
cylc release "${CYLC_SUITE_NAME}"
timeout 15 my-log-grepper '[t-retry-able.1] -retrying (held) => retrying'
timeout 15 my-log-grepper \
cylc__job__poll_grep_suite_log -F \
'[t-retry-able.1] -retrying (held) => retrying'
cylc__job__poll_grep_suite_log -F \
'[t-submit-retry-able.1] -submit-retrying (held) => submit-retrying'
"""
[[t-retry-able]]
Expand Down
Loading

0 comments on commit 58fe9aa

Please sign in to comment.