Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Wait for extension goal state processing before checking for lag in log #2873

Merged
merged 28 commits into from
Jul 11, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
da72c37
Update version to dummy 1.0.0.0'
maddieford Nov 8, 2022
59dbd22
Revert version change
maddieford Nov 8, 2022
633a826
Merge remote-tracking branch 'upstream/develop' into develop
maddieford Nov 21, 2022
14a743f
Merge remote-tracking branch 'upstream/develop' into develop
maddieford Dec 8, 2022
54ea0f3
Merge remote-tracking branch 'upstream/develop' into develop
maddieford Jan 10, 2023
e79c4c5
Merge remote-tracking branch 'upstream/develop' into develop
maddieford Feb 8, 2023
498b612
Merge remote-tracking branch 'upstream/develop' into develop
maddieford Feb 14, 2023
1e269f4
Merge remote-tracking branch 'upstream/develop' into develop
maddieford Mar 13, 2023
7b49e76
Merge remote-tracking branch 'upstream/develop' into develop
maddieford Mar 24, 2023
0a426cc
Merge remote-tracking branch 'upstream/develop' into develop
maddieford Apr 6, 2023
17fbf6a
Merge remote-tracking branch 'upstream/develop' into develop
maddieford Apr 7, 2023
995cbb9
Merge remote-tracking branch 'upstream/develop' into develop
maddieford Apr 13, 2023
eaadc83
Merge remote-tracking branch 'upstream/develop' into develop
maddieford Apr 24, 2023
fb03e07
Merge remote-tracking branch 'upstream/develop' into develop
maddieford Apr 27, 2023
6a8e0d6
Merge remote-tracking branch 'upstream/develop' into develop
maddieford May 19, 2023
b4951c8
Merge branch 'develop' of github.com:Azure/WALinuxAgent into develop
maddieford Jun 6, 2023
c6d9300
Merge branch 'develop' of github.com:maddieford/WALinuxAgent into dev…
maddieford Jun 23, 2023
f650fe4
Merge remote-tracking branch 'upstream/develop' into develop
maddieford Jul 10, 2023
a10bdfa
Merge branch 'develop' of github.com:maddieford/WALinuxAgent into dev…
maddieford Jul 10, 2023
7e959d4
Add sleep time to allow goal state processing to complete before lag …
maddieford Jul 10, 2023
fd37fc8
Add retry logic to gs processing lag check
maddieford Jul 10, 2023
35a7b3f
Clean up retry logic
maddieford Jul 10, 2023
8aef4c0
Add back empty line
maddieford Jul 10, 2023
3a21a12
Fix timestamp parsing issue
maddieford Jul 10, 2023
603c90b
Fix timestamp parsing issue
maddieford Jul 10, 2023
c124cc8
Fix timestamp parsing issue
maddieford Jul 11, 2023
a4f100c
Do 3 retries{
maddieford Jul 11, 2023
56ba3df
Merge branch 'develop' into ext_workflow_retry
maddieford Jul 11, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions tests_e2e/tests/lib/agent_log.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,17 @@ def from_dictionary(dictionary: Dict[str, str]):

@property
def timestamp(self) -> datetime:
# Extension logs may follow different timestamp formats
# 2023/07/10 20:50:13.459260
ext_timestamp_regex_1 = r"\d{4}/\d{2}/\d{2} \d{2}:\d{2}:\d{2}[.\d]+"
# 2023/07/10 20:50:13
ext_timestamp_regex_2 = r"\d{4}/\d{2}/\d{2} \d{2}:\d{2}:\d{2}"

if re.match(ext_timestamp_regex_1, self.when):
return datetime.strptime(self.when, u'%Y/%m/%d %H:%M:%S.%f')
elif re.match(ext_timestamp_regex_2, self.when):
return datetime.strptime(self.when, u'%Y/%m/%d %H:%M:%S')
# Logs from agent follow this format: 2023-07-10T20:50:13.038599Z
return datetime.strptime(self.when, u'%Y-%m-%dT%H:%M:%S.%fZ')


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,73 +21,94 @@
from datetime import timedelta
import re
import sys
import time

from pathlib import Path
from tests_e2e.tests.lib.agent_log import AgentLog


def main():
success = True

agent_started_time = []
agent_msg = []
time_diff_max_secs = 15

# Example: Agent WALinuxAgent-2.2.47.2 is running as the goal state agent
agent_started_regex = r"Azure Linux Agent \(Goal State Agent version [0-9.]+\)"
gs_completed_regex = r"ProcessExtensionsGoalState completed\s\[(?P<id>[a-z_\d]{13,14})\s(?P<duration>\d+)\sms\]"

verified_atleast_one_log_line = False
verified_atleast_one_agent_started_log_line = False
verified_atleast_one_gs_complete_log_line = False

agent_log = AgentLog(Path('/var/log/waagent.log'))

try:
for agent_record in agent_log.read():
verified_atleast_one_log_line = True

agent_started = re.match(agent_started_regex, agent_record.message)
verified_atleast_one_agent_started_log_line = verified_atleast_one_agent_started_log_line or agent_started
if agent_started:
agent_started_time.append(agent_record.timestamp)
agent_msg.append(agent_record.text)

gs_complete = re.match(gs_completed_regex, agent_record.message)
verified_atleast_one_gs_complete_log_line = verified_atleast_one_gs_complete_log_line or gs_complete
if agent_started_time and gs_complete:
duration = gs_complete.group('duration')
diff = agent_record.timestamp - agent_started_time.pop()
# Reduce the duration it took to complete the Goalstate, essentially we should only care about how long
# the agent took after start/restart to start processing GS
diff -= timedelta(milliseconds=int(duration))
agent_msg_line = agent_msg.pop()
if diff.seconds > time_diff_max_secs:
success = False
print("Found delay between agent start and GoalState Processing > {0}secs: "
"Messages: \n {1} {2}".format(time_diff_max_secs, agent_msg_line, agent_record.text))

except IOError as e:
print("Unable to validate no lag time: {0}".format(str(e)))

if not verified_atleast_one_log_line:
success = False
print("Didn't parse a single log line, ensure the log_parser is working fine and verify log regex")

if not verified_atleast_one_agent_started_log_line:
success = False
print("Didn't parse a single agent started log line, ensure the Regex is working fine: {0}"
.format(agent_started_regex))

if not verified_atleast_one_gs_complete_log_line:
success = False
print("Didn't parse a single GS completed log line, ensure the Regex is working fine: {0}"
.format(gs_completed_regex))

if agent_started_time or agent_msg:
success = False
print("Mismatch between number of agent start messages and number of GoalState Processing messages\n "
"Agent Start Messages: \n {0}".format('\n'.join(agent_msg)))
needs_retry = True
retry = 3

while retry >= 0 and needs_retry:
success = True
needs_retry = False

agent_started_time = []
agent_msg = []
time_diff_max_secs = 15
last_agent_log_timestamp = None

# Example: Agent WALinuxAgent-2.2.47.2 is running as the goal state agent
agent_started_regex = r"Azure Linux Agent \(Goal State Agent version [0-9.]+\)"
gs_completed_regex = r"ProcessExtensionsGoalState completed\s\[(?P<id>[a-z_\d]{13,14})\s(?P<duration>\d+)\sms\]"

verified_atleast_one_log_line = False
verified_atleast_one_agent_started_log_line = False
verified_atleast_one_gs_complete_log_line = False

agent_log = AgentLog(Path('/var/log/waagent.log'))

try:
for agent_record in agent_log.read():
last_agent_log_timestamp = agent_record.timestamp
verified_atleast_one_log_line = True

agent_started = re.match(agent_started_regex, agent_record.message)
verified_atleast_one_agent_started_log_line = verified_atleast_one_agent_started_log_line or agent_started
if agent_started:
agent_started_time.append(agent_record.timestamp)
agent_msg.append(agent_record.text)

gs_complete = re.match(gs_completed_regex, agent_record.message)
verified_atleast_one_gs_complete_log_line = verified_atleast_one_gs_complete_log_line or gs_complete
if agent_started_time and gs_complete:
duration = gs_complete.group('duration')
diff = agent_record.timestamp - agent_started_time.pop()
# Reduce the duration it took to complete the Goalstate, essentially we should only care about how long
# the agent took after start/restart to start processing GS
diff -= timedelta(milliseconds=int(duration))
agent_msg_line = agent_msg.pop()
if diff.seconds > time_diff_max_secs:
success = False
print("Found delay between agent start and GoalState Processing > {0}secs: "
"Messages: \n {1} {2}".format(time_diff_max_secs, agent_msg_line, agent_record.text))

except IOError as e:
print("Unable to validate no lag time: {0}".format(str(e)))

if not verified_atleast_one_log_line:
success = False
print("Didn't parse a single log line, ensure the log_parser is working fine and verify log regex")

if not verified_atleast_one_agent_started_log_line:
success = False
print("Didn't parse a single agent started log line, ensure the Regex is working fine: {0}"
.format(agent_started_regex))

if not verified_atleast_one_gs_complete_log_line:
success = False
print("Didn't parse a single GS completed log line, ensure the Regex is working fine: {0}"
.format(gs_completed_regex))

if agent_started_time or agent_msg:
# If agent_started_time or agent_msg is not empty, there is a mismatch in the number of agent start messages
# and GoalState Processing messages
# If another check hasn't already failed, and the last parsed log is less than 15 seconds after the
# mismatched agent start log, we should retry after sleeping for 5s to give the agent time to finish
# GoalState processing
if success and last_agent_log_timestamp < (agent_started_time[-1] + timedelta(seconds=15)):
needs_retry = True
print("Sleeping for 5 seconds to allow goal state processing to complete...")
time.sleep(5)
else:
success = False
print("Mismatch between number of agent start messages and number of GoalState Processing messages\n "
"Agent Start Messages: \n {0}".format('\n'.join(agent_msg)))

retry -= 1

sys.exit(0 if success else 1)

Expand Down
Loading