Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Check agent log for errors; install test libraries #2787

Merged
merged 6 commits into from
Mar 22, 2023
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
130 changes: 91 additions & 39 deletions tests_e2e/orchestrator/lib/agent_test_suite.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
#
import contextlib
import datetime
import json
import logging
import re
import traceback
Expand All @@ -29,6 +30,7 @@
# E0401: Unable to import 'lisa' (import-error)
# etc
from lisa import ( # pylint: disable=E0401
Environment,
Logger,
Node,
notifier,
Expand All @@ -43,11 +45,13 @@
import makepkg
from azurelinuxagent.common.version import AGENT_VERSION
from tests_e2e.orchestrator.lib.agent_test_loader import TestSuiteInfo
from tests_e2e.tests.lib.agent_log import AgentLog
from tests_e2e.tests.lib.agent_test import TestSkipped
from tests_e2e.tests.lib.agent_test_context import AgentTestContext
from tests_e2e.tests.lib.identifiers import VmIdentifier
from tests_e2e.tests.lib.logging import log as agent_test_logger # Logger used by the tests
from tests_e2e.tests.lib.logging import set_current_thread_log
from tests_e2e.tests.lib.agent_log import AgentLogRecord
from tests_e2e.tests.lib.shell import run_command
from tests_e2e.tests.lib.ssh_client import SshClient

Expand Down Expand Up @@ -256,7 +260,7 @@ def _clean_up(self) -> None:

def _setup_node(self) -> None:
"""
Prepares the remote node for executing the test suite.
Prepares the remote node for executing the test suite (installs tools and the test agent, etc)
"""
self._log.info("")
self._log.info("************************************** [Node Setup] **************************************")
Expand All @@ -265,23 +269,27 @@ def _setup_node(self) -> None:
self._log.info("Resource Group: %s", self.context.vm.resource_group)
self._log.info("")

self._install_tools_on_node()
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I merged _install_agent_on_node and _install_tools_on_node into _setup_node and added code to install the test libraries and agent

self.context.ssh_client.run_command("mkdir -p ~/bin/tests_e2e/tests; touch ~/bin/agent-env")

if self.context.is_vhd:
self._log.info("Using a VHD; will not install the test Agent.")
else:
self._install_agent_on_node()
# Copy the test tools
tools_path = self.context.test_source_directory/"orchestrator"/"scripts"
tools_target_path = Path("~/bin")
self._log.info("Copying %s to %s:%s", Path("~/bin/pypy3.7.tar.bz2"), self.context.node.name, tools_target_path)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Here we copy tools, can we change respectively.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

thanks! fixed

self.context.ssh_client.copy_to_node(tools_path, tools_target_path, recursive=True)

def _install_tools_on_node(self) -> None:
"""
Installs the test tools on the test node
"""
self.context.ssh_client.run_command("mkdir -p ~/bin")
# Copy the test libraries
lib_path = self.context.test_source_directory/"tests"/"lib"
lib_target_path = Path("~/bin/tests_e2e/tests")
self._log.info("Copying %s to %s:%s", lib_path, self.context.node.name, lib_target_path)
self.context.ssh_client.copy_to_node(lib_path, lib_target_path, recursive=True)

tools_path = self.context.test_source_directory/"orchestrator"/"scripts"
self._log.info(f"Copying {tools_path} to the test node")
self.context.ssh_client.copy(tools_path, Path("~/bin"), remote_target=True, recursive=True)
# Copy the test agent
agent_package_path: Path = self._get_agent_package_path()
agent_package_target_path = Path("~/bin")/agent_package_path.name
self._log.info("Copying %s to %s:%s", agent_package_path, self.context.node.name, agent_package_target_path)
self.context.ssh_client.copy_to_node(agent_package_path, agent_package_target_path)

# Copy Pypy
if self.context.ssh_client.get_architecture() == "aarch64":
pypy_path = Path("/tmp/pypy3.7-arm64.tar.bz2")
pypy_download = "https://downloads.python.org/pypy/pypy3.7-v7.3.5-aarch64.tar.bz2"
Expand All @@ -292,26 +300,21 @@ def _install_tools_on_node(self) -> None:
if not pypy_path.exists():
self._log.info(f"Downloading {pypy_download} to {pypy_path}")
run_command(["wget", pypy_download, "-O", pypy_path])
self._log.info(f"Copying {pypy_path} to the test node")
self.context.ssh_client.copy(pypy_path, Path("~/bin/pypy3.7.tar.bz2"), remote_target=True)
pypy_target_path = Path("~/bin/pypy3.7.tar.bz2")
self._log.info(f"Copying %s to %s:%s", pypy_path, self.context.node.name, pypy_target_path)
self.context.ssh_client.copy_to_node(pypy_path, pypy_target_path)

self._log.info(f'Installing tools on the test node\n{self.context.ssh_client.run_command("~/bin/scripts/install-tools")}')
self._log.info(f'Remote commands will use {self.context.ssh_client.run_command("which python3")}')
# Install the tools and libraries
install_command = lambda: self.context.ssh_client.run_command(f"~/bin/scripts/install-tools --agent-package {agent_package_target_path}")
self._log.info('Installing tools on the test node\n%s', install_command())
self._log.info('Remote commands will use %s', self.context.ssh_client.run_command("which python3"))

def _install_agent_on_node(self) -> None:
"""
Installs the given agent package on the test node.
"""
agent_package_path: Path = self._get_agent_package_path()

self._log.info("Installing %s on %s", agent_package_path, self.context.node.name)
agent_package_remote_path = self.context.remote_working_directory/agent_package_path.name
self._log.info("Copying %s to %s:%s", agent_package_path, self.context.node.name, agent_package_remote_path)
self.context.ssh_client.copy(agent_package_path, agent_package_remote_path, remote_target=True)
stdout = self.context.ssh_client.run_command(f"install-agent --package {agent_package_remote_path} --version {AGENT_VERSION}", use_sudo=True)
self._log.info(stdout)

self._log.info("The agent was installed successfully.")
# Install the agent
if self.context.is_vhd:
self._log.info("Using a VHD; will not install the Test Agent.")
else:
install_command = lambda: self.context.ssh_client.run_command(f"install-agent --package {agent_package_target_path} --version {AGENT_VERSION}", use_sudo=True)
self._log.info("Installing the Test Agent on %s\n%s", self.context.node.name, install_command())

def _collect_node_logs(self) -> None:
"""
Expand All @@ -327,23 +330,25 @@ def _collect_node_logs(self) -> None:
remote_path = "/tmp/waagent-logs.tgz"
local_path = self.context.log_path/'{0}.tgz'.format(self.context.image_name)
self._log.info("Copying %s:%s to %s", self.context.node.name, remote_path, local_path)
self.context.ssh_client.copy(remote_path, local_path, remote_source=True)
self.context.ssh_client.copy_from_node(remote_path, local_path)

except: # pylint: disable=bare-except
self._log.exception("Failed to collect logs from the test machine")

@TestCaseMetadata(description="", priority=0)
def agent_test_suite(self, node: Node, variables: Dict[str, Any], log_path: str, log: Logger) -> None:
def agent_test_suite(self, node: Node, environment: Environment, variables: Dict[str, Any], log_path: str, log: Logger) -> None:
"""
Executes each of the AgentTests included in the "c_test_suites" variable (which is generated by the AgentTestSuitesCombinator).
"""
self._set_context(node, variables, log_path, log)

with _set_thread_name(self.context.image_name): # The thread name is added to self._log
# E1133: Non-iterable value self.context.test_suites is used in an iterating context (not-an-iterable)
# (OK to iterate, test_suite is a List)
# Set the thread name to the image; this name is added to self._log
with _set_thread_name(self.context.image_name):
# Log the environment's name and the variables received from the runbook (note that we need to expand the names of the test suites)
self._log.info("LISA Environment: %s", environment.name)
self._log.info(
"Test suite parameters: [test_suites: %s] [skip_setup: %s] [collect_logs: %s]",
[t.name for t in self.context.test_suites], self.context.skip_setup, self.context.collect_logs) # pylint: disable=E1133
"Runbook variables:\n%s",
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

several variables were added and not logged; now i'm logging the entire set

'\n'.join([f"\t{name}: {value if name != 'c_test_suites' else [t.name for t in value] }" for name, value in variables.items()]))

start_time: datetime.datetime = datetime.datetime.now()
test_suite_success = True
Expand All @@ -361,6 +366,8 @@ def agent_test_suite(self, node: Node, variables: Dict[str, Any], log_path: str,
for suite in self.context.test_suites: # pylint: disable=E1133
test_suite_success = self._execute_test_suite(suite) and test_suite_success

test_suite_success = self._check_agent_log() and test_suite_success

finally:
collect = self.context.collect_logs
if collect == CollectLogs.Always or collect == CollectLogs.Failed and not test_suite_success:
Expand Down Expand Up @@ -475,6 +482,51 @@ def _execute_test_suite(self, suite: TestSuiteInfo) -> bool:

return success

def _check_agent_log(self) -> bool:
"""
Checks the agent log for errors; returns true on success (no errors int the log)
"""
start_time: datetime.datetime = datetime.datetime.now()

self._log.info("Checking agent log on the test node")
output = self.context.ssh_client.run_command("check-agent-log.py -j")
errors = json.loads(output, object_hook=AgentLogRecord.from_dictionary)

# Individual tests may have rules to ignore known errors; filter those out
ignore_error_rules = []
for suite in self.context.test_suites:
for test in suite.tests:
ignore_error_rules.extend(test(self.context).get_ignore_error_rules())

if len(ignore_error_rules) > 0:
new = []
for e in errors:
if not AgentLog.matches_ignore_rule(e, ignore_error_rules):
new.append(e)
errors = new

if len(errors) == 0:
# If no errors, we are done; don't create a log or test result.
self._log.info("There are no errors in the agent log")
return True

log_path: Path = self.context.log_path/f"CheckAgentLog-{self.context.image_name}.log"
message = f"Detected {len(errors)} error(s) in the agent log. See {log_path} for a full report."
self._log.info(message)

with set_current_thread_log(log_path):
agent_test_logger.info("Detected %s error(s) in the agent log:\n\n%s", len(errors), '\n'.join(['\t' + e.text for e in errors]))

self._report_test_result(
self.context.image_name,
"CheckAgentLog",
TestStatus.FAILED,
start_time,
message=message + '\n' + '\n'.join([e.text for e in errors[0:3]]),
add_exception_stack_trace=True)

return False

@staticmethod
def _report_test_result(
suite_name: str,
Expand Down
49 changes: 49 additions & 0 deletions tests_e2e/orchestrator/scripts/check-agent-log.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
#!/usr/bin/env python3

# Microsoft Azure Linux Agent
#
# Copyright 2018 Microsoft Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import argparse
import json
import sys

from pathlib import Path
from tests_e2e.tests.lib.agent_log import AgentLog
nagworld9 marked this conversation as resolved.
Show resolved Hide resolved

try:
parser = argparse.ArgumentParser()
parser.add_argument('path', nargs='?', help='Path of the log file', default='/var/log/waagent.log')
parser.add_argument('-j', '--json', action='store_true', help='Produce a JSON report')
parser.set_defaults(json=False)
args = parser.parse_args()

error_list = AgentLog(Path(args.path)).get_errors()

if args.json:
print(json.dumps(error_list, default=lambda o: o.__dict__))
else:
if len(error_list) == 0:
print("No errors were found.")
else:
for e in error_list:
print(e.text)

except Exception as e:
print(f"{e}", file=sys.stderr)
sys.exit(1)

sys.exit(0)
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,6 @@
#
set -euo pipefail

#

# On most distros, 'waagent' is in PATH
if which waagent 2> /dev/null; then
exit 0
Expand All @@ -42,6 +40,7 @@ fi

# try some well-known locations
declare -a known_locations=(
"/usr/sbin/waagent"
"/usr/share/oem/bin/waagent"
)

Expand Down
18 changes: 9 additions & 9 deletions tests_e2e/orchestrator/scripts/install-agent
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,14 @@ else
fi
echo "Service name: $service_name"

#
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There was a race condition between get-agent-path and the agent service restart a few lines below (get-agent-path uses the pid file of the agent in its logic) so I moved the initialization here, before we stop and restart the service

# Find the path to the Agent's executable file
#
waagent=$(get-agent-path)
echo "Agent's path: $waagent"
$waagent --version
echo ""

#
# Install the package
#
Expand All @@ -98,20 +106,13 @@ service-start $service_name
#
# Verify that the new agent is running and output its status.
# Note that the extension handler may take some time to start so give 1 minute.
# Also, note that the default Python is set to Pypy, so before executing 'waagent' we need to set the
# Python path to the location of the azurelinuxagent module.
#
echo "Verifying agent installation..."

PYTHONPATH=$(get-agent-pythonpath)
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Now this is done in ~/bin/agent-dev so that all tests can benefit from this.

Also, now we set the path to the test agent instead of the agent installed in the image. The test agent is the current code, while the installed agent varies wildly across all the distros we test, which makes using considerably more difficult.

export PYTHONPATH

check-version() {
waagent=$(get-waagent-path)

for i in {0..5}
do
if $waagent --version | grep -E "Goal state agent:\s+$1" > /dev/null; then
if $waagent --version | grep -E "Goal state agent:\s+$version" > /dev/null; then
return 0
fi
sleep 10
Expand All @@ -128,7 +129,6 @@ else
exit_code=1
fi

waagent=$(get-waagent-path)
$waagent --version
printf "\n"
service-status $service_name
Expand Down
Loading