Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improvements in error handling on end-to-end tests #2716

Merged
merged 4 commits into from
Dec 20, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
186 changes: 95 additions & 91 deletions tests_e2e/orchestrator/lib/agent_test_suite.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#

from collections.abc import Callable
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The main change in this module is that I stopped using before_case and after_case for setup and cleanup.

The issue is that errors in before_case are not reported as failures, they simply make LISA skip test execution. Tests can be skipped for valid reasons (e.g. an unsupported OS), but skipping tests on setup failures without reporting an error is not correct, since those errors may go unnoticed on daily runs.

I replaced the use of before_case and after_case with custom code (mainly in the execute() method).

from pathlib import Path
import shutil
from shutil import rmtree

import makepkg

Expand All @@ -25,69 +25,42 @@
CustomScriptBuilder,
Logger,
Node,
TestSuite,
TestSuiteMetadata,
)
# E0401: Unable to import 'lisa.sut_orchestrator.azure.common' (import-error)
from lisa.sut_orchestrator.azure.common import get_node_context # pylint: disable=E0401

from azurelinuxagent.common.version import AGENT_VERSION


class AgentTestSuite(TestSuite):
class AgentTestScenario(object):
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This class used to be an specialization of LISA's TestSuite in order to use the before_case/after_case protocol, but since I am not using anymore now there is no need for that class relationship. Now this is a plain AgentTestScenario class (it is very similar in concept to a DCR scenario) .

I renamed the class, but will rename the py file until my next PR, otherwise reviewing the actual code changes would be more difficult.

"""
Base class for VM Agent tests. It provides initialization, cleanup, and utilities common to all VM Agent test suites.
Instances of this class are used to execute Agent test scenarios. It also provides facilities to execute commands over SSH.
"""
def __init__(self, metadata: TestSuiteMetadata):
super().__init__(metadata)
# The actual initialization happens in _initialize()
self._log = None
self._node = None
self._subscription_id = None
self._resource_group_name = None
self._vm_name = None
self._test_source_directory = None
self._working_directory = None
self._node_home_directory = None

def before_case(self, *_, **kwargs) -> None:
self._initialize(kwargs['node'], kwargs['log'])
self._setup_node()

def after_case(self, *_, **__) -> None:
try:
self._collect_node_logs()
finally:
self._clean_up()

def _initialize(self, node: Node, log: Logger) -> None:
self._node = node
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this is now done in init()

self._log = log

node_context = get_node_context(node)
self._subscription_id = node.features._platform.subscription_id
self._resource_group_name = node_context.resource_group_name
self._vm_name = node_context.vm_name

self._test_source_directory = AgentTestSuite._get_test_source_directory()
self._working_directory = Path().home()/"waagent-tmp"
self._node_home_directory = Path('/home')/self._node.connection_info['username']
class Context:
"""
Execution context for test scenarios, this information is passed to test scenarios by AgentTestScenario.execute()
"""
subscription_id: str
resource_group_name: str
vm_name: str
test_source_directory: Path
working_directory: Path
node_home_directory: Path

self._log.info(f"Test Node: {self._vm_name}")
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This logic is now in the _setup() method

self._log.info(f"Resource Group: {self._resource_group_name}")
self._log.info(f"Working directory: {self._working_directory}...")
def __init__(self, node: Node, log: Logger) -> None:
self._node: Node = node
self._log: Logger = log

if self._working_directory.exists():
self._log.info(f"Removing existing working directory: {self._working_directory}...")
try:
shutil.rmtree(self._working_directory.as_posix())
except Exception as exception:
self._log.warning(f"Failed to remove the working directory: {exception}")
self._working_directory.mkdir()
node_context = get_node_context(node)
self._context: AgentTestScenario.Context = AgentTestScenario.Context()
self._context.subscription_id = node.features._platform.subscription_id
self._context.resource_group_name = node_context.resource_group_name
self._context.vm_name = node_context.vm_name

def _clean_up(self) -> None:
self._log.info(f"Removing {self._working_directory}...")
shutil.rmtree(self._working_directory.as_posix(), ignore_errors=True)
self._context.test_source_directory = AgentTestScenario._get_test_source_directory()
self._context.working_directory = Path().home()/"waagent-tmp"
self._context.node_home_directory = Path('/home')/node.connection_info['username']

@staticmethod
def _get_test_source_directory() -> Path:
Expand All @@ -101,6 +74,29 @@ def _get_test_source_directory() -> Path:
path = path.parent
raise Exception("Can't find the test root directory (tests_e2e)")

def _setup(self) -> None:
"""
Prepares the test scenario for execution
"""
self._log.info(f"Test Node: {self._context.vm_name}")
self._log.info(f"Resource Group: {self._context.resource_group_name}")
self._log.info(f"Working directory: {self._context.working_directory}...")

if self._context.working_directory.exists():
self._log.info(f"Removing existing working directory: {self._context.working_directory}...")
try:
rmtree(self._context.working_directory.as_posix())
except Exception as exception:
self._log.warning(f"Failed to remove the working directory: {exception}")
self._context.working_directory.mkdir()
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

so, continue the setup if we failed to remove? or mkdir raise the exception if already exist?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

continue with warning. this is mainly for the dev scenario. the automation run works on fresh vms

Copy link
Contributor

@nagworld9 nagworld9 Dec 20, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I was reading mkkdir() raise exception if directory exists. So, in dev scenario it won't work if failed to remove

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

My point is we could raise the exception if we failed to remove because anyway in next step mkdir throws exception if folder exist.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yes, it will fail and there will be a warning with the info about the delete failure

the dev will get an error, see the warning in the log and fix the issue

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

My point is we could raise the exception if we failed to remove because anyway in next step mkdir throws exception if folder exist.

Same difference, with the possibility that the working directory goes away in-between the 2 calls and there is no error


def _clean_up(self) -> None:
"""
Cleans up any leftovers from the test scenario run.
"""
self._log.info(f"Removing {self._context.working_directory}...")
rmtree(self._context.working_directory.as_posix(), ignore_errors=True)

def _setup_node(self) -> None:
"""
Prepares the remote node for executing the test suite.
Expand All @@ -112,18 +108,10 @@ def _build_agent_package(self) -> Path:
"""
Builds the agent package and returns the path to the package.
"""
build_path = self._working_directory/"build"

# The same orchestrator machine may be executing multiple suites on the same test VM, or
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I removed the "*.done" files (they actually never worked, since I am removing the working directory on cleanup).

In the current DCR automation we use a new test VM to run 1 single scenario. I want to be able to run multiple scenarios on the same test VM in the new automation pipeline and the intention of those "*.done" files was to avoid duplicated setup. However, not only the implementation was incorrect, but that approach was cumbersome during test development. I will come up with a better approach when I start combining scenarios.

# the same suite on one or more test VMs; we use this file to mark the build is already done
build_done_path = self._working_directory/"build.done"
if build_done_path.exists():
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Did we remove this case because we're no longer executing multiple suites on the same VM?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not quite, I explained the motivation in the PR comment just a couple of lines above.

self._log.info("The agent build is already completed, will use existing package.")
else:
self._log.info(f"Building agent package to {build_path}")
makepkg.run(agent_family="Test", output_directory=str(build_path), log=self._log)
build_done_path.touch()
build_path = self._context.working_directory/"build"

self._log.info(f"Building agent package to {build_path}")
makepkg.run(agent_family="Test", output_directory=str(build_path), log=self._log)
package_path = build_path/"eggs"/f"WALinuxAgent-{AGENT_VERSION}.zip"
if not package_path.exists():
raise Exception(f"Can't find the agent package at {package_path}")
Expand All @@ -136,54 +124,70 @@ def _install_agent_on_node(self, agent_package: Path) -> None:
"""
Installs the given agent package on the test node.
"""
# The same orchestrator machine may be executing multiple suites on the same test VM,
# we use this file to mark the agent is already installed on the test VM.
install_done_path = self._working_directory/f"agent-install.{self._vm_name}.done"
if install_done_path.exists():
self._log.info(f"Package {agent_package} is already installed on {self._vm_name}...")
return

# The install script needs to unzip the agent package; ensure unzip is installed on the test node
self._log.info(f"Installing unzip on {self._vm_name}...")
self._log.info(f"Installing unzip on {self._context.vm_name}...")
self._node.os.install_packages("unzip")

self._log.info(f"Installing {agent_package} on {self._vm_name}...")
agent_package_remote_path = self._node_home_directory/agent_package.name
self._log.info(f"Copying {agent_package} to {self._vm_name}:{agent_package_remote_path}")
self._log.info(f"Installing {agent_package} on {self._context.vm_name}...")
agent_package_remote_path = self._context.node_home_directory/agent_package.name
self._log.info(f"Copying {agent_package} to {self._context.vm_name}:{agent_package_remote_path}")
self._node.shell.copy(agent_package, agent_package_remote_path)
self._execute_script_on_node(
self._test_source_directory/"orchestrator"/"scripts"/"install-agent",
self.execute_script_on_node(
self._context.test_source_directory/"orchestrator"/"scripts"/"install-agent",
parameters=f"--package {agent_package_remote_path} --version {AGENT_VERSION}",
sudo=True)

self._log.info("The agent was installed successfully.")
install_done_path.touch()

def _collect_node_logs(self) -> None:
"""
Collects the test logs from the remote machine and copied them to the local machine
Collects the test logs from the remote machine and copies them to the local machine
"""
# Collect the logs on the test machine into a compressed tarball
self._log.info("Collecting logs on test machine [%s]...", self._node.name)
self._execute_script_on_node(self._test_source_directory/"orchestrator"/"scripts"/"collect-logs", sudo=True)

# Copy the tarball to the local logs directory
remote_path = self._node_home_directory/"logs.tgz"
local_path = Path.home()/'logs'/'vm-logs-{0}.tgz'.format(self._node.name)
self._log.info(f"Copying {self._node.name}:{remote_path} to {local_path}")
self._node.shell.copy_back(remote_path, local_path)
try:
# Collect the logs on the test machine into a compressed tarball
self._log.info("Collecting logs on test machine [%s]...", self._node.name)
self.execute_script_on_node(self._context.test_source_directory/"orchestrator"/"scripts"/"collect-logs", sudo=True)

# Copy the tarball to the local logs directory
remote_path = self._context.node_home_directory/"logs.tgz"
local_path = Path.home()/'logs'/'vm-logs-{0}.tgz'.format(self._node.name)
self._log.info(f"Copying {self._node.name}:{remote_path} to {local_path}")
self._node.shell.copy_back(remote_path, local_path)
except Exception as e:
self._log.warning(f"Failed to collect logs from the test machine: {e}")
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added an exception handler since errors collecting logs should not be reported as test errors.


def execute(self, scenario: Callable[[Context], None]) -> None:
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This function is used to enforce the setup/execution/cleanup of test scenarios and replaces the previous use of before/after_case.

See agent_bvt.py for an example of its usage.

"""
Executes the given scenario
"""
try:
self._setup()
try:
self._setup_node()
scenario(self._context)
finally:
self._collect_node_logs()
finally:
self._clean_up()

def _execute_script_on_node(self, script_path: Path, parameters: str = "", sudo: bool = False) -> int:
def execute_script_on_node(self, script_path: Path, parameters: str = "", sudo: bool = False) -> int:
"""
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Made it public, added a docstring and did minor formatting improvements in its logging

Executes the given script on the test node; if 'sudo' is True, the script is executed using the sudo command.
"""
custom_script_builder = CustomScriptBuilder(script_path.parent, [script_path.name])
custom_script = self._node.tools[custom_script_builder]

command_line = f"{script_path} {parameters}"
self._log.info(f"Executing {command_line}")
if parameters == '':
command_line = f"{script_path}"
else:
command_line = f"{script_path} {parameters}"

self._log.info(f"Executing [{command_line}]")
result = custom_script.run(parameters=parameters, sudo=sudo)

# LISA appends stderr to stdout so no need to check for stderr
if result.exit_code != 0:
raise Exception(f"[{command_line}] failed.\n{result.stdout}")
raise Exception(f"Command [{command_line}] failed.\n{result.stdout}")

return result.exit_code

Expand Down
6 changes: 6 additions & 0 deletions tests_e2e/orchestrator/scripts/collect-logs
Original file line number Diff line number Diff line change
Expand Up @@ -15,5 +15,11 @@ tar --exclude='journal/*' --exclude='omsbundle' --exclude='omsagent' --exclude='
/var/lib/waagent/ \
/etc/waagent.conf

# tar exits with 1 on warnings; ignore those
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We are hitting a warning about the logs changing while we are creating the tarball. It's just a warning, but tar exits with code 1 (fatal errors are 2, see the man page for details)

exit_code=$?
if [ "$exit_code" != "1" ] && [ "$exit_code" != "0" ]; then
exit $exit_code
fi

chmod +r "$logs_file_name"

17 changes: 16 additions & 1 deletion tests_e2e/orchestrator/scripts/install-agent
Original file line number Diff line number Diff line change
@@ -1,7 +1,22 @@
#!/usr/bin/env bash

# Microsoft Azure Linux Agent
#
# Copyright 2018 Microsoft Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

set -euo pipefail

usage() (
Expand Down Expand Up @@ -59,7 +74,7 @@ echo "Restarting service..."
service $service_name stop

# Rename the previous log to ensure the new log starts with the agent we just installed
mv /var/log/waagent.log /var/log/waagent.pre-install.log
mv /var/log/waagent.log /var/log/waagent."$(date --iso-8601=seconds)".log
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added a timestamp to the previous log. Useful if the scenario is run multiple times on the same VM (e.g. during test development) or if multiple scenarios run on the same VM (in the future).


if command -v systemctl &> /dev/null; then
systemctl daemon-reload
Expand Down
18 changes: 18 additions & 0 deletions tests_e2e/orchestrator/scripts/run-scenarios
Original file line number Diff line number Diff line change
@@ -1,4 +1,22 @@
#!/usr/bin/env bash

# Microsoft Azure Linux Agent
#
# Copyright 2018 Microsoft Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

#
# This script runs on the container executing the tests. It creates the SSH keys (private and public) used
# to manage the test VMs taking the initial key value from the file shared by the container host, then it
Expand Down
32 changes: 32 additions & 0 deletions tests_e2e/scenarios/runbooks/samples/hello_world.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# Microsoft Azure Linux Agent
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added a "sample" LISA test suite that runs on the local machine. Useful to do quick tests/experiments with LISA

#
# Copyright 2018 Microsoft Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# E0401: Unable to import 'lisa' (import-error)
from lisa import ( # pylint: disable=E0401
Logger,
Node,
TestCaseMetadata,
TestSuite,
TestSuiteMetadata,
)


@TestSuiteMetadata(area="sample", category="", description="")
class HelloWorld(TestSuite):
@TestCaseMetadata(description="")
def main(self, node: Node, log: Logger) -> None:
log.info(f"Hello world from {node.os.name}!")
28 changes: 28 additions & 0 deletions tests_e2e/scenarios/runbooks/samples/local.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# Microsoft Azure Linux Agent
#
# Copyright 2018 Microsoft Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

extension:
- "."
environment:
environments:
- nodes:
- type: local
notifier:
- type: console
testcase:
- criteria:
area: sample
Loading