Create individual log files for each (distro, test suite) pair (#2731)

* Create individual log files for each test * cleanup * typo * remove duplicate Co-authored-by: narrieta <narrieta>
Azure · Jan 18, 2023 · 4b8fe5e · 4b8fe5e
1 parent abf4161
commit 4b8fe5e
Show file tree

Hide file tree

Showing 10 changed files with 334 additions and 93 deletions.
diff --git a/tests_e2e/orchestrator/lib/agent_test_suite.py b/tests_e2e/orchestrator/lib/agent_test_suite.py
@@ -14,6 +14,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
+import logging
 import re
 
 from assertpy import fail
@@ -27,6 +28,7 @@
 #     E0401: Unable to import 'lisa.sut_orchestrator.azure.common' (import-error)
 from lisa import (  # pylint: disable=E0401
     CustomScriptBuilder,
+    Logger,
     Node,
     TestSuite,
     TestSuiteMetadata
@@ -39,7 +41,31 @@
 from tests_e2e.scenarios.lib.agent_test import AgentTest
 from tests_e2e.scenarios.lib.agent_test_context import AgentTestContext
 from tests_e2e.scenarios.lib.identifiers import VmIdentifier
-from tests_e2e.scenarios.lib.logging import log
+from tests_e2e.scenarios.lib.logging import log as agent_test_logger  # Logger used by the tests
+
+
+def _initialize_lisa_logger():
+    """
+    Customizes the LISA logger.
+
+    The default behavior of this logger is too verbose, which makes reading the logs difficult. We set up a more succinct
+    formatter and decrease the log level to INFO (the default is VERBOSE). In the future we may consider making this
+    customization settable at runtime in case we need to debug LISA issues.
+    """
+    logger: Logger = logging.getLogger("lisa")
+
+    logger.setLevel(logging.INFO)
+
+    formatter = logging.Formatter('%(asctime)s.%(msecs)03d [%(levelname)s] [%(threadName)s] %(message)s', datefmt="%Y-%m-%dT%H:%M:%SZ")
+    for handler in logger.handlers:
+        handler.setFormatter(formatter)
+
+
+#
+# We want to customize the LISA logger as early as possible, so we do it when this module is first imported. That will
+# happen early in the LISA workflow, when it loads the test suites to execute.
+#
+_initialize_lisa_logger()
 
 
 class AgentTestSuite(TestSuite):
@@ -52,6 +78,7 @@ class _Context(AgentTestContext):
         def __init__(self, vm: VmIdentifier, paths: AgentTestContext.Paths, connection: AgentTestContext.Connection):
             super().__init__(vm=vm, paths=paths, connection=connection)
             # These are initialized by AgentTestSuite._set_context().
+            self.log: Logger = None
             self.node: Node = None
             self.runbook_name: str = None
             self.suite_name: str = None
@@ -61,7 +88,7 @@ def __init__(self, metadata: TestSuiteMetadata) -> None:
         # The context is initialized by _set_context() via the call to execute()
         self.__context: AgentTestSuite._Context = None
 
-    def _set_context(self, node: Node):
+    def _set_context(self, node: Node, log: Logger):
         connection_info = node.connection_info
         node_context = get_node_context(node)
         runbook = node.capability.get_extended_runbook(AzureNodeSchema, AZURE)
@@ -84,15 +111,23 @@ def _set_context(self, node: Node):
                 private_key_file=connection_info['private_key_file'],
                 ssh_port=connection_info['port']))
 
+        self.__context.log = log
         self.__context.node = node
-        self.__context.suite_name = f"{self._metadata.full_name}:{runbook.marketplace.offer}-{runbook.marketplace.sku}"
+        self.__context.suite_name = f"{self._metadata.full_name}_{runbook.marketplace.offer}-{runbook.marketplace.sku}"
 
     @property
     def context(self):
         if self.__context is None:
             raise Exception("The context for the AgentTestSuite has not been initialized")
         return self.__context
 
+    @property
+    def _log(self) -> Logger:
+        """
+        Returns a reference to the LISA Logger.
+        """
+        return self.context.log
+
     #
     # Test suites within the same runbook may be executed concurrently, and setup needs to be done only once.
     # We use this lock to allow only 1 thread to do the setup. Setup completion is marked using the 'completed'
@@ -110,20 +145,20 @@ def _setup(self) -> None:
         AgentTestSuite._setup_lock.acquire()
 
         try:
-            log.info("")
-            log.info("**************************************** [Build] ****************************************")
-            log.info("")
+            self._log.info("")
+            self._log.info("**************************************** [Build] ****************************************")
+            self._log.info("")
             completed: Path = self.context.working_directory/"completed"
 
             if completed.exists():
-                log.info("Found %s. Build has already been done, skipping", completed)
+                self._log.info("Found %s. Build has already been done, skipping", completed)
                 return
 
-            log.info("Creating working directory: %s", self.context.working_directory)
+            self._log.info("Creating working directory: %s", self.context.working_directory)
             self.context.working_directory.mkdir(parents=True)
             self._build_agent_package()
 
-            log.info("Completed setup, creating %s", completed)
+            self._log.info("Completed setup, creating %s", completed)
             completed.touch()
 
         finally:
@@ -133,15 +168,15 @@ def _build_agent_package(self) -> None:
         """
         Builds the agent package and returns the path to the package.
         """
-        log.info("Building agent package to %s", self.context.working_directory)
+        self._log.info("Building agent package to %s", self.context.working_directory)
 
-        makepkg.run(agent_family="Test", output_directory=str(self.context.working_directory), log=log)
+        makepkg.run(agent_family="Test", output_directory=str(self.context.working_directory), log=self._log)
 
         package_path: Path = self._get_agent_package_path()
         if not package_path.exists():
             raise Exception(f"Can't find the agent package at {package_path}")
 
-        log.info("Built agent package as %s", package_path)
+        self._log.info("Built agent package as %s", package_path)
 
     def _get_agent_package_path(self) -> Path:
         """
@@ -158,12 +193,12 @@ def _setup_node(self) -> None:
         """
         Prepares the remote node for executing the test suite.
         """
-        log.info("")
-        log.info("************************************** [Node Setup] **************************************")
-        log.info("")
-        log.info("Test Node: %s", self.context.vm.name)
-        log.info("Resource Group: %s", self.context.vm.resource_group)
-        log.info("")
+        self._log.info("")
+        self._log.info("************************************** [Node Setup] **************************************")
+        self._log.info("")
+        self._log.info("Test Node: %s", self.context.vm.name)
+        self._log.info("Resource Group: %s", self.context.vm.resource_group)
+        self._log.info("")
 
         self._install_agent_on_node()
 
@@ -174,103 +209,117 @@ def _install_agent_on_node(self) -> None:
         agent_package_path: Path = self._get_agent_package_path()
 
         # The install script needs to unzip the agent package; ensure unzip is installed on the test node
-        log.info("Installing unzip tool on %s", self.context.node.name)
+        self._log.info("Installing unzip tool on %s", self.context.node.name)
         self.context.node.os.install_packages("unzip")
 
-        log.info("Installing %s on %s", agent_package_path, self.context.node.name)
+        self._log.info("Installing %s on %s", agent_package_path, self.context.node.name)
         agent_package_remote_path = self.context.remote_working_directory/agent_package_path.name
-        log.info("Copying %s to %s:%s", agent_package_path, self.context.node.name, agent_package_remote_path)
+        self._log.info("Copying %s to %s:%s", agent_package_path, self.context.node.name, agent_package_remote_path)
         self.context.node.shell.copy(agent_package_path, agent_package_remote_path)
         self.execute_script_on_node(
             self.context.test_source_directory/"orchestrator"/"scripts"/"install-agent",
             parameters=f"--package {agent_package_remote_path} --version {AGENT_VERSION}",
             sudo=True)
 
-        log.info("The agent was installed successfully.")
+        self._log.info("The agent was installed successfully.")
 
     def _collect_node_logs(self) -> None:
         """
         Collects the test logs from the remote machine and copies them to the local machine
         """
         try:
             # Collect the logs on the test machine into a compressed tarball
-            log.info("Collecting logs on test machine [%s]...", self.context.node.name)
+            self._log.info("Collecting logs on test machine [%s]...", self.context.node.name)
             self.execute_script_on_node(self.context.test_source_directory/"orchestrator"/"scripts"/"collect-logs", sudo=True)
 
             # Copy the tarball to the local logs directory
             remote_path = "/tmp/waagent-logs.tgz"
-            local_path = Path.home()/'logs'/'vm-logs-{0}.tgz'.format(self.context.suite_name)
-            log.info("Copying %s:%s to %s", self.context.node.name, remote_path, local_path)
+            local_path = Path.home()/'logs'/'{0}.tgz'.format(self.context.suite_name)
+            self._log.info("Copying %s:%s to %s", self.context.node.name, remote_path, local_path)
             self.context.node.shell.copy_back(remote_path, local_path)
         except:  # pylint: disable=bare-except
-            log.exception("Failed to collect logs from the test machine")
+            self._log.exception("Failed to collect logs from the test machine")
 
-    def execute(self, node: Node, test_suite: List[Type[AgentTest]]) -> None:
+    def execute(self, node: Node, log: Logger, test_suite: List[Type[AgentTest]]) -> None:
         """
         Executes each of the AgentTests in the given List. Note that 'test_suite' is a list of test classes, rather than
         instances of the test class (this method will instantiate each of these test classes).
         """
-        self._set_context(node)
+        self._set_context(node, log)
 
-        failed: List[str] = []
+        failed: List[str] = []  # List of failed tests (names only)
 
+        # The thread name is added to self._log, set it to the current test suite while we execute it
         thread_name = current_thread().name
         current_thread().name = self.context.suite_name
 
+        # We create a separate log file for the test suite.
+        suite_log_file: Path = Path.home()/'logs'/f"{self.context.suite_name}.log"
+        agent_test_logger.set_current_thread_log(suite_log_file)
+
         try:
             self._setup()
 
             try:
                 self._setup_node()
 
-                log.info("")
-                log.info("**************************************** [%s] ****************************************", self._metadata.full_name)
-                log.info("")
+                agent_test_logger.info("")
+                agent_test_logger.info("**************************************** %s ****************************************", self.context.suite_name)
+                agent_test_logger.info("")
 
                 results: List[str] = []
 
                 for test in test_suite:
-                    try:
-                        log.info("******************** [%s]", test.__name__)
-                        log.info("")
+                    result: str = "[UNKNOWN]"
+                    test_full_name = f"{self.context.suite_name} {test.__name__}"
+                    agent_test_logger.info("******** Executing %s", test_full_name)
+                    self._log.info("******** Executing %s", test_full_name)
+                    agent_test_logger.info("")
 
+                    try:
                         test(self.context).run()
-
-                        result = f"[Passed] {test.__name__}"
-
-                        log.info("")
-                        log.info("******************** %s", result)
-                        log.info("")
-
-                        results.append(result)
+                        result = f"[Passed] {test_full_name}"
+                    except AssertionError as e:
+                        failed.append(test.__name__)
+                        result = f"[Failed] {test_full_name}"
+                        agent_test_logger.error("%s", e)
+                        self._log.error("%s", e)
                     except:  # pylint: disable=bare-except
-                        result = f"[Failed] {test.__name__}"
-
-                        log.info("")
-                        log.exception("******************** %s\n", result)
-                        log.info("")
-
-                        results.append(result)
                         failed.append(test.__name__)
-
-                log.info("**************************************** [Test Results] ****************************************")
-                log.info("")
+                        result = f"[Error] {test_full_name}"
+                        agent_test_logger.exception("UNHANDLED EXCEPTION IN %s", test_full_name)
+                        self._log.exception("UNHANDLED EXCEPTION IN %s", test_full_name)
+
+                    agent_test_logger.info("******** %s", result)
+                    agent_test_logger.info("")
+                    self._log.info("******** %s", result)
+                    results.append(result)
+
+                agent_test_logger.info("")
+                agent_test_logger.info("********* [Test Results]")
+                agent_test_logger.info("")
                 for r in results:
-                    log.info("\t%s", r)
-                log.info("")
+                    agent_test_logger.info("\t%s", r)
+                agent_test_logger.info("")
+
             finally:
                 self._collect_node_logs()
 
         except:   # pylint: disable=bare-except
-            # Log the error here so the it is decorated with the thread name, then re-raise
-            log.exception("Test suite failed")
+            agent_test_logger.exception("UNHANDLED EXCEPTION IN %s", self.context.suite_name)
+            # Note that we report the error to the LISA log and then re-raise it. We log it here
+            # so that the message is decorated with the thread name in the LISA log; we re-raise
+            # to let LISA know the test errored out (LISA will report that error one more time
+            # in its log)
+            self._log.exception("UNHANDLED EXCEPTION IN %s", self.context.suite_name)
             raise
 
         finally:
             self._clean_up()
+            agent_test_logger.close_current_thread_log()
             current_thread().name = thread_name
 
-        # Fail the entire test suite if any test failed
+        # Fail the entire test suite if any test failed; this exception is handled by LISA
         if len(failed) > 0:
             fail(f"{[self.context.suite_name]} One or more tests failed: {failed}")
 
@@ -286,16 +335,16 @@ def execute_script_on_node(self, script_path: Path, parameters: str = "", sudo:
         else:
             command_line = f"{script_path} {parameters}"
 
-        log.info("Executing [%s]", command_line)
+        self._log.info("Executing [%s]", command_line)
 
         result = custom_script.run(parameters=parameters, sudo=sudo)
 
         if result.stdout != "":
             separator = "\n" if "\n" in result.stdout else " "
-            log.info("stdout:%s%s", separator, result.stdout)
+            self._log.info("stdout:%s%s", separator, result.stdout)
         if result.stderr != "":
             separator = "\n" if "\n" in result.stderr else " "
-            log.error("stderr:%s%s", separator, result.stderr)
+            self._log.error("stderr:%s%s", separator, result.stderr)
 
         if result.exit_code != 0:
             raise Exception(f"[{command_line}] failed. Exit code: {result.exit_code}")

diff --git a/tests_e2e/orchestrator/scripts/run-scenarios b/tests_e2e/orchestrator/scripts/run-scenarios
@@ -35,10 +35,15 @@ cp "$HOME/id_rsa" "$HOME/.ssh"
 chmod 700 "$HOME/.ssh/id_rsa"
 ssh-keygen -y -f "$HOME/.ssh/id_rsa" > "$HOME/.ssh/id_rsa.pub"
 
+#
 # Now start the runbook
+#
+lisa_logs="$HOME/logs/lisa"
+
 lisa \
   --runbook "$HOME/WALinuxAgent/tests_e2e/scenarios/runbooks/daily.yml" \
-  --log_path "$HOME/logs" \
-  --working_path "$HOME/logs" \
+  --log_path "$lisa_logs" \
+  --working_path "$lisa_logs" \
   -v subscription_id:"$SUBSCRIPTION_ID" \
-  -v identity_file:"$HOME/.ssh/id_rsa"
+  -v identity_file:"$HOME/.ssh/id_rsa" \
+  || true # force a success exit code to allow execution to continue when a test fails
diff --git a/tests_e2e/pipeline/scripts/execute_tests.sh b/tests_e2e/pipeline/scripts/execute_tests.sh
@@ -29,17 +29,17 @@ docker run --rm \
 sudo chown "$USER" "$BUILD_SOURCESDIRECTORY"
 sudo find "$BUILD_ARTIFACTSTAGINGDIRECTORY" -exec chown "$USER" {} \;
 
-# LISA organizes its logs in a tree similar to
+# The LISA run will produce a tree similar to
 #
-#    .../20221130
-#    .../20221130/20221130-160013-749
-#    .../20221130/20221130-160013-749/environments
-#    .../20221130/20221130-160013-749/lisa-20221130-160013-749.log
-#    .../20221130/20221130-160013-749/lisa.junit.xml
+#    $BUILD_ARTIFACTSTAGINGDIRECTORY/lisa/20221130
+#    $BUILD_ARTIFACTSTAGINGDIRECTORY/lisa/20221130/20221130-160013-749
+#    $BUILD_ARTIFACTSTAGINGDIRECTORY/lisa/20221130/20221130-160013-749/environments
+#    $BUILD_ARTIFACTSTAGINGDIRECTORY/lisa/20221130/20221130-160013-749/lisa-20221130-160013-749.log
+#    $BUILD_ARTIFACTSTAGINGDIRECTORY/lisa/20221130/20221130-160013-749/lisa.junit.xml
 #    etc
 #
-# Remove the first 2 levels of the tree (which indicate the time of the test run) to make navigation
+# Remove the 2 levels of the tree that indicate the time of the test run to make navigation
 # in the Azure Pipelines UI easier.
 #
-mv "$BUILD_ARTIFACTSTAGINGDIRECTORY"/[0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9]/*/* "$BUILD_ARTIFACTSTAGINGDIRECTORY"
-rm -r "$BUILD_ARTIFACTSTAGINGDIRECTORY"/[0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9]
+mv "$BUILD_ARTIFACTSTAGINGDIRECTORY"/lisa/[0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9]/*/* "$BUILD_ARTIFACTSTAGINGDIRECTORY"/lisa
+rm -r "$BUILD_ARTIFACTSTAGINGDIRECTORY"/lisa/[0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9]
diff --git a/tests_e2e/scenarios/lib/agent_test_context.py b/tests_e2e/scenarios/lib/agent_test_context.py
@@ -32,8 +32,7 @@ class AgentTestContext:
           concurrency level of the runbook.
     """
     class Paths:
-        # E1101: Instance of 'list' has no '_path' member (no-member)
-        DEFAULT_TEST_SOURCE_DIRECTORY = Path(tests_e2e.__path__._path[0])  # pylint: disable=E1101
+        DEFAULT_TEST_SOURCE_DIRECTORY = Path(tests_e2e.__path__[0])
 
         def __init__(
             self,