From 430aeca33ea7d27d7836b90edbd60df330fd1892 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Endre=20F=C3=BCl=C3=B6p?= <gamesh411@gmail.com>
Date: Fri, 20 Nov 2020 16:42:26 +0100
Subject: [PATCH] [analyzer][fix] Invocation list YAML config no longer breaks
 lines

The config file 'invocation-list.yml' may contain line breaks inside
list items. However, this is not properly supported by the YAML parser
of the analyzer. This fix sets the line width parameter of the YAML dump
to a very high value to mitigate the problem.
---
 .../analyzers/clangsa/ctu_manager.py          |  6 +-
 analyzer/tests/functional/ctu/test_ctu.py     | 66 +++++++++++++++----
 .../ctu/test_files/complex_buildlog.json      | 13 ++++
 analyzer/tests/libtest/env.py                 | 27 ++++++++
 4 files changed, 98 insertions(+), 14 deletions(-)
 create mode 100644 analyzer/tests/functional/ctu/test_files/complex_buildlog.json

diff --git a/analyzer/codechecker_analyzer/analyzers/clangsa/ctu_manager.py b/analyzer/codechecker_analyzer/analyzers/clangsa/ctu_manager.py
index 74e8f2930b..8ce9387cb1 100644
--- a/analyzer/codechecker_analyzer/analyzers/clangsa/ctu_manager.py
+++ b/analyzer/codechecker_analyzer/analyzers/clangsa/ctu_manager.py
@@ -16,6 +16,7 @@
 import shutil
 import tempfile
 from pathlib import Path
+from sys import maxsize
 
 from codechecker_common.logger import get_logger
 
@@ -83,7 +84,10 @@ def generate_invocation_list(triple_arch, action, source, config, env):
     cmd.extend(['-D__clang_analyzer__', '-w'])
 
     # The YAML mapping entry already has a newline at the end.
-    invocation_line = yaml.dump({str(source_path): cmd})
+    # Line width is set to sys.maxsize for compatibility with the YAML parser
+    # of LLVM, which does not properly support line breaks inside list items.
+    # This keeps the dump from wrapping the textual form of long list items.
+    invocation_line = yaml.dump({str(source_path): cmd}, width=maxsize)
 
     LOG.debug_analyzer("Appending invocation list item '%s'", invocation_line)
 
diff --git a/analyzer/tests/functional/ctu/test_ctu.py b/analyzer/tests/functional/ctu/test_ctu.py
index c7a7f65c61..f57a739e0e 100644
--- a/analyzer/tests/functional/ctu/test_ctu.py
+++ b/analyzer/tests/functional/ctu/test_ctu.py
@@ -10,11 +10,12 @@
 
 
 import glob
-import json
 import os
 import shutil
 import unittest
 
+from typing import IO
+
 from libtest import env
 from libtest.codechecker import call_command
 from libtest.ctu_decorators import makeSkipUnlessCTUCapable, \
@@ -58,21 +59,18 @@ def setUp(self):
         print("'analyze' reported CTU-on-demand-compatibility? " +
               str(getattr(self, ON_DEMAND_ATTR)))
 
-        # Fix the "template" build JSONs to contain a proper directory
-        # so the tests work.
-        raw_buildlog = os.path.join(self.test_dir, 'buildlog.json')
-        with open(raw_buildlog,
-                  encoding="utf-8", errors="ignore") as log_file:
-            build_json = json.load(log_file)
-            for command in build_json:
-                command['directory'] = self.test_dir
+        self.buildlog = os.path.join(self.test_workspace, 'buildlog.json')
+        self.complex_buildlog = os.path.join(
+            self.test_workspace, 'complex_buildlog.json')
+
+        # Fix the "template" build JSONs to contain a proper directory.
+        env.adjust_buildlog(
+            'buildlog.json', self.test_dir, self.test_workspace)
+        env.adjust_buildlog(
+            'complex_buildlog.json', self.test_dir, self.test_workspace)
 
         self.__old_pwd = os.getcwd()
         os.chdir(self.test_workspace)
-        self.buildlog = os.path.join(self.test_workspace, 'buildlog.json')
-        with open(self.buildlog, 'w',
-                  encoding="utf-8", errors="ignore") as log_file:
-            json.dump(build_json, log_file)
 
     def tearDown(self):
         """ Tear down workspace."""
@@ -228,3 +226,45 @@ def test_ctu_makefile_generation(self):
         self.assertIn("defect(s) in lib.c", output)
         self.assertIn("lib.c:3:", output)
         self.assertIn("[core.NullDereference]", output)
+
+    @skipUnlessCTUCapable
+    @skipUnlessCTUOnDemandCapable
+    def test_ctu_ondemand_yaml_format(self):
+        """ Test the generated YAML used in CTU on-demand mode.
+        The YAML file should not contain newlines in individual entries in the
+        generated textual format. """
+
+        cmd = [self._codechecker_cmd, 'analyze',
+               '-o', self.report_dir,
+               '--analyzers', 'clangsa',
+               '--ctu-collect',  # ctu-directory is needed, and it remains
+                                 # intact only if a single ctu-phase is
+                                 # specified
+               '--ctu-ast-mode', 'parse-on-demand',
+               self.complex_buildlog]
+        call_command(cmd, cwd=self.test_dir, env=self.env)
+
+        ctu_dir = os.path.join(self.report_dir, 'ctu-dir')
+
+        # In order to be architecture-invariant, ctu directory is searched for
+        # invocation list files.
+        invocation_list_paths = list(glob.glob(
+            os.path.join(ctu_dir, '*', 'invocation-list.yml')))
+
+        # At least one invocation list should exist.
+        self.assertGreaterEqual(len(invocation_list_paths), 1)
+
+        # Assert that every line begins, after optional indentation, with
+        # either - or / to approximate that the line is not the continuation
+        # of a line-broken list entry. If no entry is broken across lines,
+        # every line starts with either a / (an absolute path serving as a
+        # mapping key) or a - (a list entry). This format requirement is a
+        # workaround for the LLVM YAML parser.
+        def assert_no_linebreak(invocation_list_file: IO):
+            invocation_lines = invocation_list_file.readlines()
+            for line in invocation_lines:
+                self.assertRegex(line, '^ *[-/]')
+
+        for invocation_list_path in invocation_list_paths:
+            with open(invocation_list_path) as invocation_list_file:
+                assert_no_linebreak(invocation_list_file)
diff --git a/analyzer/tests/functional/ctu/test_files/complex_buildlog.json b/analyzer/tests/functional/ctu/test_files/complex_buildlog.json
new file mode 100644
index 0000000000..e7dcf1400f
--- /dev/null
+++ b/analyzer/tests/functional/ctu/test_files/complex_buildlog.json
@@ -0,0 +1,13 @@
+[
+  {
+    "directory": ".",
+    "command": "gcc -c lib.c -o lib.o",
+    "file": "lib.c"
+  },
+  {
+    "directory": ".",
+    "command": "gcc -c main.c -o main.o -DUNUSED_DEFINE=\"Long-long text to test the yaml dump does not break the line in the invocation-list.yml file even when it comes to really long lines like this one.\" -DUNUSED_FLAGS=\"-DTEST_DEFINE_INSIDE_ANOTHER_DEFINE=\\\"Emulate embedded content to ensure the generated yaml is correct, and the on-demand ctu analysis behaves well\\\"\" -DUNUSED_FLAGS=\"-Wall -Wmissing-prototypes -Wpoint-arith -Wdeclaration-after-statement -Wendif-labels -Wmissing-format-attribute -Wformat-security -fno-strict-aliasing -fwrapv -fexcess-precision=standard -O2\"",
+    "file": "main.c"
+  }
+]
+
diff --git a/analyzer/tests/libtest/env.py b/analyzer/tests/libtest/env.py
index 9be0c72295..2f84325ad5 100644
--- a/analyzer/tests/libtest/env.py
+++ b/analyzer/tests/libtest/env.py
@@ -14,6 +14,8 @@
 import os
 import tempfile
 
+from pathlib import Path
+
 from functional import PKG_ROOT
 from functional import REPO_ROOT
 
@@ -77,3 +79,28 @@ def export_test_cfg(workspace, test_cfg):
 
 def setup_test_proj_cfg(workspace):
     return import_test_cfg(workspace)['test_project']
+
+
+def adjust_buildlog(buildlog_file: str, source_dir, target_dir):
+    """ Reads the buildlog found at `source_dir` / `buildlog_file`. Overwrites
+    the directory entries with `source_dir`. Finally the modified file is
+    written with the same filename inside `target_dir`.
+
+    Parameters
+    ----------
+    buildlog_file: str
+        the file name of the buildlog
+    source_dir: str or os.PathLike
+        the directory where the file with name `buildlog_file` is
+    target_dir: str or os.PathLike
+        the directory where the adjusted contents are written
+    """
+    file_contents = Path(source_dir, buildlog_file).read_text(
+        encoding="utf-8", errors="ignore")
+    json_representation = json.loads(file_contents)
+
+    for command in json_representation:
+        command['directory'] = str(source_dir)
+
+    Path(target_dir, buildlog_file).write_text(
+        json.dumps(json_representation), encoding="utf-8", errors="ignore")