diff --git a/docs/usage.md b/docs/usage.md index 13fe2869cb..baca2d6dd3 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -423,16 +423,12 @@ enclosing scope of the bug location (function signature, class, namespace). ## How reports are counted? You can list analysis reports in two ways: -1. Using the **`CodeChecker parse`** command, which **does not do deduplication**. -2. Reports view of the **Web UI**, which **does deduplication**. +1. Using the **`CodeChecker parse`** command. +2. Reports view of the **Web UI**. -These two views may show slightly different report list and counts based on how -duplicate findings or findings with the same hash identifier are rendered. +Both of them do **deduplication**: they will not show the same bug report +multiple times even if the analyzer found it multiple times. -The `CodeChecker parse` command does not do deduplication. -It lists reports simply as found by the -analyzers and always lists all duplicate and similar findings. - You may find the same bug report multiple times for two reasons: 1) The same source file is analyzed multiple times (because the `compile_commmands.json` contains the build command multiple times) @@ -440,9 +436,6 @@ then the same findings will be listed multiple times. 2) All findings that are found in headers will be shown as many times as many source file include that header. -Web UI reports view on the other hand does deduplication: It will not show -the same bug report two times even if the analyzer found it multiple times. 
- **Example:** ```c++ //lib.h: @@ -501,15 +494,7 @@ Found no defects while analyzing a.c 3, lib.c:2:1: Entered call from 'h' 4, lib.c:3:11: Division by zero -[HIGH] lib.h:1:30: Dereference of undefined pointer value [core.NullDereference] -inline int div_h(){int *p; *p=4;}; - ^ - Report hash: 6e7a6b71ac1a26751b7a7f7eea80f5da - Steps: - 1, lib.h:1:20: 'p' declared without an initial value - 2, lib.h:1:30: Dereference of undefined pointer value - -Found 2 defect(s) while analyzing b.c +Found 1 defect(s) while analyzing b.c [HIGH] lib.c:3:11: Division by zero [core.DivideZero] return 1/b; @@ -521,15 +506,7 @@ Found 2 defect(s) while analyzing b.c 3, lib.c:2:1: Entered call from 'f' 4, lib.c:3:11: Division by zero -[HIGH] lib.h:1:30: Dereference of undefined pointer value [core.NullDereference] -inline int div_h(){int *p; *p=4;}; - ^ - Report hash: 6e7a6b71ac1a26751b7a7f7eea80f5da - Steps: - 1, lib.h:1:20: 'p' declared without an initial value - 2, lib.h:1:30: Dereference of undefined pointer value - -Found 2 defect(s) while analyzing a.c +Found 1 defect(s) while analyzing a.c Found no defects while analyzing b.c Found no defects while analyzing lib.c @@ -538,16 +515,15 @@ Found no defects while analyzing lib.c ----------------------- Filename | Report count ----------------------- -lib.h | 3 +lib.h | 1 lib.c | 2 ----------------------- ``` -These results are printed without deduplication and uniqueing. +These results are printed with deduplication but without uniqueing. As you can see the *dereference of undefined pointer value* error in the -`lib.h` is printed 3 times, because the header is included from -`a.c, b.c, lib.c`. All three findings have the same Report Identifier value. -The two division by zero errors from `a.c` and `b.c` are printed also separately. +`lib.h` is printed only once, even though the header is included from +`a.c, b.c, lib.c`. 
In deduplication mode and without uniqueing (in the Web UI) the reports in lib.h would be shown only once, as all three findings are identical. So in diff --git a/libcodechecker/analyze/plist_parser.py b/libcodechecker/analyze/plist_parser.py index 2e40b1652c..c2b4640410 100644 --- a/libcodechecker/analyze/plist_parser.py +++ b/libcodechecker/analyze/plist_parser.py @@ -40,8 +40,8 @@ from libcodechecker import util from libcodechecker.logger import get_logger -from libcodechecker.report import Report -from libcodechecker.report import generate_report_hash +from libcodechecker.report import Report, generate_report_hash, \ + get_report_path_hash from libcodechecker.source_code_comment_handler import \ SourceCodeCommentHandler, skip_suppress_status @@ -326,6 +326,7 @@ def __init__(self, src_comment_handler, skip_handler, severity_map, + processed_path_hashes, analyzer_type="clangsa"): self.__analyzer_type = analyzer_type @@ -333,6 +334,7 @@ def __init__(self, self.__print_steps = False self.src_comment_handler = src_comment_handler self.skiplist_handler = skip_handler + self._processed_path_hashes = processed_path_hashes @property def print_steps(self): @@ -416,6 +418,16 @@ def write(self, files, reports, analyzed_source_file, output=sys.stdout): non_suppressed = 0 for report in reports: + path_hash = get_report_path_hash(report, files) + if path_hash in self._processed_path_hashes: + LOG.debug("Not showing report because it is a deduplication " + "of an already processed report!") + LOG.debug("Path hash: %s", path_hash) + LOG.debug(report) + continue + + self._processed_path_hashes.add(path_hash) + events = [i for i in report.bug_path if i.get('kind') == 'event'] f_path = files[events[-1]['location']['file']] if self.skiplist_handler and \ diff --git a/libcodechecker/cmd/cmd_line_client.py b/libcodechecker/cmd/cmd_line_client.py index 72c28a923b..fee5640ab0 100644 --- a/libcodechecker/cmd/cmd_line_client.py +++ b/libcodechecker/cmd/cmd_line_client.py @@ -25,7 +25,7 
@@ from libcodechecker.libclient.client import handle_auth from libcodechecker.libclient.client import setup_client from libcodechecker.output_formatters import twodim_to_str -from libcodechecker.report import Report +from libcodechecker.report import Report, get_report_path_hash from libcodechecker.source_code_comment_handler import SourceCodeCommentHandler from libcodechecker.util import split_server_url @@ -288,6 +288,7 @@ def get_diff_results(client, baseids, cmp_data): def get_report_dir_results(reportdir): all_reports = [] + processed_path_hashes = set() for filename in os.listdir(reportdir): if filename.endswith(".plist"): file_path = os.path.join(reportdir, filename) @@ -295,9 +296,19 @@ def get_report_dir_results(reportdir): try: files, reports = plist_parser.parse_plist(file_path) for report in reports: + path_hash = get_report_path_hash(report, files) + if path_hash in processed_path_hashes: + LOG.debug("Not showing report because it is a " + "deduplication of an already processed " + "report!") + LOG.debug("Path hash: %s", path_hash) + LOG.debug(report) + continue + + processed_path_hashes.add(path_hash) report.main['location']['file_name'] = \ files[int(report.main['location']['file'])] - all_reports.extend(reports) + all_reports.append(report) except Exception as ex: LOG.error('The generated plist is not valid!') diff --git a/libcodechecker/libhandlers/parse.py b/libcodechecker/libhandlers/parse.py index 4b22ac86fa..d7e8f4a81b 100644 --- a/libcodechecker/libhandlers/parse.py +++ b/libcodechecker/libhandlers/parse.py @@ -161,7 +161,8 @@ def arg_match(options): parser.set_defaults(func=__handle) -def parse(f, context, metadata_dict, suppress_handler, skip_handler, steps): +def parse(f, context, metadata_dict, suppress_handler, skip_handler, steps, + processed_path_hashes): """ Prints the results in the given file to the standard output in a human- readable format. 
@@ -177,7 +178,8 @@ def parse(f, context, metadata_dict, suppress_handler, skip_handler, steps): rh = plist_parser.PlistToPlaintextFormatter(suppress_handler, skip_handler, - context.severity_map) + context.severity_map, + processed_path_hashes) rh.print_steps = steps @@ -275,6 +277,8 @@ def skip_html_report_data_handler(report_hash, source_file, report_line, if 'skipfile' in args: skip_handler = SkipListHandler(args.skipfile) + processed_path_hashes = set() + for input_path in args.input: input_path = os.path.abspath(input_path) @@ -330,7 +334,8 @@ def skip_html_report_data_handler(report_hash, source_file, report_line, metadata_dict, suppress_handler, skip_handler, - 'print_steps' in args) + 'print_steps' in args, + processed_path_hashes) file_change = file_change.union(f_change) severity_stats.update(Counter(report_stats.get('severity', diff --git a/libcodechecker/report.py b/libcodechecker/report.py index 57af7d9cd4..a4773bcf8f 100644 --- a/libcodechecker/report.py +++ b/libcodechecker/report.py @@ -15,7 +15,6 @@ import json import os -import libcodechecker.util as util from libcodechecker.logger import get_logger from libcodechecker.util import get_line @@ -147,6 +146,31 @@ def compare_ctrl_sections(curr, prev): return '' +def get_report_path_hash(report, files): + """ + Returns path hash for the given report. This can be used to filter + deduplications of multiple reports. 
+ """ + report_path_hash = '' + events = filter(lambda i: i.get('kind') == 'event', report.bug_path) + + for event in events: + file_name = os.path.basename(files[event['location']['file']]) + line = str(event['location']['line']) if 'location' in event else 0 + col = str(event['location']['col']) if 'location' in event else 0 + + report_path_hash += line + '|' + col + '|' + event['message'] + \ + file_name + + if not report_path_hash: + LOG.error('Failed to generate report path hash!') + LOG.error(report) + LOG.error(events) + + LOG.debug(report_path_hash) + return hashlib.md5(report_path_hash.encode()).hexdigest() + + class Report(object): """ Just a minimal separation of the main section diff --git a/libcodechecker/server/api/report_server.py b/libcodechecker/server/api/report_server.py index 7e2e93992e..d6d926ed54 100644 --- a/libcodechecker/server/api/report_server.py +++ b/libcodechecker/server/api/report_server.py @@ -34,6 +34,7 @@ from libcodechecker.analyze import plist_parser from libcodechecker.logger import get_logger from libcodechecker.profiler import timeit +from libcodechecker.report import get_report_path_hash from libcodechecker.server import permissions from libcodechecker.server.database import db_cleanup from libcodechecker.server.database.config_db_model import Product @@ -389,27 +390,6 @@ def sort_results_query(query, sort_types, sort_type_map, order_type_map, return query -def get_report_path_hash(report, files): - report_path_hash = '' - events = filter(lambda i: i.get('kind') == 'event', report.bug_path) - - for event in events: - file_name = os.path.basename(files[event['location']['file']]) - line = str(event['location']['line']) if 'location' in event else 0 - col = str(event['location']['col']) if 'location' in event else 0 - - report_path_hash += line + '|' + col + '|' + event['message'] + \ - file_name - - if not len(report_path_hash): - LOG.error('Failed to generate report path hash!') - LOG.error(report) - LOG.error(events) - - 
LOG.debug(report_path_hash) - return hashlib.md5(report_path_hash.encode()).hexdigest() - - class ThriftRequestHandler(object): """ Connect to database and handle thrift client requests. @@ -1854,8 +1834,7 @@ def __store_reports(self, session, report_dir, source_root, run_id, bug_paths, bug_events = \ store_handler.collect_paths_events(report, file_ids, files) - report_path_hash = get_report_path_hash(report, - files) + report_path_hash = get_report_path_hash(report, files) if report_path_hash in already_added: LOG.debug('Not storing report. Already added') LOG.debug(report) diff --git a/tests/functional/analyze_and_parse/test_files/Makefile b/tests/functional/analyze_and_parse/test_files/Makefile index 8db0ea0b97..fa6035496f 100644 --- a/tests/functional/analyze_and_parse/test_files/Makefile +++ b/tests/functional/analyze_and_parse/test_files/Makefile @@ -16,3 +16,6 @@ saargs_forward: $(CXX) -w -std=c++11 saargs_forward.cpp -o /dev/null source_code_comments: $(CXX) -w source_code_comments.cpp -o /dev/null +deduplication: + $(CXX) -w -DVAR=1 simple1.cpp -o /dev/null + $(CXX) -w -DVAR=2 simple1.cpp -o /dev/null diff --git a/tests/functional/analyze_and_parse/test_files/simple1.deduplication.output b/tests/functional/analyze_and_parse/test_files/simple1.deduplication.output new file mode 100644 index 0000000000..12e90c068e --- /dev/null +++ b/tests/functional/analyze_and_parse/test_files/simple1.deduplication.output @@ -0,0 +1,42 @@ +NORMAL#CodeChecker log --output $LOGFILE$ --build "make deduplication" --quiet +NORMAL#CodeChecker analyze $LOGFILE$ --output $OUTPUT$ --analyzers clangsa +NORMAL#CodeChecker parse $OUTPUT$ +CHECK#CodeChecker check --build "make deduplication" --output $OUTPUT$ --quiet --analyzers clangsa +-------------------------------------------------------------------------------- +[] - Starting build ... +[] - Build finished successfully. +[] - Starting static analysis ... +[] - [1/2] clangsa analyzed simple1.cpp successfully. 
+[] - [2/2] clangsa analyzed simple1.cpp successfully. +[] - ----==== Summary ====---- +[] - Total analyzed compilation commands: 2 +[] - Successfully analyzed +[] - clangsa: 2 +[] - ----=================---- +[] - Analysis finished. +[] - To view results in the terminal use the "CodeChecker parse" command. +[] - To store results use the "CodeChecker store" command. +[] - See --help and the user guide for further options about parsing and storing the reports. +[] - ----=================---- +[HIGH] simple1.cpp:18:15: Division by zero [core.DivideZero] + return 2015 / x; + ^ + +Found 1 defect(s) while analyzing simple1.cpp + +Found no defects while analyzing simple1.cpp + +----==== Summary ====---- +-------------------------- +Filename | Report count +-------------------------- +simple1.cpp | 1 +-------------------------- +----------------------- +Severity | Report count +----------------------- +HIGH | 1 +----------------------- +----=================---- +Total number of reports: 1 +----=================---- diff --git a/tests/projects/cpp/Makefile b/tests/projects/cpp/Makefile index 31f169eff7..8e9c06a2ea 100644 --- a/tests/projects/cpp/Makefile +++ b/tests/projects/cpp/Makefile @@ -10,7 +10,8 @@ all: $(CXX) -c skip_header.cpp $(CXX) -c path_begin1.cpp $(CXX) -c path_begin2.cpp - $(CXX) -c path_begin.cpp + $(CXX) -c -DVAR=2 path_begin.cpp + $(CXX) -c -DVAR=1 path_begin.cpp clean: rm -f call_and_message.o rm -f divide_zero.o diff --git a/tests/unit/test_report_path_hash.py b/tests/unit/test_report_path_hash.py new file mode 100644 index 0000000000..51fc8f39d3 --- /dev/null +++ b/tests/unit/test_report_path_hash.py @@ -0,0 +1,57 @@ +# ----------------------------------------------------------------------------- +# The CodeChecker Infrastructure +# This file is distributed under the University of Illinois Open Source +# License. See LICENSE.TXT for details. 
+# ----------------------------------------------------------------------------- +""" Test report path hash generation. """ + +import os +import unittest + +from libcodechecker.analyze import plist_parser +from libcodechecker.report import get_report_path_hash + + +class ReportPathHashHandler(unittest.TestCase): + """ + Test report path hash generation handler features. + """ + + @classmethod + def setup_class(cls): + # Already generated plist files for the tests. + cls.__this_dir = os.path.dirname(__file__) + cls.__plist_test_files = os.path.join( + cls.__this_dir, 'plist_test_files') + + def test_report_path_hash_generation(self): + """ + Test report path hash generation. + """ + clang50_trunk_plist = os.path.join( + self.__plist_test_files, 'clang-5.0-trunk.plist') + files, reports = plist_parser.parse_plist(clang50_trunk_plist, None, + False) + self.assertEqual(len(reports), 3) + + # Generate dummy file_ids which should come from the database. + file_ids = {} + for i, file_name in enumerate(files, 1): + file_ids[file_name] = i + + msg = "This test is prepared to handle 3 reports." + self.assertEqual(len(reports), 3, msg) + + report_hash_to_path_hash = { + '79e31a6ba028f0b7d9779faf4a6cb9cf': + 'c473c1a55df72ea4c6e055e18370ac65', + '8714f42d8328bc78d5d7bff6ced918cc': + '94f2a6eee8af6462a810218dff35056a', + 'a6d3464f8aab9eb31a8ea7e167e84322': + '11f410136724cf43c63526841007897e' + } + + for report in reports: + path_hash = get_report_path_hash(report, files) + bug_hash = report.main['issue_hash_content_of_line_in_context'] + self.assertEqual(path_hash, report_hash_to_path_hash[bug_hash])