Group reports only by bug hash when uniqueing #1121

csordasmarton · 2017-11-14T12:59:33Z

No description provided.

csordasmarton · 2017-11-14T13:01:54Z

I did some measurements on these queries:

-- getRunReportCounts
-- Report totals per run. Both variants join files and review_statuses but
-- reference none of their columns in the SELECT, GROUP BY or ORDER BY.
-- NOTE(review): runs.name is selected and ordered on but is not listed in
-- GROUP BY; that is only accepted by engines that treat it as functionally
-- dependent on runs.id (e.g. when id is the primary key -- schema not shown).
-- Unique, TIME: 1487ms
-- Counts each distinct bug hash once per run.
SELECT runs.id AS runs_id, runs.name AS runs_name, count(DISTINCT reports.bug_id) AS count_1 
FROM reports
LEFT OUTER JOIN files ON reports.file_id = files.id
LEFT OUTER JOIN review_statuses ON review_statuses.bug_hash = reports.bug_id
LEFT OUTER JOIN runs ON reports.run_id = runs.id
GROUP BY runs.id
ORDER BY runs.name

-- Non-unique, TIME: 318ms
-- Counts every joined row per run. NOTE(review): this equals the number of
-- reports only if review_statuses has at most one row per bug_hash -- confirm.
SELECT runs.id AS runs_id, runs.name AS runs_name, count(*) AS count_1 
FROM reports
LEFT OUTER JOIN files ON reports.file_id = files.id
LEFT OUTER JOIN review_statuses ON review_statuses.bug_hash = reports.bug_id
LEFT OUTER JOIN runs ON reports.run_id = runs.id
GROUP BY runs.id
ORDER BY runs.name

-- getRunResultCount
-- Total number of results; no filter conditions are applied in either variant.
-- Unique, TIME: 256ms
-- The subquery yields one row per bug hash (GROUP BY), and the outer query
-- counts those rows.
SELECT count(*)
FROM (
  SELECT reports.bug_id AS reports_bug_id 
  FROM reports
  LEFT OUTER JOIN files ON reports.file_id = files.id
  LEFT OUTER JOIN review_statuses ON review_statuses.bug_hash = reports.bug_id
  GROUP BY reports.bug_id
) as anon

-- Non-unique, TIME: 134ms
-- Same shape without the GROUP BY: counts every joined report row.
SELECT count(*)
FROM (
  SELECT reports.id AS id 
  FROM reports
  LEFT OUTER JOIN files ON reports.file_id = files.id
  LEFT OUTER JOIN review_statuses ON review_statuses.bug_hash = reports.bug_id
) as anon

-- getRunResults
-- Fetches report rows, with their review status and file columns, for the
-- first 500 unique bug hashes. Only the unique variant was recorded here.
-- Unique, TIME: 477ms
SELECT reports.id AS reports_id, reports.bug_id AS reports_bug_id, reports.checker_message AS reports_checker_message,
  reports.checker_id AS reports_checker_id, reports.severity AS reports_severity, review_statuses.bug_hash AS review_statuses_bug_hash,
  review_statuses.status AS review_statuses_status, review_statuses.author AS review_statuses_author, review_statuses.message AS review_statuses_message,
  review_statuses.date AS review_statuses_date, files.filename AS files_filename, files.filepath AS files_filepath 
FROM reports
LEFT OUTER JOIN files ON reports.file_id = files.id
LEFT OUTER JOIN review_statuses ON review_statuses.bug_hash = reports.bug_id
LEFT OUTER JOIN (
  SELECT anon_2.id AS id 
  FROM (
    -- One row per bug hash. The two max() aggregates are independent, so the
    -- id and the severity may come from different reports sharing that hash.
    SELECT max(reports.id) AS id, max(reports.severity) AS severity 
    FROM reports
    LEFT OUTER JOIN files ON reports.file_id = files.id
    LEFT OUTER JOIN review_statuses ON review_statuses.bug_hash = reports.bug_id 
    -- Keeps reports marked 'unreviewed' and reports with no review_statuses row.
    WHERE review_statuses.status IN ('unreviewed') OR review_statuses.status IS NULL
    GROUP BY reports.bug_id
  ) AS anon_2
  -- Ties on severity are ordered arbitrarily, so which ids fall inside the
  -- LIMIT is not fully deterministic.
  ORDER BY anon_2.severity DESC
  LIMIT 500 OFFSET 0
) AS anon_1 ON anon_1.id = reports.id
-- Drops every report that is not one of the selected ids, so the LEFT OUTER
-- JOIN on anon_1 behaves like an inner join. The outer query has no ORDER BY.
WHERE anon_1.id IS NOT NULL


-- getCheckerCounts
-- Report counts per checker. The two variants return different shapes: the
-- unique one yields one row per checker_id, the non-unique one yields one row
-- per (checker_id, severity) pair.
-- Unique, TIME: 1523ms
-- The subquery collapses reports to one row per bug hash, taking
-- max(checker_id) and max(severity) independently; the outer query then
-- counts bug hashes per checker_id.
SELECT anon_1.checker_id AS anon_1_checker_id, max(anon_1.severity) AS max_1, count(anon_1.bug_id) AS count_1 
FROM (
  SELECT max(reports.checker_id) AS checker_id, max(reports.severity) AS severity, reports.bug_id AS bug_id 
  FROM reports
  LEFT OUTER JOIN files ON reports.file_id = files.id
  LEFT OUTER JOIN review_statuses ON review_statuses.bug_hash = reports.bug_id
  GROUP BY reports.bug_id
) AS anon_1
GROUP BY anon_1.checker_id
ORDER BY anon_1.checker_id

-- Non-unique, TIME: 263ms
-- Counts report ids directly, grouped by checker_id and severity.
SELECT reports.checker_id AS reports_checker_id, reports.severity AS reports_severity, count(reports.id) AS count_1 
FROM reports
LEFT OUTER JOIN files ON reports.file_id = files.id
LEFT OUTER JOIN review_statuses ON review_statuses.bug_hash = reports.bug_id
GROUP BY reports.checker_id, reports.severity
ORDER BY reports.checker_id

-- getSeverityCounts
-- Report counts per severity value. Neither variant has an ORDER BY, so the
-- order of the result rows is unspecified.
-- Unique, TIME: 287ms
-- The subquery keeps one row per bug hash with max(severity) over the reports
-- sharing that hash; the outer query counts bug hashes per severity.
SELECT anon_1.severity AS anon_1_severity, count(anon_1.bug_id) AS count_1 
FROM (
  SELECT max(reports.severity) AS severity, reports.bug_id AS bug_id 
  FROM reports
  LEFT OUTER JOIN files ON reports.file_id = files.id
  LEFT OUTER JOIN review_statuses ON review_statuses.bug_hash = reports.bug_id
  GROUP BY reports.bug_id
) AS anon_1
GROUP BY anon_1.severity

-- Non-unique, TIME: 215ms
-- Counts report ids per severity with no collapsing by bug hash.
SELECT reports.severity AS reports_severity, count(reports.id) AS count_1 
FROM reports
LEFT OUTER JOIN files ON reports.file_id = files.id
LEFT OUTER JOIN review_statuses ON review_statuses.bug_hash = reports.bug_id
GROUP BY reports.severity

-- getCheckerMsgCounts
-- Report counts per checker message; both variants group and sort on the
-- message column.
-- Unique, TIME: 3954ms, ROWS: 20115
-- The subquery keeps one row per bug hash with max(checker_message) over the
-- reports sharing that hash; the outer query counts bug hashes per message.
SELECT anon_1.checker_message AS anon_1_checker_message, count(anon_1.bug_id) AS count_1 
FROM (
  SELECT max(reports.checker_message) AS checker_message, reports.bug_id AS bug_id 
  FROM reports
  LEFT OUTER JOIN files ON reports.file_id = files.id
  LEFT OUTER JOIN review_statuses ON review_statuses.bug_hash = reports.bug_id
  GROUP BY reports.bug_id
) AS anon_1
GROUP BY anon_1.checker_message
ORDER BY anon_1.checker_message

-- Non-unique, TIME: 2451ms
-- Counts report ids per checker message with no collapsing by bug hash.
SELECT reports.checker_message AS reports_checker_message, count(reports.id) AS count_1 
FROM reports
LEFT OUTER JOIN files ON reports.file_id = files.id
LEFT OUTER JOIN review_statuses ON review_statuses.bug_hash = reports.bug_id
GROUP BY reports.checker_message
ORDER BY reports.checker_message

-- getReviewStatusCounts
-- Report counts per review status. Reports without a review_statuses row end
-- up in the NULL status group because of the LEFT OUTER JOIN. Both variants
-- also return max(bug_id) per group as max_1.
-- Unique, TIME: 338ms
-- The subquery keeps one row per bug hash with max(status); the outer query
-- counts bug hashes per status.
SELECT max(anon_1.bug_id) AS max_1, anon_1.status AS anon_1_status, count(anon_1.bug_id) AS count_1 
FROM (
  SELECT reports.bug_id AS bug_id, max(review_statuses.status) AS status 
  FROM reports
  LEFT OUTER JOIN files ON reports.file_id = files.id
  LEFT OUTER JOIN review_statuses ON review_statuses.bug_hash = reports.bug_id
  GROUP BY reports.bug_id
) AS anon_1
GROUP BY anon_1.status

-- Non-unique, TIME: 313ms
-- Counts report ids per status with no collapsing by bug hash.
SELECT max(reports.bug_id) AS max_1, review_statuses.status AS review_statuses_status, count(reports.id) AS count_1 
FROM reports
LEFT OUTER JOIN files ON reports.file_id = files.id
LEFT OUTER JOIN review_statuses ON review_statuses.bug_hash = reports.bug_id
GROUP BY review_statuses.status

-- getFileCounts
-- Report counts per file. Unlike the other queries, files is inner-joined to
-- the aggregated counts, so files without reports are not returned.
-- NOTE(review): the two variants differ in more than uniqueing -- the unique
-- one selects and sorts on files.filename, the non-unique one on
-- files.filepath -- so their timings are not a like-for-like comparison.
-- Unique, TIME: 772ms
-- anon_2 keeps one row per bug hash with max(file_id); anon_1 then counts
-- those rows per file.
SELECT files.filename AS files_filename, anon_1.report_count AS anon_1_report_count 
FROM files JOIN (
  SELECT anon_2.file_id AS file_id, count(1) AS report_count 
  FROM (
    SELECT reports.bug_id, max(reports.file_id) AS file_id 
    FROM reports
    LEFT OUTER JOIN review_statuses ON review_statuses.bug_hash = reports.bug_id
    GROUP BY reports.bug_id
  ) AS anon_2
  GROUP BY anon_2.file_id
) AS anon_1 ON anon_1.file_id = files.id
ORDER BY files.filename

-- Non-unique, TIME: 1441ms, ROWS: 10033
-- anon_2 passes every joined report row through; anon_1 counts rows per file.
SELECT files.filepath AS files_filepath, anon_1.report_count AS anon_1_report_count 
FROM files JOIN (
  SELECT anon_2.file_id AS file_id, count(1) AS report_count 
  FROM (
    SELECT reports.bug_id AS bug_id, reports.file_id AS file_id 
    FROM reports
    LEFT OUTER JOIN review_statuses ON review_statuses.bug_hash = reports.bug_id
  ) AS anon_2
  GROUP BY anon_2.file_id
) AS anon_1 ON anon_1.file_id = files.id
ORDER BY files.filepath

-- getRunHistoryTagCounts
-- Per run: the tagged run history entry with the highest id, plus a report
-- count for that run. The two variants differ only in how anon_2 counts.
-- Unique, TIME: 1451ms
SELECT anon_1.run_id AS anon_1_run_id, max(runs.name) AS run_name, max(run_histories.time) AS max_1, max(run_histories.version_tag) AS max_2, sum(anon_2.report_count) AS sum_1 
FROM (
  -- Highest run_histories.id per run among entries that have a version tag
  -- (presumably the most recent tagged entry -- confirm id ordering).
  SELECT run_histories.run_id AS run_id, max(run_histories.id) AS run_history_id 
  FROM run_histories 
  WHERE run_histories.version_tag IS NOT NULL
  GROUP BY run_histories.run_id
) AS anon_1
LEFT OUTER JOIN run_histories ON run_histories.id = anon_1.run_history_id
LEFT OUTER JOIN runs ON runs.id = anon_1.run_id
LEFT OUTER JOIN (
  -- Distinct bug hashes per run.
  SELECT reports.run_id AS run_id, count(DISTINCT reports.bug_id) AS report_count 
  FROM reports
  LEFT OUTER JOIN files ON reports.file_id = files.id
  LEFT OUTER JOIN review_statuses ON review_statuses.bug_hash = reports.bug_id
  GROUP BY reports.run_id
) AS anon_2 ON anon_2.run_id = run_histories.run_id 
-- Filtering on the left-joined run_histories also removes any row for which
-- that join found no match.
WHERE run_histories.version_tag IS NOT NULL
GROUP BY anon_1.run_id
ORDER BY run_name

-- Non-unique, TIME: 278ms
SELECT anon_1.run_id AS anon_1_run_id, max(runs.name) AS run_name, max(run_histories.time) AS max_1, max(run_histories.version_tag) AS max_2, sum(anon_2.report_count) AS sum_1 
FROM (
  -- Highest run_histories.id per run among entries that have a version tag
  -- (presumably the most recent tagged entry -- confirm id ordering).
  SELECT run_histories.run_id AS run_id, max(run_histories.id) AS run_history_id 
  FROM run_histories 
  WHERE run_histories.version_tag IS NOT NULL
  GROUP BY run_histories.run_id
) AS anon_1
LEFT OUTER JOIN run_histories ON run_histories.id = anon_1.run_history_id
LEFT OUTER JOIN runs ON runs.id = anon_1.run_id
LEFT OUTER JOIN (
  -- Every joined report row per run.
  SELECT reports.run_id AS run_id, count(*) AS report_count 
  FROM reports
  LEFT OUTER JOIN files ON reports.file_id = files.id
  LEFT OUTER JOIN review_statuses ON review_statuses.bug_hash = reports.bug_id
  GROUP BY reports.run_id
) AS anon_2 ON anon_2.run_id = run_histories.run_id 
-- Filtering on the left-joined run_histories also removes any row for which
-- that join found no match.
WHERE run_histories.version_tag IS NOT NULL
GROUP BY anon_1.run_id
ORDER BY run_name

gyorb · 2017-11-14T14:15:47Z

tests/functional/report_viewer_api/test_report_filter.py

@@ -303,6 +303,6 @@ def test_uniqueing_compared_to_test_config(self):
        unique_bugs = set()
        # Uniqueing is done based on file name, line number, and hash.


Please update comment here.

gyorb · 2017-11-14T14:17:12Z

libcodechecker/server/api/report_server.py


-                    review_data = create_review_data(review_status)
+                if cmp_data:
+                    q = q.filter(Report.bug_id.in_(diff_hashes))


Do we use the file name when calculating the diff between two runs?

No, we don't use the file name in diff. We use only bug hash.

Xazax-hun · 2017-11-14T15:22:29Z

libcodechecker/server/api/report_server.py

-
-    return filter_report_filter(q, filter_expression, run_ids, cmp_data,
-                                diff_hashes)
+def group_by_unique_fields(q):


Maybe it is just me, but I find the name of this function confusing. Some documentation might help. So is bug_id the only unique field in the reports table?

- Increase performance of the filter queries.
- Unique the filters and reports by bug hash only.

csordasmarton added enhancement 🌟 database 🗄️ Issues related to the database schema. labels Nov 14, 2017

csordasmarton added this to the release 6.2 milestone Nov 14, 2017

csordasmarton requested review from dkrupp, gyorb and bruntib November 14, 2017 12:59

gyorb suggested changes Nov 14, 2017

View reviewed changes

csordasmarton force-pushed the group_by_bughash branch 2 times, most recently from a829b86 to e4455e3 Compare November 14, 2017 14:24

Xazax-hun reviewed Nov 14, 2017

View reviewed changes

Group reports only by bug hash when uniqueing

91cd8c3

- Increase performance of the filter queries.
- Unique the filters and reports by bug hash only.

csordasmarton force-pushed the group_by_bughash branch from e4455e3 to 91cd8c3 Compare November 14, 2017 17:25

bruntib merged commit 28d4acb into Ericsson:master Nov 14, 2017

This was referenced Nov 16, 2017

Mismatch between filter result count and number of listed reports #1093

Closed

Performance of report filters #1038

Closed

csordasmarton deleted the group_by_bughash branch November 16, 2017 09:13

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Group reports only by bug hash when uniqueing #1121

Group reports only by bug hash when uniqueing #1121

csordasmarton commented Nov 14, 2017

csordasmarton commented Nov 14, 2017

gyorb Nov 14, 2017

gyorb Nov 14, 2017

csordasmarton Nov 14, 2017

Xazax-hun Nov 14, 2017

		@@ -303,6 +303,6 @@ def test_uniqueing_compared_to_test_config(self):
		unique_bugs = set()
		# Uniqueing is done based on file name, line number, and hash.

Group reports only by bug hash when uniqueing #1121

Group reports only by bug hash when uniqueing #1121

Conversation

csordasmarton commented Nov 14, 2017

csordasmarton commented Nov 14, 2017

gyorb Nov 14, 2017

Choose a reason for hiding this comment

gyorb Nov 14, 2017

Choose a reason for hiding this comment

csordasmarton Nov 14, 2017

Choose a reason for hiding this comment

Xazax-hun Nov 14, 2017

Choose a reason for hiding this comment