Skip to content

Commit

Permalink
fixup! Do not detect clone entry as duplicated content.
Browse files Browse the repository at this point in the history
  • Loading branch information
mgautierfr committed Dec 18, 2023
1 parent 3289a6b commit 3a1b0ff
Showing 1 changed file with 16 additions and 9 deletions.
25 changes: 16 additions & 9 deletions src/zimcheck/checks.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
#include <mutex>
#include <thread>
#include <queue>
#include <optional>
#include <zim/archive.h>
#include <zim/item.h>

Expand Down Expand Up @@ -350,7 +351,7 @@ class ArticleChecker

// All articles with the same hash will be recorded in the same bucket of
// this hash table.
std::map<unsigned int, std::list<std::tuple<zim::entry_index_type, zim::cluster_index_type, zim::blob_index_type>>> hash_main;
std::map<unsigned int, std::list<zim::entry_index_type>> hash_main;

zim::ConcurrentCache<std::string, bool> linkStatusCache;
};
Expand Down Expand Up @@ -387,7 +388,7 @@ void ArticleChecker::check_item(const zim::Item& item)
data = item.getData();

if(checks.isEnabled(TestType::REDUNDANT))
hash_main[adler32(data)].push_back( {item.getIndex(), item.getClusterIndex(), item.getBlobIndex()} );
hash_main[adler32(data)].push_back( item.getIndex() );

if (item.getMimetype() != "text/html")
return;
Expand Down Expand Up @@ -487,20 +488,26 @@ void ArticleChecker::detect_redundant_articles()
progress.report();
auto l = it.second;
while ( !l.empty() ) {
const auto [e1_idx, e1_cluster_idx, e1_blob_idx] = l.front();
// The way we have constructed `l`, e1 MUST BE an item
const auto e1 = archive.getEntryByPath(l.front()).getItem();
l.pop_front();
const auto e1 = archive.getEntryByPath(e1_idx);
const auto e1_cluster_idx = e1.getClusterIndex();
const auto e1_blob_idx = e1.getBlobIndex();
if ( !l.empty() ) {
// The way we have constructed `l`, e1 MUST BE an item
const std::string s1 = e1.getItem().getData();
std::optional<std::string> s1;
decltype(l) articlesDifferentFromE1;
for(auto other : l) {
const auto [e2_idx, e2_cluster_idx, e2_blob_idx] = other;
// The way we have constructed `l`, e2 MUST BE an item
const auto e2 = archive.getEntryByPath(other).getItem();
const auto e2_cluster_idx = e2.getClusterIndex();
const auto e2_blob_idx = e2.getBlobIndex();
if (e1_cluster_idx == e2_cluster_idx && e1_blob_idx == e2_blob_idx) {
continue;

Check warning on line 505 in src/zimcheck/checks.cpp

View check run for this annotation

Codecov / codecov/patch

src/zimcheck/checks.cpp#L505

Added line #L505 was not covered by tests
}
auto e2 = archive.getEntryByPath(e2_idx);
std::string s2 = e2.getItem().getData();
if (!s1) {
s1 = e1.getData();
}
std::string s2 = e2.getData();
if (s1 != s2 ) {
articlesDifferentFromE1.push_back(other);
continue;
Expand Down

0 comments on commit 3a1b0ff

Please sign in to comment.