Skip to content

Commit

Permalink
Optimize adding hashes to CkmameDB.
Browse files Browse the repository at this point in the history
  • Loading branch information
dillof committed Apr 19, 2024
1 parent 963cbaf commit fe73d78
Show file tree
Hide file tree
Showing 10 changed files with 129 additions and 35 deletions.
47 changes: 33 additions & 14 deletions src/Archive.cc
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ Archive::Archive(ArchiveContentsPtr contents_) :
name(contents->name),
filetype(contents->filetype),
where(contents->where),
cache_changed(false),
cache_changed(NONE),
modified(false) {
changes.resize(files.size());
}
Expand Down Expand Up @@ -197,6 +197,7 @@ bool Archive::file_ensure_hashes(uint64_t idx, size_t detector_id, int hashtypes
f->open();
} catch (Exception &e) {
output.error("%s: %s: can't open: %s", name.c_str(), file.name.c_str(), e.what());
set_cache_changed(FILES);
file.broken = true;
return false;
}
Expand All @@ -207,11 +208,13 @@ bool Archive::file_ensure_hashes(uint64_t idx, size_t detector_id, int hashtypes

case READ_ERROR:
output.error("%s: %s: can't compute hashes: %s", name.c_str(), file.name.c_str(), strerror(errno));
set_cache_changed(FILES);
file.broken = true;
return false;

case CRC_ERROR:
output.error("%s: %s: CRC error: %08x != %08x", name.c_str(), file.name.c_str(), hashes.crc, file.hashes.crc);
set_cache_changed(FILES);
file.broken = true;
return false;
}
Expand All @@ -223,7 +226,8 @@ bool Archive::file_ensure_hashes(uint64_t idx, size_t detector_id, int hashtypes
return false;
}
}
cache_changed = true;
set_cache_changed(HASHES_ONLY);
changes[idx].updated_hashes.insert(detector_id);

return true;
}
Expand Down Expand Up @@ -388,8 +392,8 @@ ArchivePtr Archive::open_toplevel(const std::string &name, filetype_t filetype,
bool Archive::read_infos() {
std::vector<File> files_cache;

cache_changed = false;
set_cache_changed(NONE);

contents->read_infos_from_cachedb(&files_cache);

if (contents->cache_id > 0) {
Expand All @@ -398,7 +402,7 @@ bool Archive::read_infos() {
return false;

case 0:
cache_changed = true;
set_cache_changed(FILES);
break;

case 1:
Expand All @@ -413,12 +417,12 @@ bool Archive::read_infos() {
}

if (!read_infos_xxx()) {
cache_changed = true;
set_cache_changed(FILES);
return false;
}

merge_files(files_cache);
changes.resize(files.size());
merge_files(files_cache);

return true;
}
Expand Down Expand Up @@ -493,38 +497,46 @@ Archive::GetHashesStatus Archive::get_hashes(ZipSource *source, uint64_t length,


// Reconcile the freshly read entries in `files` with the entries loaded from
// the cache (`files_cache`), matching them by name, and record how much of the
// cache needs rewriting via cache_changed / changes[i].updated_hashes.
//
// NOTE(review): this listing looks like a rendered diff without +/- markers;
// each `cache_changed = true;` line is presumably the removed form of the
// `set_cache_changed(...)` line that follows it — confirm against the real file.
void Archive::merge_files(const std::vector<File> &files_cache) {
// Start from a clean state; the checks below raise the level as needed.
set_cache_changed(NONE);

for (uint64_t i = 0; i < files.size(); i++) {
auto &file = files[i];

file.filename_extension = contents->filename_extension;
// Look up the cached entry with the same name, if any.
auto it = std::find_if(files_cache.cbegin(), files_cache.cend(), [&file](const File &file_cache){ return file.name == file_cache.name; });
if (it != files_cache.cend()) {
if (file.mtime == (*it).mtime && file.compare_size_hashes(*it)) {
// Cached entry still matches. If the file carries no hash type that the
// cached entry lacks, nothing has to be written for it; otherwise mark
// id 0 as updated (update_cache treats id 0 as the file's own hashes).
if ((file.hashes.get_types() & ~(it->hashes.get_types())) == 0) {
changes[i].updated_hashes.clear();
}
else {
changes[i].updated_hashes.insert(0);
}
// Take over the hashes (including detector hashes) stored in the cache.
file.hashes.merge((*it).hashes);
file.detector_hashes = it->detector_hashes;
}
else {
// Same name but mtime or size/hashes differ from the cached entry.
cache_changed = true;
set_cache_changed(FILES);
}
}
else {
// File is not present in the cache at all.
cache_changed = true;
set_cache_changed(FILES);
}

// Compute hashes now if a CRC is wanted but still missing.
if (want_crc() && !file.hashes.has_type(Hashes::TYPE_CRC)) {
if (!file_ensure_hashes(i, Hashes::TYPE_ALL)) {
file.broken = true;
// Only flag a change if the cache did not already record this file as broken.
if (it == files_cache.cend() || !(*it).broken) {
cache_changed = true;
set_cache_changed(FILES);
}
continue;
}
cache_changed = true;
set_cache_changed(HASHES_ONLY);
}
}

// A differing entry count means the cached file list is out of date.
if (files.size() != files_cache.size()) {
cache_changed = true;
set_cache_changed(FILES);
}
}

Expand Down Expand Up @@ -619,7 +631,7 @@ bool Archive::compute_detector_hashes(const std::unordered_map<size_t, DetectorP
}

if (got_new_hashes) {
cache_changed = true;
set_cache_changed(HASHES_ONLY);
}
return got_new_hashes;
}
Expand Down Expand Up @@ -649,7 +661,7 @@ bool Archive::compute_detector_hashes(size_t index, const std::unordered_map<siz
return false;
}

return Detector::compute_hashes(data, &file, detectors);
return Detector::compute_hashes(data, &file, db->detectors, &changes[index].updated_hashes);
}


Expand Down Expand Up @@ -690,3 +702,10 @@ bool Archive::compare_size_hashes(size_t index, size_t detector_id, const FileDa

return ok;
}

// Record the pending cache change level.
// A HASHES_ONLY request never replaces an already pending FILES level; every
// other request (including NONE, which resets the state) is stored as given.
void Archive::set_cache_changed(Archive::CacheChange new_changed) {
    const bool keeps_pending_files = (cache_changed == FILES) && (new_changed == HASHES_ONLY);

    if (!keeps_pending_files) {
        cache_changed = new_changed;
    }
}
11 changes: 10 additions & 1 deletion src/Archive.h
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
#include <optional>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>

Expand Down Expand Up @@ -125,6 +126,11 @@ struct hash<ArchiveContents::TypeAndName> {

class Archive {
public:
// Level of pending change between the in-memory archive and its cache entry.
// Values grow with severity: set_cache_changed() keeps FILES when asked to
// switch to HASHES_ONLY.
enum CacheChange {
    NONE = 0,        // nothing to write
    HASHES_ONLY = 1, // only hashes of files were added or updated
    FILES = 2        // file entries themselves no longer match the cache
};
class Change {
public:
enum Status {
Expand All @@ -140,6 +146,7 @@ class Archive {
std::string source_name;
ZipSourcePtr source;
std::string file;
std::unordered_set<size_t> updated_hashes;
};

static ArchivePtr open(const std::string &name, filetype_t filetype, where_t where, int flags);
Expand Down Expand Up @@ -190,14 +197,16 @@ class Archive {
virtual std::string get_full_filename(uint64_t index) { return ""; }
virtual std::string get_original_filename(uint64_t index) { return ""; }

void set_cache_changed(CacheChange new_changed);

ArchiveContentsPtr contents;
std::vector<File> &files;
std::string &name;
const filetype_t filetype;
const where_t where;
std::vector<Change> changes;

bool cache_changed;
CacheChange cache_changed{NONE};
bool modified;

protected:
Expand Down
1 change: 1 addition & 0 deletions src/ArchiveLibarchive.cc
Original file line number Diff line number Diff line change
Expand Up @@ -330,6 +330,7 @@ bool ArchiveLibarchive::read_infos_xxx() {
r.name = archive_entry_pathname_utf8(entry);
r.broken = false;
files.push_back(r);
changes.emplace_back();

header_read = true;
file_ensure_hashes(current_index, Hashes::TYPE_ALL);
Expand Down
2 changes: 1 addition & 1 deletion src/ArchiveZip.cc
Original file line number Diff line number Diff line change
Expand Up @@ -248,7 +248,7 @@ void ArchiveZip::commit_cleanup() {


void ArchiveZip::get_last_update() {
if (cache_changed) {
if (cache_changed != NONE) {
close_xxx();
}
struct stat st;
Expand Down
32 changes: 30 additions & 2 deletions src/CkmameDB.cc
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,8 @@ std::unordered_map<CkmameDB::Statement, std::string> CkmameDB::queries = {
{ QUERY_ARCHIVE_ID, "select archive_id from archive where name = :name and file_type = :file_type" },
{ QUERY_ARCHIVE_LAST_CHANGE, "select mtime, size from archive where archive_id = :archive_id" },
{ QUERY_FILE, "select file_idx, detector_id, name, mtime, status, size, crc, md5, sha1 from file where archive_id = :archive_id order by file_idx, detector_id" },
{ QUERY_HAS_ARCHIVES, "select archive_id from archive limit 1" }
{ QUERY_HAS_ARCHIVES, "select archive_id from archive limit 1" },
{ UPDATE_FILE_HASHES, "update file set crc = :crc, md5 = :md5, sha1 = :sha1 where archive_id = :archive_id and file_idx = :file_idx and detector_id = 0" }
};

std::unordered_map<CkmameDB::ParameterizedStatement, std::string> CkmameDB::parameterized_queries = {
Expand Down Expand Up @@ -515,4 +516,31 @@ bool CkmameDB::compute_detector_hashes(const std::unordered_map<size_t, Detector
}

return got_new_hashes;
}
}

void CkmameDB::update_file_hashes(int archive_id, size_t file_id, const Hashes& hashes) {
auto stmt = get_statement(UPDATE_FILE_HASHES);

stmt->set_int("archive_id", archive_id);
stmt->set_int("file_idx", static_cast<int>(file_id));
stmt->set_hashes(hashes, true);

stmt->execute();
}

void CkmameDB::insert_file_detector_hashes(int archive_id, size_t file_id, size_t detector_id, const Hashes& hashes) {
auto local_detector_id = get_detector_id(detector_id);

auto stmt = get_statement(INSERT_FILE);

stmt->set_int("archive_id", archive_id);
stmt->set_int("file_idx", static_cast<int>(file_id));
stmt->set_uint64("detector_id", local_detector_id);
stmt->set_string("name", "", true);
stmt->set_int64("mtime", 0);
stmt->set_int("status", 0);
stmt->set_uint64("size", hashes.size);
stmt->set_hashes(hashes, true);

stmt->execute();
}
5 changes: 4 additions & 1 deletion src/CkmameDB.h
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,8 @@ class CkmameDB : public DB {
QUERY_ARCHIVE_ID,
QUERY_ARCHIVE_LAST_CHANGE,
QUERY_FILE,
QUERY_HAS_ARCHIVES
QUERY_HAS_ARCHIVES,
UPDATE_FILE_HASHES
};
enum ParameterizedStatement {
QUERY_FIND_FILE
Expand All @@ -95,6 +96,8 @@ class CkmameDB : public DB {
std::vector<ArchiveLocation> list_archives();
int read_files(int archive_id, std::vector<File> *files);
void write_archive(ArchiveContents *archive);
void update_file_hashes(int archive_id, size_t file_id, const Hashes& hashes);
void insert_file_detector_hashes(int archive_id, size_t file_id, size_t detector_id, const Hashes& hashes);

void find_file(filetype_t filetype, size_t detector_id, const FileData& file, std::vector<FindResult> &results);
bool compute_detector_hashes(const std::unordered_map<size_t, DetectorPtr>& detectors);
Expand Down
3 changes: 2 additions & 1 deletion src/Detector.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
*/

#include <memory>
#include <unordered_set>
#include <vector>

#include "DetectorCollection.h"
Expand Down Expand Up @@ -128,7 +129,7 @@ class Detector {
static const DetectorDescriptor *get_descriptor(size_t id) { return detector_ids.get_descriptor(id); }

// Returns true if new hashes were computed.
static bool compute_hashes(const std::vector<uint8_t> &data, File *file, const std::unordered_map<size_t, DetectorPtr> &detectors);
static bool compute_hashes(const std::vector<uint8_t> &data, File *file, const std::unordered_map<size_t, DetectorPtr> &detectors, std::unordered_set<size_t>* changed = {});

private:
static uint64_t operation_unit_size(Operation operation);
Expand Down
2 changes: 1 addition & 1 deletion src/File.cc
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@
Hashes File::empty_hashes;

// Report whether all `requested_types` of hashes are available for `detector`.
// The first return requires both the file's own hashes and the hashes for
// `detector` to have every requested type; the second checks only the hashes
// for `detector`.
// NOTE(review): this listing looks like a rendered diff, so the two returns are
// presumably the old and new form of the same line; as written, only the first
// is reachable — confirm against the real file.
bool File::has_all_hashes(size_t detector, int requested_types) const {
return hashes.has_all_types(requested_types) && get_hashes(detector).has_all_types(requested_types);
return get_hashes(detector).has_all_types(requested_types);
}


Expand Down
46 changes: 35 additions & 11 deletions src/archive_modify.cc
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ bool Archive::commit() {
if (modified) {
output.set_error_archive(name);

cache_changed = true;
set_cache_changed(FILES);

if (!commit_xxx()) {
return false;
Expand Down Expand Up @@ -86,7 +86,7 @@ bool Archive::commit() {
}

void Archive::update_cache() {
if (!cache_changed) {
if (cache_changed == NONE) {
return;
}

Expand All @@ -109,22 +109,46 @@ void Archive::update_cache() {
}
else {
get_last_update();

try {
contents->cache_db->write_archive(contents.get());

// TODO: check if size/mtime changed

if (contents->cache_id != 0 && cache_changed == HASHES_ONLY) {
for (size_t i = 0; i < changes.size(); i++) {
auto& change = changes[i];
const auto& file = files[i];

if (change.updated_hashes.empty()) {
continue;
}

for (auto detector_id: change.updated_hashes) {
if (detector_id == 0) {
contents->cache_db->update_file_hashes(contents->cache_id, i, file.hashes);
}
else {
contents->cache_db->insert_file_detector_hashes(contents->cache_id, i, detector_id, file.get_hashes(detector_id));
}
}
change.updated_hashes.clear();
}
}
catch (Exception &exception) {
contents->cache_db->seterr();
output.error_database("%s: error writing to %s", name.c_str(), CkmameDB::db_name.c_str());
contents->cache_id = 0;
else {
try {
contents->cache_db->write_archive(contents.get());
}
catch (Exception& exception) {
contents->cache_db->seterr();
output.error_database("%s: error writing to %s", name.c_str(), CkmameDB::db_name.c_str());
contents->cache_id = 0;
}
}
}
}
else {
contents->cache_id = 0;
}

cache_changed = false;
set_cache_changed(NONE);
}


Expand Down Expand Up @@ -260,7 +284,7 @@ bool Archive::file_rename(uint64_t index, const std::string &filename) {
return false;
}
if (changes[index].status != Change::EXISTS) {
output.archive_error("cannot copy broken/added/deleted file");
output.archive_error("cannot rename broken/added/deleted file");
return false;
}

Expand Down
Loading

0 comments on commit fe73d78

Please sign in to comment.