diff --git a/clamd/clamd.c b/clamd/clamd.c index 866746a40d..6ff4ff03b3 100644 --- a/clamd/clamd.c +++ b/clamd/clamd.c @@ -466,6 +466,8 @@ int main(int argc, char **argv) break; } + if ((opt = optget(opts, "cache-size"))->enabled) + cl_engine_set_num(engine, CL_ENGINE_CACHE_SIZE, opt->numarg); if (optget(opts, "disable-cache")->enabled) cl_engine_set_num(engine, CL_ENGINE_DISABLE_CACHE, 1); diff --git a/clamscan/manager.c b/clamscan/manager.c index dc6c4e413c..4b1e4906c3 100644 --- a/clamscan/manager.c +++ b/clamscan/manager.c @@ -1116,6 +1116,8 @@ int scanmanager(const struct optstruct *opts) #endif } + if ((opt = optget(opts, "cache-size"))->enabled) + cl_engine_set_num(engine, CL_ENGINE_CACHE_SIZE, opt->numarg); if (optget(opts, "disable-cache")->enabled) cl_engine_set_num(engine, CL_ENGINE_DISABLE_CACHE, 1); diff --git a/common/optparser.c b/common/optparser.c index c13d56ed0e..ae41aaac3c 100644 --- a/common/optparser.c +++ b/common/optparser.c @@ -248,6 +248,8 @@ const struct clam_option __clam_options[] = { /* config file/cmdline options */ {"AlertExceedsMax", "alert-exceeds-max", 0, CLOPT_TYPE_BOOL, MATCH_BOOL, 0, NULL, 0, OPT_CLAMD | OPT_CLAMSCAN, "", ""}, + {"CacheSize", "cache-size", 0, CLOPT_TYPE_NUMBER, MATCH_NUMBER, CLI_DEFAULT_CACHE_SIZE, NULL, 0, OPT_CLAMD | OPT_CLAMSCAN, "Number of entries the cache can store.", "65536"}, + {"PreludeEnable", "prelude-enable", 0, CLOPT_TYPE_BOOL, MATCH_BOOL, 0, NULL, 0, OPT_CLAMD, "Enable prelude", ""}, {"PreludeAnalyzerName", "prelude-analyzer-name", 0, CLOPT_TYPE_STRING, NULL, -1, NULL, 0, OPT_CLAMD, "Name of the analyzer as seen in prewikka", ""}, diff --git a/docs/man/clamd.conf.5.in b/docs/man/clamd.conf.5.in index 4d3ea8bc6c..4dcb19a3f9 100644 --- a/docs/man/clamd.conf.5.in +++ b/docs/man/clamd.conf.5.in @@ -543,6 +543,11 @@ By default, the engine will store an MD5 in a cache of any files that are not fl .br Default: no .TP +\fBCacheSize\fR +This option allows you to set the number of entries the cache can 
store. The value should be a square number; otherwise it will be rounded up to the next square number. +.br +Default: 65536 +.TP \fBForceToDisk\fR This option causes memory or nested map scans to dump the content to disk. .br diff --git a/etc/clamd.conf.sample b/etc/clamd.conf.sample index 8cf370c600..86b3442af0 100644 --- a/etc/clamd.conf.sample +++ b/etc/clamd.conf.sample @@ -282,6 +282,11 @@ Example # Default: no #DisableCache yes +# This option allows you to set the number of entries the cache can store. +# The value should be a square number; otherwise it will be rounded up to +# the next square number. +#CacheSize 65536 + # In some cases (eg. complex malware, exploits in graphic files, and others), # ClamAV uses special algorithms to detect abnormal patterns and behaviors that # may be malicious. This option enables alerting on such heuristically diff --git a/libclamav/cache.c b/libclamav/cache.c index 5047e6e421..0fe873406a 100644 --- a/libclamav/cache.c +++ b/libclamav/cache.c @@ -31,30 +31,26 @@ #include "mpool.h" #include "clamav.h" #include "cache.h" +#include "math.h" #include "fmap.h" #include "clamav_rust.h" -/* The number of root trees and the chooser function +/* The chooser function Each tree is protected by a mutex against concurrent access */ -/* #define TREES 1 */ -/* static inline unsigned int getkey(uint8_t *hash) { return 0; } */ -#define TREES 256 -static inline unsigned int getkey(uint8_t *hash) +static inline unsigned int getkey(uint8_t *hash, size_t trees) { if (hash) { - return *hash; + // Take the first two bytes (16 bits) of the hash, which give 65536 possible values, + // and reduce that modulo the number of trees desired. + // As long as trees < 65536 and the hash is uniformly distributed, + // the resulting key will be a good value to use as a bucket identifier + // for evenly placing values. 
+ return (hash[0] | (((unsigned int)hash[1]) << 8)) % trees; } return 0; } -/* #define TREES 4096 */ -/* static inline unsigned int getkey(uint8_t *hash) { return hash[0] | ((unsigned int)(hash[1] & 0xf)<<8) ; } */ -/* #define TREES 65536 */ -/* static inline unsigned int getkey(uint8_t *hash) { return hash[0] | (((unsigned int)hash[1])<<8) ; } */ - -/* The number of nodes in each tree */ -#define NODES 256 /* SPLAY --------------------------------------------------------------------- */ struct node { /* a node */ @@ -77,13 +73,15 @@ struct cache_set { /* a tree */ struct CACHE { struct cache_set cacheset; + uint32_t trees; + uint32_t nodes_per_tree; #ifdef CL_THREAD_SAFE pthread_mutex_t mutex; #endif }; /* Allocates all the nodes and sets up the replacement chain */ -static int cacheset_init(struct cache_set *cs, mpool_t *mempool) +static int cacheset_init(struct cache_set *cs, mpool_t *mempool, uint32_t nodes_per_tree) { unsigned int i; @@ -91,19 +89,19 @@ static int cacheset_init(struct cache_set *cs, mpool_t *mempool) UNUSEDPARAM(mempool); #endif - cs->data = MPOOL_CALLOC(mempool, NODES, sizeof(*cs->data)); + cs->data = MPOOL_CALLOC(mempool, nodes_per_tree, sizeof(*cs->data)); cs->root = NULL; if (!cs->data) return 1; - for (i = 1; i < NODES; i++) { + for (i = 1; i < nodes_per_tree; i++) { cs->data[i - 1].next = &cs->data[i]; cs->data[i].prev = &cs->data[i - 1]; } cs->first = cs->data; - cs->last = &cs->data[NODES - 1]; + cs->last = &cs->data[nodes_per_tree - 1]; return 0; } @@ -540,7 +538,7 @@ static int cache_lookup_hash(unsigned char *md5, size_t len, struct CACHE *cache return ret; } - key = getkey(md5); + key = getkey(md5, cache->trees); c = &cache[key]; @@ -575,12 +573,24 @@ int clean_cache_init(struct cl_engine *engine) return 0; } - if (!(cache = MPOOL_MALLOC(engine->mempool, sizeof(struct CACHE) * TREES))) { + // The user requested the cache size to be engine->cache_size + // The nodes within each tree are locked together, so having one tree would 
result in excessive lock contention. + // However, having too many trees is inefficient. + // A good balance is to have trees and nodes per tree be equal, which is done by using the sqrt of the user request cache size. + const uint32_t trees = ceil(sqrt(engine->cache_size)); + const uint32_t nodes_per_tree = ceil(sqrt(engine->cache_size)); + + cli_dbgmsg("clean_cache_init: Requested cache size: %d. Actual cache size: %d. Trees: %d. Nodes per tree: %d.\n", engine->cache_size, trees * nodes_per_tree, trees, nodes_per_tree); + + if (!(cache = MPOOL_MALLOC(engine->mempool, sizeof(struct CACHE) * trees))) { cli_errmsg("clean_cache_init: mpool malloc fail\n"); return 1; } - for (i = 0; i < TREES; i++) { + cache->trees = trees; + cache->nodes_per_tree = nodes_per_tree; + + for (i = 0; i < trees; i++) { #ifdef CL_THREAD_SAFE if (pthread_mutex_init(&cache[i].mutex, NULL)) { cli_errmsg("clean_cache_init: mutex init fail\n"); @@ -590,7 +600,7 @@ int clean_cache_init(struct cl_engine *engine) return 1; } #endif - if (cacheset_init(&cache[i].cacheset, engine->mempool)) { + if (cacheset_init(&cache[i].cacheset, engine->mempool, cache->nodes_per_tree)) { for (j = 0; j < i; j++) cacheset_destroy(&cache[j].cacheset, engine->mempool); #ifdef CL_THREAD_SAFE for (j = 0; j <= i; j++) pthread_mutex_destroy(&cache[j].mutex); @@ -615,7 +625,7 @@ void clean_cache_destroy(struct cl_engine *engine) return; } - for (i = 0; i < TREES; i++) { + for (i = 0; i < cache->trees; i++) { cacheset_destroy(&cache[i].cacheset, engine->mempool); #ifdef CL_THREAD_SAFE pthread_mutex_destroy(&cache[i].mutex); @@ -667,7 +677,7 @@ void clean_cache_add(unsigned char *md5, size_t size, cli_ctx *ctx) level = (ctx->fmap && ctx->fmap->dont_cache_flag) ? 
ctx->recursion_level : 0; - key = getkey(md5); + key = getkey(md5, ctx->engine->cache->trees); c = &ctx->engine->cache[key]; #ifdef CL_THREAD_SAFE @@ -709,7 +719,7 @@ void clean_cache_remove(unsigned char *md5, size_t size, const struct cl_engine return; } - key = getkey(md5); + key = getkey(md5, engine->cache->trees); c = &engine->cache[key]; #ifdef CL_THREAD_SAFE diff --git a/libclamav/clamav.h b/libclamav/clamav.h index a3b187ebbe..f4ba496e4c 100644 --- a/libclamav/clamav.h +++ b/libclamav/clamav.h @@ -301,6 +301,7 @@ enum cl_engine_field { CL_ENGINE_MAX_SCRIPTNORMALIZE, /* uint64_t */ CL_ENGINE_MAX_ZIPTYPERCG, /* uint64_t */ CL_ENGINE_FORCETODISK, /* uint32_t */ + CL_ENGINE_CACHE_SIZE, /* uint32_t */ CL_ENGINE_DISABLE_CACHE, /* uint32_t */ CL_ENGINE_DISABLE_PE_STATS, /* uint32_t */ CL_ENGINE_STATS_TIMEOUT, /* uint32_t */ diff --git a/libclamav/default.h b/libclamav/default.h index 3855e709e7..fc3c04a972 100644 --- a/libclamav/default.h +++ b/libclamav/default.h @@ -51,6 +51,8 @@ #define CLI_DEFAULT_MAXPARTITIONS 50 +#define CLI_DEFAULT_CACHE_SIZE 65536 + /* TODO - set better defaults */ #define CLI_DEFAULT_PCRE_MATCH_LIMIT 100000 #define CLI_DEFAULT_PCRE_RECMATCH_LIMIT 2000 diff --git a/libclamav/others.c b/libclamav/others.c index 5eeba0a074..dd7adba60b 100644 --- a/libclamav/others.c +++ b/libclamav/others.c @@ -473,6 +473,7 @@ struct cl_engine *cl_engine_new(void) new->maxhtmlnotags = CLI_DEFAULT_MAXHTMLNOTAGS; new->maxscriptnormalize = CLI_DEFAULT_MAXSCRIPTNORMALIZE; new->maxziptypercg = CLI_DEFAULT_MAXZIPTYPERCG; + new->cache_size = CLI_DEFAULT_CACHE_SIZE; new->bytecode_security = CL_BYTECODE_TRUST_SIGNED; /* 5 seconds timeout */ @@ -730,6 +731,11 @@ cl_error_t cl_engine_set_num(struct cl_engine *engine, enum cl_engine_field fiel clean_cache_init(engine); } break; + case CL_ENGINE_CACHE_SIZE: + if (num) { + engine->cache_size = (uint32_t)num; + } + break; case CL_ENGINE_DISABLE_PE_STATS: if (num) { engine->engine_options |= ENGINE_OPTIONS_DISABLE_PE_STATS; 
@@ -846,6 +852,8 @@ long long cl_engine_get_num(const struct cl_engine *engine, enum cl_engine_field return engine->bytecode_mode; case CL_ENGINE_DISABLE_CACHE: return engine->engine_options & ENGINE_OPTIONS_DISABLE_CACHE; + case CL_ENGINE_CACHE_SIZE: + return engine->cache_size; case CL_ENGINE_STATS_TIMEOUT: return ((cli_intel_t *)(engine->stats_data))->timeout; case CL_ENGINE_MAX_PARTITIONS: @@ -976,6 +984,7 @@ struct cl_settings *cl_engine_settings_copy(const struct cl_engine *engine) settings->cb_meta = engine->cb_meta; settings->cb_file_props = engine->cb_file_props; settings->engine_options = engine->engine_options; + settings->cache_size = engine->cache_size; settings->cb_stats_add_sample = engine->cb_stats_add_sample; settings->cb_stats_remove_sample = engine->cb_stats_remove_sample; @@ -1020,6 +1029,7 @@ cl_error_t cl_engine_settings_apply(struct cl_engine *engine, const struct cl_se engine->bytecode_timeout = settings->bytecode_timeout; engine->bytecode_mode = settings->bytecode_mode; engine->engine_options = settings->engine_options; + engine->cache_size = settings->cache_size; if (engine->tmpdir) MPOOL_FREE(engine->mempool, engine->tmpdir); diff --git a/libclamav/others.h b/libclamav/others.h index 8ee30f5340..c43a590d83 100644 --- a/libclamav/others.h +++ b/libclamav/others.h @@ -327,6 +327,7 @@ struct cl_engine { char *tmpdir; uint32_t keeptmp; uint64_t engine_options; + uint32_t cache_size; /* Limits */ uint32_t maxscantime; /* Time limit (in milliseconds) */ @@ -492,6 +493,7 @@ struct cl_settings { enum bytecode_mode bytecode_mode; char *pua_cats; uint64_t engine_options; + uint32_t cache_size; /* callbacks */ clcb_pre_cache cb_pre_cache; diff --git a/libfreshclam/libfreshclam.c b/libfreshclam/libfreshclam.c index 96d8bd8a1c..c3a560e04b 100644 --- a/libfreshclam/libfreshclam.c +++ b/libfreshclam/libfreshclam.c @@ -432,6 +432,10 @@ fc_error_t fc_test_database(const char *dbFilename, int bBytecodeEnabled) goto done; } + // Disable cache as testing the 
database doesn't need caching; + // a cache would only waste time and memory. + engine->engine_options |= ENGINE_OPTIONS_DISABLE_CACHE; + cl_engine_set_clcb_stats_submit(engine, NULL); if (CL_SUCCESS != (cl_ret = cl_load( diff --git a/win32/conf_examples/clamd.conf.sample b/win32/conf_examples/clamd.conf.sample index 272a661740..ad44b6f1d3 100644 --- a/win32/conf_examples/clamd.conf.sample +++ b/win32/conf_examples/clamd.conf.sample @@ -255,6 +255,11 @@ TCPAddr localhost # Default: no #DisableCache yes +# This option allows you to set the number of entries the cache can store. +# The value should be a square number; otherwise it will be rounded up to +# the next square number. +#CacheSize 65536 + # In some cases (eg. complex malware, exploits in graphic files, and others), # ClamAV uses special algorithms to detect abnormal patterns and behaviors that # may be malicious. This option enables alerting on such heuristically