From 81192d48f454a044d6974e52c627e65d6ac37d01 Mon Sep 17 00:00:00 2001
From: Shivshankar
Date: Fri, 27 Sep 2024 15:20:47 -0400
Subject: [PATCH 01/51] Change return value of aeTimeProc callback function to long long. (#1057)

moduleTimerHandler is an aeTimeProc handler and the event loop is created with it. However, the function's declared return type is int while it actually returns a "long long" value (i.e., next_period), and that return value is assigned to an int variable in processTimeEvents (where time events are processed), which might overflow the timer values. So the return type of the function is changed to long long, and the return types of the other callback functions are updated to be consistent.

I found this while checking the functions reported in the stacktrace of https://github.com/valkey-io/valkey/issues/1054. (FYI, this only makes the return types consistent; it is not the fix for the reported issue.)

Signed-off-by: Shivshankar-Reddy
--- src/ae.c | 2 +- src/ae.h | 2 +- src/evict.c | 2 +- src/module.c | 2 +- src/rdma.c | 2 +- src/server.c | 2 +- src/valkey-benchmark.c | 4 ++-- 7 files changed, 8 insertions(+), 8 deletions(-)
diff --git a/src/ae.c b/src/ae.c index f91d0b7e8a..9bf8619902 100644 --- a/src/ae.c +++ b/src/ae.c @@ -363,7 +363,7 @@ static int processTimeEvents(aeEventLoop *eventLoop) { } if (te->when <= now) { - int retval; + long long retval; id = te->id; te->refcount++;
diff --git a/src/ae.h b/src/ae.h index 2c1ab2dd5f..985429cd56 100644 --- a/src/ae.h +++ b/src/ae.h @@ -67,7 +67,7 @@ struct aeEventLoop; /* Types and data structures */ typedef void aeFileProc(struct aeEventLoop *eventLoop, int fd, void *clientData, int mask); -typedef int aeTimeProc(struct aeEventLoop *eventLoop, long long id, void *clientData); +typedef long long aeTimeProc(struct aeEventLoop *eventLoop, long long id, void *clientData); typedef void aeEventFinalizerProc(struct aeEventLoop *eventLoop, void *clientData); typedef void aeBeforeSleepProc(struct aeEventLoop *eventLoop); typedef void aeAfterSleepProc(struct aeEventLoop *eventLoop, int numevents);
diff --git a/src/evict.c b/src/evict.c index 4b9f70eaa5..5e4b6220eb 100644 --- a/src/evict.c +++ b/src/evict.c @@ -447,7 +447,7 @@ int overMaxmemoryAfterAlloc(size_t moremem) { * eviction cycles until the "maxmemory" condition has resolved or there are no * more evictable items. */ static int isEvictionProcRunning = 0; -static int evictionTimeProc(struct aeEventLoop *eventLoop, long long id, void *clientData) { +static long long evictionTimeProc(struct aeEventLoop *eventLoop, long long id, void *clientData) { UNUSED(eventLoop); UNUSED(id); UNUSED(clientData);
diff --git a/src/module.c b/src/module.c index 570affec70..15a7fb91f4 100644 --- a/src/module.c +++ b/src/module.c @@ -9113,7 +9113,7 @@ typedef struct ValkeyModuleTimer { /* This is the timer handler that is called by the main event loop. We schedule * this timer to be called when the nearest of our module timers will expire.
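 * (Editor's note, grounded in the commit message above: the handler's return value is the period in milliseconds until the event loop should fire it again, i.e. next_period, which is why it needs the full long long range rather than int.)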
*/ -int moduleTimerHandler(struct aeEventLoop *eventLoop, long long id, void *clientData) { +long long moduleTimerHandler(struct aeEventLoop *eventLoop, long long id, void *clientData) { UNUSED(eventLoop); UNUSED(id); UNUSED(clientData);
diff --git a/src/rdma.c b/src/rdma.c index bca01b9839..66dd0dd6fc 100644 --- a/src/rdma.c +++ b/src/rdma.c @@ -672,7 +672,7 @@ static void connRdmaEventHandler(struct aeEventLoop *el, int fd, void *clientDat } } -static int rdmaKeepaliveTimeProc(struct aeEventLoop *el, long long id, void *clientData) { +static long long rdmaKeepaliveTimeProc(struct aeEventLoop *el, long long id, void *clientData) { struct rdma_cm_id *cm_id = clientData; RdmaContext *ctx = cm_id->context; connection *conn = ctx->conn;
diff --git a/src/server.c b/src/server.c index 9aa1de135f..866023a455 100644 --- a/src/server.c +++ b/src/server.c @@ -1257,7 +1257,7 @@ void cronUpdateMemoryStats(void) { * a macro is used: run_with_period(milliseconds) { .... } */ -int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) { +long long serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) { int j; UNUSED(eventLoop); UNUSED(id);
diff --git a/src/valkey-benchmark.c b/src/valkey-benchmark.c index 0a3139771d..b22ee8cbed 100644 --- a/src/valkey-benchmark.c +++ b/src/valkey-benchmark.c @@ -195,7 +195,7 @@ static redisContext *getRedisContext(const char *ip, int port, const char *hosts static void freeServerConfig(serverConfig *cfg); static int fetchClusterSlotsConfiguration(client c); static void updateClusterSlotsConfiguration(void); -int showThroughput(struct aeEventLoop *eventLoop, long long id, void *clientData); +static long long showThroughput(struct aeEventLoop *eventLoop, long long id, void *clientData); /* Dict callbacks */ static uint64_t dictSdsHash(const void *key); @@ -1604,7 +1604,7 @@ int parseOptions(int argc, char **argv) { exit(exit_status); } -int showThroughput(struct aeEventLoop *eventLoop, long long id, void *clientData) { +long long showThroughput(struct aeEventLoop *eventLoop, long long id, void *clientData) { UNUSED(eventLoop); UNUSED(id); benchmarkThread *thread = (benchmarkThread *)clientData;

From fc5862c8a161254cc0fced26b620c61f6d619385 Mon Sep 17 00:00:00 2001
From: chx9
Date: Sat, 28 Sep 2024 11:48:35 +0800
Subject: [PATCH 02/51] Fix typo in test_helper.tcl (#1080)

Fix typo in test_helper.tcl: even driven => event driven

Signed-off-by: chx9
--- tests/test_helper.tcl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/tests/test_helper.tcl b/tests/test_helper.tcl index 665ba97714..a9640b4f22 100644 --- a/tests/test_helper.tcl +++ b/tests/test_helper.tcl @@ -533,7 +533,7 @@ proc the_end {} { } } -# The client is not even driven (the test server is instead) as we just need +# The client is not event driven (the test server is instead) as we just need # to read the command, execute, reply... all this in a loop. proc test_client_main server_port { set ::test_server_fd [socket localhost $server_port]

From 4cb85fcf1600041e15bc46f047044bbace441f1b Mon Sep 17 00:00:00 2001
From: zhenwei pi
Date: Mon, 30 Sep 2024 17:54:05 +0800
Subject: [PATCH 03/51] RDMA: Support .is_local method (#1089)

There is no Ethernet-style virtual device (like lo 127.0.0.1) for RDMA; however, a connection with the same local address and peer address is considered local.
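The check boils down to "a connection is local when the local and the peer address compare equal". A minimal sketch of that idea over plain sockaddr structures, for illustration only (the actual RDMA implementation in the diff below obtains the addresses from the cm_id; the helper name here is hypothetical):

    #include <string.h>
    #include <sys/socket.h>
    #include <netinet/in.h>

    /* Sketch: "local" means the local and the peer address are the same.
     * Only the addresses are compared, not the ports. */
    static int addrs_equal(const struct sockaddr *l, const struct sockaddr *p) {
        if (l->sa_family != p->sa_family) return 0;
        if (l->sa_family == AF_INET) {
            const struct sockaddr_in *l4 = (const struct sockaddr_in *)l;
            const struct sockaddr_in *p4 = (const struct sockaddr_in *)p;
            return memcmp(&l4->sin_addr, &p4->sin_addr, sizeof(l4->sin_addr)) == 0;
        } else if (l->sa_family == AF_INET6) {
            const struct sockaddr_in6 *l6 = (const struct sockaddr_in6 *)l;
            const struct sockaddr_in6 *p6 = (const struct sockaddr_in6 *)p;
            return memcmp(&l6->sin6_addr, &p6->sin6_addr, sizeof(l6->sin6_addr)) == 0;
        }
        return 0;
    }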
Signed-off-by: zhenwei pi
--- src/rdma.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+)
diff --git a/src/rdma.c b/src/rdma.c index 66dd0dd6fc..6cbc5c70b5 100644 --- a/src/rdma.c +++ b/src/rdma.c @@ -1501,6 +1501,26 @@ static int rdmaServer(char *err, int port, char *bindaddr, int af, rdma_listener return ret; } +static int connRdmaIsLocal(connection *conn) { + rdma_connection *rdma_conn = (rdma_connection *)conn; + struct sockaddr *laddr = rdma_get_local_addr(rdma_conn->cm_id); + struct sockaddr *raddr = rdma_get_peer_addr(rdma_conn->cm_id); + struct sockaddr_in *lsa4, *rsa4; + struct sockaddr_in6 *lsa6, *rsa6; + + if (laddr->sa_family == AF_INET) { + lsa4 = (struct sockaddr_in *)laddr; + rsa4 = (struct sockaddr_in *)raddr; + return !memcmp(&lsa4->sin_addr, &rsa4->sin_addr, sizeof(lsa4->sin_addr)); + } else if (laddr->sa_family == AF_INET6) { + lsa6 = (struct sockaddr_in6 *)laddr; + rsa6 = (struct sockaddr_in6 *)raddr; + return !memcmp(&lsa6->sin6_addr, &rsa6->sin6_addr, sizeof(lsa6->sin6_addr)); + } + + return -1; +} + int connRdmaListen(connListener *listener) { int j, ret; char **bindaddr = listener->bindaddr; @@ -1673,6 +1693,7 @@ static ConnectionType CT_RDMA = { .ae_handler = connRdmaEventHandler, .accept_handler = connRdmaAcceptHandler, //.cluster_accept_handler = NULL, + .is_local = connRdmaIsLocal, .listen = connRdmaListen, .addr = connRdmaAddr,

From e978a01c534926a5adb826baf22e65dde64623fa Mon Sep 17 00:00:00 2001
From: ranshid <88133677+ranshid@users.noreply.github.com>
Date: Mon, 30 Sep 2024 17:41:05 +0300
Subject: [PATCH 04/51] avoid double close on replica main channel (#1097)

fixes #1088

Signed-off-by: Ran Shidlansik
--- src/replication.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/src/replication.c b/src/replication.c index 44b7c7b715..64df85f19a 100644 --- a/src/replication.c +++ b/src/replication.c @@ -2738,8 +2738,10 @@ static void fullSyncWithPrimary(connection *conn) { error: sdsfree(err); - connClose(conn); - server.repl_transfer_s = NULL; + if (server.repl_transfer_s) { + connClose(server.repl_transfer_s); + server.repl_transfer_s = NULL; + } if (server.repl_rdb_transfer_s) { connClose(server.repl_rdb_transfer_s); server.repl_rdb_transfer_s = NULL;

From 35692b419ef20aa7e760d06efe9bbc66e8c3e9c7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Viktor=20S=C3=B6derqvist?=
Date: Mon, 30 Sep 2024 19:55:23 +0200
Subject: [PATCH 05/51] Speed up AOF rewrite test case (#1093)
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit

These two test cases run in a loop:

* AOF rewrite during write load: RDB preamble=yes
* AOF rewrite during write load: RDB preamble=no

Both of the test cases build up a lot of data (3-4 million keys when I run locally), so we should empty the data before the second test case. Otherwise, the second test case adds keys on top of the keys added in the first one, resulting in double the number of keys and a longer run time.
Before this commit:

[ok]: AOF rewrite during write load: RDB preamble=yes (18225 ms)
[ok]: AOF rewrite during write load: RDB preamble=no (37249 ms)

After:

[ok]: AOF rewrite during write load: RDB preamble=yes (18777 ms)
[ok]: AOF rewrite during write load: RDB preamble=no (19940 ms)

Signed-off-by: Viktor Söderqvist
--- tests/unit/aofrw.tcl | 3 +++ 1 file changed, 3 insertions(+)
diff --git a/tests/unit/aofrw.tcl b/tests/unit/aofrw.tcl index b5310edae4..5cca6d90b9 100644 --- a/tests/unit/aofrw.tcl +++ b/tests/unit/aofrw.tcl @@ -9,6 +9,9 @@ start_server {tags {"aofrw external:skip logreqres:skip"} overrides {save {}}} { foreach rdbpre {yes no} { r config set aof-use-rdb-preamble $rdbpre test "AOF rewrite during write load: RDB preamble=$rdbpre" { + # Start with an empty db + r flushall + # Start a write load for 10 seconds set master [srv 0 client] set master_host [srv 0 host]

From 9fa4ef177760837bc6d28f3c29258159ddbc2ff3 Mon Sep 17 00:00:00 2001
From: Masahiro Ide
Date: Tue, 1 Oct 2024 12:59:22 +0900
Subject: [PATCH 06/51] Create empty lua tables with specified initial capacity as much as possible (#1092)

Currently, we create a Lua table without an initial capacity even when the capacity is known. As a result, we need to resize the Lua tables repeatedly when converting a RESP-serialized object to a Lua object, which consumes some extra CPU whenever RESP-serialized data is transferred to the Lua world. This patch removes this extra resizing to reduce (re-)allocation overhead.

| name | unstable bb57dfe6303 (rps) | this patch(rps) | improvements |
| --------------- | -------- | --------- | -------------- |
| evalsha - hgetall h1 | 60565.68 | 64487.01 | 6.47% |
| evalsha - hgetall h10 | 47023.41 | 50602.17 | 7.61% |
| evalsha - hgetall h25 | 33572.82 | 37345.48 | 11.23% |
| evalsha - hgetall h50 | 24206.63 | 25276.14 | 4.42% |
| evalsha - hgetall h100 | 15068.87 | 15656.8 | 3.90% |
| evalsha - hgetall h300 | 5948.56 | 6094.74 | 2.46% |

Signed-off-by: Masahiro Ide
Co-authored-by: Masahiro Ide
--- src/script_lua.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/src/script_lua.c b/src/script_lua.c index bbb5e3fc32..ce13b71277 100644 --- a/src/script_lua.c +++ b/src/script_lua.c @@ -361,7 +361,7 @@ static void redisProtocolToLuaType_Map(struct ReplyParser *parser, void *ctx, si } lua_newtable(lua); lua_pushstring(lua, "map"); - lua_newtable(lua); + lua_createtable(lua, 0, len); } for (size_t j = 0; j < len; j++) { parseReply(parser, lua); @@ -383,7 +383,7 @@ static void redisProtocolToLuaType_Set(struct ReplyParser *parser, void *ctx, si } lua_newtable(lua); lua_pushstring(lua, "set"); - lua_newtable(lua); + lua_createtable(lua, 0, len); } for (size_t j = 0; j < len; j++) { parseReply(parser, lua); @@ -412,7 +412,7 @@ static void redisProtocolToLuaType_Array(struct ReplyParser *parser, void *ctx, * to push elements to the stack. On failure, exit with panic.
*/ serverPanic("lua stack limit reach when parsing server.call reply"); } - lua_newtable(lua); + lua_createtable(lua, len, 0); } for (size_t j = 0; j < len; j++) { if (lua) lua_pushnumber(lua, j + 1); @@ -1534,7 +1534,7 @@ void luaRegisterServerAPI(lua_State *lua) { static void luaCreateArray(lua_State *lua, robj **elev, int elec) { int j; - lua_newtable(lua); + lua_createtable(lua, elec, 0); for (j = 0; j < elec; j++) { lua_pushlstring(lua, (char *)elev[j]->ptr, sdslen(elev[j]->ptr)); lua_rawseti(lua, -2, j + 1);

From 9c0746813fbae6ceec312fba695d35a37b4800ca Mon Sep 17 00:00:00 2001
From: Wen Hui
Date: Tue, 1 Oct 2024 04:30:35 -0400
Subject: [PATCH 07/51] Update keyspace notifications link to valkey.io in code comment (#1100)

As the title describes. ![image](https://github.com/user-attachments/assets/655324e6-b042-4c2f-b558-b912a7d2c10c)

Signed-off-by: hwware
--- src/notify.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/src/notify.c b/src/notify.c index 5305f24664..c655457e8b 100644 --- a/src/notify.c +++ b/src/notify.c @@ -30,7 +30,7 @@ #include "server.h" /* This file implements keyspace events notification via Pub/Sub and - * described at https://redis.io/topics/notifications. */ + * described at https://valkey.io/topics/notifications */ /* Turn a string representing notification classes into an integer * representing notification classes flags xored.

From 673d4c27aa2a01ba2b2cdcb210bda97c92ab0302 Mon Sep 17 00:00:00 2001
From: Binbin
Date: Wed, 2 Oct 2024 04:14:30 +0800
Subject: [PATCH 08/51] Avoid timing issue in diskless-load-swapdb test (#1077)

Since we paused the primary node earlier, the replica may enter cluster down due to primary node pfail. Here set allow read to prevent subsequent read errors.

Signed-off-by: Binbin
--- tests/unit/cluster/diskless-load-swapdb.tcl | 5 +++++ 1 file changed, 5 insertions(+)
diff --git a/tests/unit/cluster/diskless-load-swapdb.tcl b/tests/unit/cluster/diskless-load-swapdb.tcl index 68c2135493..e237fd81de 100644 --- a/tests/unit/cluster/diskless-load-swapdb.tcl +++ b/tests/unit/cluster/diskless-load-swapdb.tcl @@ -78,6 +78,11 @@ test "Main db not affected when fail to diskless load" { fail "Fail to stop the full sync" } + # Since we paused the primary node earlier, the replica may enter + # cluster down due to primary node pfail. Here set allow read to + # prevent subsequent read errors. + $replica config set cluster-allow-reads-when-down yes + # Replica keys and keys to slots map still both are right assert_equal {1} [$replica get $slot0_key] assert_equal $slot0_key [$replica CLUSTER GETKEYSINSLOT 0 1]

From a106e8a743b2c9c0dbaae42c0acf96839806fa33 Mon Sep 17 00:00:00 2001
From: Guillaume Koenig <106696198+knggk@users.noreply.github.com>
Date: Wed, 2 Oct 2024 13:28:55 -0400
Subject: [PATCH 09/51] Rax size tracking (#688)
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit

Introduce a `size_t` field into the rax struct to track allocation size. Update the allocation size on rax inserts and deletes. Return the allocation size when `raxAllocSize` is called.

This size tracking is now used in MEMORY USAGE and MEMORY STATS in place of the previous method based on sampling.

The module API allows creating sorted dictionaries, which are backed by rax. Users now also get the precise allocation size for them (through `ValkeyModule_MallocSizeDict`).

Fixes #677.

For the release notes:

* MEMORY USAGE and MEMORY STATS are now exact for streams, rather than based on sampling.
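The pattern itself is simple enough to show in isolation. A minimal sketch of the same O(1) size tracking, assuming an allocator that can report a pointer's usable size (here glibc's malloc_usable_size(); in this codebase zmalloc_size(), aliased as rax_ptr_alloc_size, plays that role, and the `tracked` container is hypothetical):

    #include <stdlib.h>
    #include <malloc.h> /* malloc_usable_size(), glibc-specific */

    /* Hypothetical container keeping a running total of its heap footprint,
     * mirroring the new rax->alloc_size field. */
    typedef struct tracked {
        size_t alloc_size; /* total bytes currently allocated */
    } tracked;

    static void *tracked_realloc(tracked *t, void *p, size_t newsize) {
        size_t oldsize = p ? malloc_usable_size(p) : 0;
        void *np = realloc(p, newsize);
        if (np == NULL) return NULL;                /* counter unchanged on failure */
        t->alloc_size += malloc_usable_size(np) - oldsize; /* retire old, account new */
        return np;
    }

    static void tracked_free(tracked *t, void *p) {
        if (p) t->alloc_size -= malloc_usable_size(p);
        free(p);
    }

With every allocation routed through such helpers, the total never has to be recomputed by walking the structure, which is what makes the exact MEMORY USAGE reporting cheap.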
--------- Signed-off-by: Guillaume Koenig Signed-off-by: Guillaume Koenig <106696198+knggk@users.noreply.github.com> Co-authored-by: Joey Co-authored-by: Viktor Söderqvist --- .config/typos.toml | 1 + src/module.c | 6 +- src/object.c | 29 +- src/rax.c | 28 +- src/rax.h | 8 +- src/rax_malloc.h | 1 + src/unit/test_files.h | 14 + src/unit/test_rax.c | 1025 +++++++++++++++++++++++++++++++++++++++++ 8 files changed, 1078 insertions(+), 34 deletions(-) create mode 100644 src/unit/test_rax.c diff --git a/.config/typos.toml b/.config/typos.toml index d378b5655a..1dc44ea0e9 100644 --- a/.config/typos.toml +++ b/.config/typos.toml @@ -20,6 +20,7 @@ extend-ignore-re = [ "D4C4DAA4", # sha1.c "Georg Nees", "\\[l\\]ist", # eval.c + "LKE", # test_rax.c ] [type.tcl] diff --git a/src/module.c b/src/module.c index 15a7fb91f4..38d0c2d968 100644 --- a/src/module.c +++ b/src/module.c @@ -10840,10 +10840,8 @@ size_t VM_MallocSizeString(ValkeyModuleString *str) { * it does not include the allocation size of the keys and values. */ size_t VM_MallocSizeDict(ValkeyModuleDict *dict) { - size_t size = sizeof(ValkeyModuleDict) + sizeof(rax); - size += dict->rax->numnodes * sizeof(raxNode); - /* For more info about this weird line, see streamRadixTreeMemoryUsage */ - size += dict->rax->numnodes * sizeof(long) * 30; + size_t size = sizeof(ValkeyModuleDict); + size += raxAllocSize(dict->rax); return size; } diff --git a/src/object.c b/src/object.c index d409fa8d5c..2508f20ab6 100644 --- a/src/object.c +++ b/src/object.c @@ -952,29 +952,6 @@ char *strEncoding(int encoding) { /* =========================== Memory introspection ========================= */ -/* This is a helper function with the goal of estimating the memory - * size of a radix tree that is used to store Stream IDs. - * - * Note: to guess the size of the radix tree is not trivial, so we - * approximate it considering 16 bytes of data overhead for each - * key (the ID), and then adding the number of bare nodes, plus some - * overhead due by the data and child pointers. This secret recipe - * was obtained by checking the average radix tree created by real - * workloads, and then adjusting the constants to get numbers that - * more or less match the real memory usage. - * - * Actually the number of nodes and keys may be different depending - * on the insertion speed and thus the ability of the radix tree - * to compress prefixes. */ -size_t streamRadixTreeMemoryUsage(rax *rax) { - size_t size = sizeof(*rax); - size = rax->numele * sizeof(streamID); - size += rax->numnodes * sizeof(raxNode); - /* Add a fixed overhead due to the aux data pointer, children, ... */ - size += rax->numnodes * sizeof(long) * 30; - return size; -} - /* Returns the size in bytes consumed by the key's value in RAM. * Note that the returned value is just an approximation, especially in the * case of aggregated data types where only "sample_size" elements @@ -1072,7 +1049,7 @@ size_t objectComputeSize(robj *key, robj *o, size_t sample_size, int dbid) { } else if (o->type == OBJ_STREAM) { stream *s = o->ptr; asize = sizeof(*o) + sizeof(*s); - asize += streamRadixTreeMemoryUsage(s->rax); + asize += raxAllocSize(s->rax); /* Now we have to add the listpacks. 
The last listpack is often non * complete, so we estimate the size of the first N listpacks, and @@ -1112,7 +1089,7 @@ size_t objectComputeSize(robj *key, robj *o, size_t sample_size, int dbid) { while (raxNext(&ri)) { streamCG *cg = ri.data; asize += sizeof(*cg); - asize += streamRadixTreeMemoryUsage(cg->pel); + asize += raxAllocSize(cg->pel); asize += sizeof(streamNACK) * raxSize(cg->pel); /* For each consumer we also need to add the basic data @@ -1124,7 +1101,7 @@ size_t objectComputeSize(robj *key, robj *o, size_t sample_size, int dbid) { streamConsumer *consumer = cri.data; asize += sizeof(*consumer); asize += sdslen(consumer->name); - asize += streamRadixTreeMemoryUsage(consumer->pel); + asize += raxAllocSize(consumer->pel); /* Don't count NACKs again, they are shared with the * consumer group PEL. */ } diff --git a/src/rax.c b/src/rax.c index 319d89a2dc..ed17f3735d 100644 --- a/src/rax.c +++ b/src/rax.c @@ -192,6 +192,7 @@ rax *raxNew(void) { rax->numele = 0; rax->numnodes = 1; rax->head = raxNewNode(0, 0); + rax->alloc_size = rax_ptr_alloc_size(rax) + rax_ptr_alloc_size(rax->head); if (rax->head == NULL) { rax_free(rax); return NULL; @@ -510,8 +511,12 @@ int raxGenericInsert(rax *rax, unsigned char *s, size_t len, void *data, void ** debugf("### Insert: node representing key exists\n"); /* Make space for the value pointer if needed. */ if (!h->iskey || (h->isnull && overwrite)) { + size_t oldalloc = rax_ptr_alloc_size(h); h = raxReallocForData(h, data); - if (h) memcpy(parentlink, &h, sizeof(h)); + if (h) { + memcpy(parentlink, &h, sizeof(h)); + rax->alloc_size = rax->alloc_size - oldalloc + rax_ptr_alloc_size(h); + } } if (h == NULL) { errno = ENOMEM; @@ -706,6 +711,7 @@ int raxGenericInsert(rax *rax, unsigned char *s, size_t len, void *data, void ** return 0; } splitnode->data[0] = h->data[j]; + rax->alloc_size += rax_ptr_alloc_size(splitnode); if (j == 0) { /* 3a: Replace the old node with the split node. */ @@ -730,6 +736,7 @@ int raxGenericInsert(rax *rax, unsigned char *s, size_t len, void *data, void ** memcpy(parentlink, &trimmed, sizeof(trimmed)); parentlink = cp; /* Set parentlink to splitnode parent. */ rax->numnodes++; + rax->alloc_size += rax_ptr_alloc_size(trimmed); } /* 4: Create the postfix node: what remains of the original @@ -744,6 +751,7 @@ int raxGenericInsert(rax *rax, unsigned char *s, size_t len, void *data, void ** raxNode **cp = raxNodeLastChildPtr(postfix); memcpy(cp, &next, sizeof(next)); rax->numnodes++; + rax->alloc_size += rax_ptr_alloc_size(postfix); } else { /* 4b: just use next as postfix node. */ postfix = next; @@ -756,6 +764,7 @@ int raxGenericInsert(rax *rax, unsigned char *s, size_t len, void *data, void ** /* 6. Continue insertion: this will cause the splitnode to * get a new child (the non common character at the currently * inserted key). */ + rax->alloc_size -= rax_ptr_alloc_size(h); rax_free(h); h = splitnode; } else if (h->iscompr && i == len) { @@ -794,6 +803,7 @@ int raxGenericInsert(rax *rax, unsigned char *s, size_t len, void *data, void ** raxNode **cp = raxNodeLastChildPtr(postfix); memcpy(cp, &next, sizeof(next)); rax->numnodes++; + rax->alloc_size += rax_ptr_alloc_size(postfix); /* 3: Trim the compressed node. */ trimmed->size = j; @@ -806,6 +816,7 @@ int raxGenericInsert(rax *rax, unsigned char *s, size_t len, void *data, void ** void *aux = raxGetData(h); raxSetData(trimmed, aux); } + rax->alloc_size += rax_ptr_alloc_size(trimmed); /* Fix the trimmed node child pointer to point to * the postfix node. 
*/ @@ -815,6 +826,7 @@ int raxGenericInsert(rax *rax, unsigned char *s, size_t len, void *data, void ** /* Finish! We don't need to continue with the insertion * algorithm for ALGO 2. The key is already inserted. */ rax->numele++; + rax->alloc_size -= rax_ptr_alloc_size(h); rax_free(h); return 1; /* Key inserted. */ } @@ -823,6 +835,7 @@ int raxGenericInsert(rax *rax, unsigned char *s, size_t len, void *data, void ** * chars in our string. We need to insert the missing nodes. */ while (i < len) { raxNode *child; + size_t oldalloc = rax_ptr_alloc_size(h); /* If this node is going to have a single child, and there * are other characters, so that that would result in a chain @@ -848,14 +861,17 @@ int raxGenericInsert(rax *rax, unsigned char *s, size_t len, void *data, void ** i++; } rax->numnodes++; + rax->alloc_size = rax->alloc_size - oldalloc + rax_ptr_alloc_size(h) + rax_ptr_alloc_size(child); h = child; } + size_t oldalloc = rax_ptr_alloc_size(h); raxNode *newh = raxReallocForData(h, data); if (newh == NULL) goto oom; h = newh; if (!h->iskey) rax->numele++; raxSetData(h, data); memcpy(parentlink, &h, sizeof(h)); + rax->alloc_size = rax->alloc_size - oldalloc + rax_ptr_alloc_size(h); return 1; /* Element inserted. */ oom: @@ -1025,6 +1041,7 @@ int raxRemove(rax *rax, unsigned char *s, size_t len, void **old) { child = h; debugf("Freeing child %p [%.*s] key:%d\n", (void *)child, (int)child->size, (char *)child->data, child->iskey); + rax->alloc_size -= rax_ptr_alloc_size(child); rax_free(child); rax->numnodes--; h = raxStackPop(&ts); @@ -1034,7 +1051,9 @@ int raxRemove(rax *rax, unsigned char *s, size_t len, void **old) { } if (child) { debugf("Unlinking child %p from parent %p\n", (void *)child, (void *)h); + size_t oldalloc = rax_ptr_alloc_size(h); raxNode *new = raxRemoveChild(h, child); + rax->alloc_size = rax->alloc_size - oldalloc + rax_ptr_alloc_size(new); if (new != h) { raxNode *parent = raxStackPeek(&ts); raxNode **parentlink; @@ -1151,6 +1170,7 @@ int raxRemove(rax *rax, unsigned char *s, size_t len, void **old) { new->iscompr = 1; new->size = comprsize; rax->numnodes++; + rax->alloc_size += rax_ptr_alloc_size(new); /* Scan again, this time to populate the new node content and * to fix the new node child pointer. At the same time we free @@ -1163,6 +1183,7 @@ int raxRemove(rax *rax, unsigned char *s, size_t len, void **old) { raxNode **cp = raxNodeLastChildPtr(h); raxNode *tofree = h; memcpy(&h, cp, sizeof(h)); + rax->alloc_size -= rax_ptr_alloc_size(tofree); rax_free(tofree); rax->numnodes--; if (h->iskey || (!h->iscompr && h->size != 1)) break; @@ -1764,6 +1785,11 @@ uint64_t raxSize(rax *rax) { return rax->numele; } +/* Return the rax tree allocation size in bytes */ +size_t raxAllocSize(rax *rax) { + return rax->alloc_size; +} + /* ----------------------------- Introspection ------------------------------ */ /* This function is mostly used for debugging and learning purposes. 
diff --git a/src/rax.h b/src/rax.h index 5347dc480e..2d0c940698 100644 --- a/src/rax.h +++ b/src/rax.h @@ -131,9 +131,10 @@ typedef struct raxNode { } raxNode; typedef struct rax { - raxNode *head; - uint64_t numele; - uint64_t numnodes; + raxNode *head; /* Pointer to root node of tree */ + uint64_t numele; /* Number of keys in the tree */ + uint64_t numnodes; /* Number of rax nodes in the tree */ + size_t alloc_size; /* Total allocation size of the tree in bytes */ } rax; /* Stack data structure used by raxLowWalk() in order to, optionally, return @@ -203,6 +204,7 @@ void raxStop(raxIterator *it); int raxEOF(raxIterator *it); void raxShow(rax *rax); uint64_t raxSize(rax *rax); +size_t raxAllocSize(rax *rax); unsigned long raxTouch(raxNode *n); void raxSetDebugMsg(int onoff); diff --git a/src/rax_malloc.h b/src/rax_malloc.h index 03c952e1a4..49a626595a 100644 --- a/src/rax_malloc.h +++ b/src/rax_malloc.h @@ -41,4 +41,5 @@ #define rax_malloc zmalloc #define rax_realloc zrealloc #define rax_free zfree +#define rax_ptr_alloc_size zmalloc_size #endif diff --git a/src/unit/test_files.h b/src/unit/test_files.h index 71952e343f..cd2e0c5b92 100644 --- a/src/unit/test_files.h +++ b/src/unit/test_files.h @@ -84,6 +84,18 @@ int test_listpackBenchmarkLpValidateIntegrity(int argc, char **argv, int flags); int test_listpackBenchmarkLpCompareWithString(int argc, char **argv, int flags); int test_listpackBenchmarkLpCompareWithNumber(int argc, char **argv, int flags); int test_listpackBenchmarkFree(int argc, char **argv, int flags); +int test_raxRandomWalk(int argc, char **argv, int flags); +int test_raxIteratorUnitTests(int argc, char **argv, int flags); +int test_raxTryInsertUnitTests(int argc, char **argv, int flags); +int test_raxRegressionTest1(int argc, char **argv, int flags); +int test_raxRegressionTest2(int argc, char **argv, int flags); +int test_raxRegressionTest3(int argc, char **argv, int flags); +int test_raxRegressionTest4(int argc, char **argv, int flags); +int test_raxRegressionTest5(int argc, char **argv, int flags); +int test_raxRegressionTest6(int argc, char **argv, int flags); +int test_raxBenchmark(int argc, char **argv, int flags); +int test_raxHugeKey(int argc, char **argv, int flags); +int test_raxFuzz(int argc, char **argv, int flags); int test_sds(int argc, char **argv, int flags); int test_typesAndAllocSize(int argc, char **argv, int flags); int test_sdsHeaderSizes(int argc, char **argv, int flags); @@ -144,6 +156,7 @@ unitTest __test_endianconv_c[] = {{"test_endianconv", test_endianconv}, {NULL, N unitTest __test_intset_c[] = {{"test_intsetValueEncodings", test_intsetValueEncodings}, {"test_intsetBasicAdding", test_intsetBasicAdding}, {"test_intsetLargeNumberRandomAdd", test_intsetLargeNumberRandomAdd}, {"test_intsetUpgradeFromint16Toint32", test_intsetUpgradeFromint16Toint32}, {"test_intsetUpgradeFromint16Toint64", test_intsetUpgradeFromint16Toint64}, {"test_intsetUpgradeFromint32Toint64", test_intsetUpgradeFromint32Toint64}, {"test_intsetStressLookups", test_intsetStressLookups}, {"test_intsetStressAddDelete", test_intsetStressAddDelete}, {NULL, NULL}}; unitTest __test_kvstore_c[] = {{"test_kvstoreAdd16Keys", test_kvstoreAdd16Keys}, {"test_kvstoreIteratorRemoveAllKeysNoDeleteEmptyDict", test_kvstoreIteratorRemoveAllKeysNoDeleteEmptyDict}, {"test_kvstoreIteratorRemoveAllKeysDeleteEmptyDict", test_kvstoreIteratorRemoveAllKeysDeleteEmptyDict}, {"test_kvstoreDictIteratorRemoveAllKeysNoDeleteEmptyDict", test_kvstoreDictIteratorRemoveAllKeysNoDeleteEmptyDict}, 
{"test_kvstoreDictIteratorRemoveAllKeysDeleteEmptyDict", test_kvstoreDictIteratorRemoveAllKeysDeleteEmptyDict}, {NULL, NULL}}; unitTest __test_listpack_c[] = {{"test_listpackCreateIntList", test_listpackCreateIntList}, {"test_listpackCreateList", test_listpackCreateList}, {"test_listpackLpPrepend", test_listpackLpPrepend}, {"test_listpackLpPrependInteger", test_listpackLpPrependInteger}, {"test_listpackGetELementAtIndex", test_listpackGetELementAtIndex}, {"test_listpackPop", test_listpackPop}, {"test_listpackGetELementAtIndex2", test_listpackGetELementAtIndex2}, {"test_listpackIterate0toEnd", test_listpackIterate0toEnd}, {"test_listpackIterate1toEnd", test_listpackIterate1toEnd}, {"test_listpackIterate2toEnd", test_listpackIterate2toEnd}, {"test_listpackIterateBackToFront", test_listpackIterateBackToFront}, {"test_listpackIterateBackToFrontWithDelete", test_listpackIterateBackToFrontWithDelete}, {"test_listpackDeleteWhenNumIsMinusOne", test_listpackDeleteWhenNumIsMinusOne}, {"test_listpackDeleteWithNegativeIndex", test_listpackDeleteWithNegativeIndex}, {"test_listpackDeleteInclusiveRange0_0", test_listpackDeleteInclusiveRange0_0}, {"test_listpackDeleteInclusiveRange0_1", test_listpackDeleteInclusiveRange0_1}, {"test_listpackDeleteInclusiveRange1_2", test_listpackDeleteInclusiveRange1_2}, {"test_listpackDeleteWitStartIndexOutOfRange", test_listpackDeleteWitStartIndexOutOfRange}, {"test_listpackDeleteWitNumOverflow", test_listpackDeleteWitNumOverflow}, {"test_listpackBatchDelete", test_listpackBatchDelete}, {"test_listpackDeleteFooWhileIterating", test_listpackDeleteFooWhileIterating}, {"test_listpackReplaceWithSameSize", test_listpackReplaceWithSameSize}, {"test_listpackReplaceWithDifferentSize", test_listpackReplaceWithDifferentSize}, {"test_listpackRegressionGt255Bytes", test_listpackRegressionGt255Bytes}, {"test_listpackCreateLongListAndCheckIndices", test_listpackCreateLongListAndCheckIndices}, {"test_listpackCompareStrsWithLpEntries", test_listpackCompareStrsWithLpEntries}, {"test_listpackLpMergeEmptyLps", test_listpackLpMergeEmptyLps}, {"test_listpackLpMergeLp1Larger", test_listpackLpMergeLp1Larger}, {"test_listpackLpMergeLp2Larger", test_listpackLpMergeLp2Larger}, {"test_listpackLpNextRandom", test_listpackLpNextRandom}, {"test_listpackLpNextRandomCC", test_listpackLpNextRandomCC}, {"test_listpackRandomPairWithOneElement", test_listpackRandomPairWithOneElement}, {"test_listpackRandomPairWithManyElements", test_listpackRandomPairWithManyElements}, {"test_listpackRandomPairsWithOneElement", test_listpackRandomPairsWithOneElement}, {"test_listpackRandomPairsWithManyElements", test_listpackRandomPairsWithManyElements}, {"test_listpackRandomPairsUniqueWithOneElement", test_listpackRandomPairsUniqueWithOneElement}, {"test_listpackRandomPairsUniqueWithManyElements", test_listpackRandomPairsUniqueWithManyElements}, {"test_listpackPushVariousEncodings", test_listpackPushVariousEncodings}, {"test_listpackLpFind", test_listpackLpFind}, {"test_listpackLpValidateIntegrity", test_listpackLpValidateIntegrity}, {"test_listpackNumberOfElementsExceedsLP_HDR_NUMELE_UNKNOWN", test_listpackNumberOfElementsExceedsLP_HDR_NUMELE_UNKNOWN}, {"test_listpackStressWithRandom", test_listpackStressWithRandom}, {"test_listpackSTressWithVariableSize", test_listpackSTressWithVariableSize}, {"test_listpackBenchmarkInit", test_listpackBenchmarkInit}, {"test_listpackBenchmarkLpAppend", test_listpackBenchmarkLpAppend}, {"test_listpackBenchmarkLpFindString", test_listpackBenchmarkLpFindString}, 
{"test_listpackBenchmarkLpFindNumber", test_listpackBenchmarkLpFindNumber}, {"test_listpackBenchmarkLpSeek", test_listpackBenchmarkLpSeek}, {"test_listpackBenchmarkLpValidateIntegrity", test_listpackBenchmarkLpValidateIntegrity}, {"test_listpackBenchmarkLpCompareWithString", test_listpackBenchmarkLpCompareWithString}, {"test_listpackBenchmarkLpCompareWithNumber", test_listpackBenchmarkLpCompareWithNumber}, {"test_listpackBenchmarkFree", test_listpackBenchmarkFree}, {NULL, NULL}}; +unitTest __test_rax_c[] = {{"test_raxRandomWalk", test_raxRandomWalk}, {"test_raxIteratorUnitTests", test_raxIteratorUnitTests}, {"test_raxTryInsertUnitTests", test_raxTryInsertUnitTests}, {"test_raxRegressionTest1", test_raxRegressionTest1}, {"test_raxRegressionTest2", test_raxRegressionTest2}, {"test_raxRegressionTest3", test_raxRegressionTest3}, {"test_raxRegressionTest4", test_raxRegressionTest4}, {"test_raxRegressionTest5", test_raxRegressionTest5}, {"test_raxRegressionTest6", test_raxRegressionTest6}, {"test_raxBenchmark", test_raxBenchmark}, {"test_raxHugeKey", test_raxHugeKey}, {"test_raxFuzz", test_raxFuzz}, {NULL, NULL}}; unitTest __test_sds_c[] = {{"test_sds", test_sds}, {"test_typesAndAllocSize", test_typesAndAllocSize}, {"test_sdsHeaderSizes", test_sdsHeaderSizes}, {NULL, NULL}}; unitTest __test_sha1_c[] = {{"test_sha1", test_sha1}, {NULL, NULL}}; unitTest __test_util_c[] = {{"test_string2ll", test_string2ll}, {"test_string2l", test_string2l}, {"test_ll2string", test_ll2string}, {"test_ld2string", test_ld2string}, {"test_fixedpoint_d2string", test_fixedpoint_d2string}, {"test_version2num", test_version2num}, {"test_reclaimFilePageCache", test_reclaimFilePageCache}, {NULL, NULL}}; @@ -162,6 +175,7 @@ struct unitTestSuite { {"test_intset.c", __test_intset_c}, {"test_kvstore.c", __test_kvstore_c}, {"test_listpack.c", __test_listpack_c}, + {"test_rax.c", __test_rax_c}, {"test_sds.c", __test_sds_c}, {"test_sha1.c", __test_sha1_c}, {"test_util.c", __test_util_c}, diff --git a/src/unit/test_rax.c b/src/unit/test_rax.c new file mode 100644 index 0000000000..5f346b4115 --- /dev/null +++ b/src/unit/test_rax.c @@ -0,0 +1,1025 @@ +/* Rax -- A radix tree implementation. + * + * Copyright (c) 2017-2018, Salvatore Sanfilippo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Redis nor the names of its contributors may be used + * to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include +#include +#include +#include +#include + +#include "../rax.c" +#include "../mt19937-64.c" +#include "test_help.h" + +uint16_t crc16(const char *buf, int len); /* From crc16.c */ +long long _ustime(void); /* From test_crc64combine.c */ + +/* --------------------------------------------------------------------------- + * Simple hash table implementation, no rehashing, just chaining. This is + * used in order to test the radix tree implementation against something that + * will always "tell the truth" :-) */ + +/* This is huge but we want it fast enough without rehashing needed. */ +#define HT_TABLE_SIZE 100000 +typedef struct htNode { + uint64_t keylen; + unsigned char *key; + void *data; + struct htNode *next; +} htNode; + +typedef struct ht { + uint64_t numele; + htNode *table[HT_TABLE_SIZE]; +} hashtable; + +/* Create a new hash table. */ +hashtable *htNew(void) { + hashtable *ht = zcalloc(sizeof(*ht)); + ht->numele = 0; + return ht; +} + +/* djb2 hash function. */ +uint32_t htHash(unsigned char *s, size_t len) { + uint32_t hash = 5381; + for (size_t i = 0; i < len; i++) hash = hash * 33 + s[i]; + return hash % HT_TABLE_SIZE; +} + +/* Low level hash table lookup function. */ +htNode *htRawLookup(hashtable *t, unsigned char *s, size_t len, uint32_t *hash, htNode ***parentlink) { + uint32_t h = htHash(s, len); + if (hash) *hash = h; + htNode *n = t->table[h]; + if (parentlink) *parentlink = &t->table[h]; + while (n) { + if (n->keylen == len && memcmp(n->key, s, len) == 0) return n; + if (parentlink) *parentlink = &n->next; + n = n->next; + } + return NULL; +} + +/* Add an element to the hash table, return 1 if the element is new, + * 0 if it existed and the value was updated to the new one. */ +int htAdd(hashtable *t, unsigned char *s, size_t len, void *data) { + uint32_t hash; + htNode *n = htRawLookup(t, s, len, &hash, NULL); + + if (!n) { + n = zmalloc(sizeof(*n)); + n->key = zmalloc(len); + memcpy(n->key, s, len); + n->keylen = len; + n->data = data; + n->next = t->table[hash]; + t->table[hash] = n; + t->numele++; + return 1; + } else { + n->data = data; + return 0; + } +} + +/* Remove the specified element, returns 1 on success, 0 if the element + * was not there already. */ +int htRem(hashtable *t, unsigned char *s, size_t len) { + htNode **parentlink; + htNode *n = htRawLookup(t, s, len, NULL, &parentlink); + + if (!n) return 0; + *parentlink = n->next; + zfree(n->key); + zfree(n); + t->numele--; + return 1; +} + +void *htNotFound = (void *)"ht-not-found"; + +/* Find an element inside the hash table. Returns htNotFound if the + * element is not there, otherwise returns the associated value. */ +void *htFind(hashtable *t, unsigned char *s, size_t len) { + htNode *n = htRawLookup(t, s, len, NULL, NULL); + if (!n) return htNotFound; + return n->data; +} + +/* Free the whole hash table including all the linked nodes.
*/ +void htFree(hashtable *ht) { + for (int j = 0; j < HT_TABLE_SIZE; j++) { + htNode *next = ht->table[j]; + while (next) { + htNode *this = next; + next = this->next; + zfree(this->key); + zfree(this); + } + } + zfree(ht); +} + +/* -------------------------------------------------------------------------- + * Utility functions to generate keys, check time usage and so forth. + * -------------------------------------------------------------------------*/ + +/* This is a simple Feistel network in order to turn every possible + * uint32_t input into another "randomly" looking uint32_t. It is a + * one to one map so there are no repetitions. */ +static uint32_t int2int(uint32_t input) { + uint16_t l = input & 0xffff; + uint16_t r = input >> 16; + for (int i = 0; i < 8; i++) { + uint16_t nl = r; + uint16_t F = (((r * 31) + (r >> 5) + 7 * 371) ^ r) & 0xffff; + r = l ^ F; + l = nl; + } + return (r << 16) | l; +} + +/* Turn a uint32_t integer into an alphanumerical key and return its + * length. This function is used in order to generate keys that have + * a large charset, so that the radix tree can be tested with many + * children per node. */ +static size_t int2alphakey(char *s, size_t maxlen, uint32_t i) { + const char *set = "ABCDEFGHIJKLMNOPQRSTUVWXYZ" + "abcdefghijklmnopqrstuvwxyz" + "0123456789"; + const size_t setlen = 62; + + if (maxlen == 0) return 0; + maxlen--; /* Space for null term char. */ + size_t len = 0; + while (len < maxlen) { + s[len++] = set[i % setlen]; + i /= setlen; + if (i == 0) break; + } + s[len] = '\0'; + return len; +} + + +/* Turn the integer 'i' into a key according to 'mode'. + * KEY_INT: Just represents the integer as a string. + * KEY_UNIQUE_ALPHA: Turn it into a random-looking alphanumerical string + * according to the int2alphakey() function, so that + * a different string is mapped to every integer. + * KEY_RANDOM: Totally random string up to maxlen bytes. + * KEY_RANDOM_ALPHA: Alphanumerical random string up to maxlen bytes. + * KEY_RANDOM_SMALL_CSET: Small charset random strings. + * KEY_CHAIN: 'i' times the character "A". */ +#define KEY_INT 0 +#define KEY_UNIQUE_ALPHA 1 +#define KEY_RANDOM 2 +#define KEY_RANDOM_ALPHA 3 +#define KEY_RANDOM_SMALL_CSET 4 +#define KEY_CHAIN 5 +static size_t int2key(char *s, size_t maxlen, uint32_t i, int mode) { + if (mode == KEY_INT) { + return snprintf(s, maxlen, "%lu", (unsigned long)i); + } else if (mode == KEY_UNIQUE_ALPHA) { + if (maxlen > 16) maxlen = 16; + i = int2int(i); + return int2alphakey(s, maxlen, i); + } else if (mode == KEY_RANDOM) { + if (maxlen > 16) maxlen = 16; + int r = genrand64_int64() % maxlen; + for (int i = 0; i < r; i++) s[i] = genrand64_int64() & 0xff; + return r; + } else if (mode == KEY_RANDOM_ALPHA) { + if (maxlen > 16) maxlen = 16; + int r = genrand64_int64() % maxlen; + for (int i = 0; i < r; i++) s[i] = 'A' + genrand64_int64() % ('z' - 'A' + 1); + return r; + } else if (mode == KEY_RANDOM_SMALL_CSET) { + if (maxlen > 16) maxlen = 16; + int r = genrand64_int64() % maxlen; + for (int i = 0; i < r; i++) s[i] = 'A' + genrand64_int64() % 4; + return r; + } else if (mode == KEY_CHAIN) { + if (i > maxlen) i = maxlen; + memset(s, 'A', i); + return i; + } else { + return 0; + } +} + +/* -------------------------------------------------------------------------- */ + +/* Perform a fuzz test, returns 0 on success, 1 on error.
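 + * The same random sequence of insertions and deletions is applied to both + * the rax and the hashtable above, which serves as a trusted oracle; sizes, + * values and iteration order are then cross-checked.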
*/ +int fuzzTest(int keymode, size_t count, double addprob, double remprob) { + hashtable *ht = htNew(); + rax *rax = raxNew(); + + printf("Fuzz test in mode %d [%zu]: ", keymode, count); + fflush(stdout); + + /* Perform random operations on both the dictionaries. */ + for (size_t i = 0; i < count; i++) { + unsigned char key[1024]; + uint32_t keylen; + + /* Insert element. */ + if ((double)genrand64_int64() / RAND_MAX < addprob) { + keylen = int2key((char *)key, sizeof(key), i, keymode); + void *val = (void *)(unsigned long)genrand64_int64(); + /* Stress NULL values more often, they use a special encoding. */ + if (!(genrand64_int64() % 100)) val = NULL; + int retval1 = htAdd(ht, key, keylen, val); + int retval2 = raxInsert(rax, key, keylen, val, NULL); + if (retval1 != retval2) { + printf("Fuzz: key insertion reported mismatching value in HT/RAX\n"); + return 1; + } + } + + /* Remove element. */ + if ((double)genrand64_int64() / RAND_MAX < remprob) { + keylen = int2key((char *)key, sizeof(key), i, keymode); + int retval1 = htRem(ht, key, keylen); + int retval2 = raxRemove(rax, key, keylen, NULL); + if (retval1 != retval2) { + printf("Fuzz: key deletion of '%.*s' reported mismatching " + "value in HT=%d RAX=%d\n", + (int)keylen, (char *)key, retval1, retval2); + return 1; + } + } + } + + /* Check that count matches. */ + if (ht->numele != raxSize(rax)) { + printf("Fuzz: HT / RAX keys count mismatch: %lu vs %lu\n", (unsigned long)ht->numele, + (unsigned long)raxSize(rax)); + return 1; + } + printf("%lu elements inserted\n", (unsigned long)ht->numele); + + /* Check that elements match. */ + raxIterator iter; + raxStart(&iter, rax); + raxSeek(&iter, "^", NULL, 0); + + size_t numkeys = 0; + while (raxNext(&iter)) { + void *val1 = htFind(ht, iter.key, iter.key_len); + void *val2 = NULL; + raxFind(rax, iter.key, iter.key_len, &val2); + if (val1 != val2) { + printf("Fuzz: HT=%p, RAX=%p value do not match " + "for key %.*s\n", + val1, val2, (int)iter.key_len, (char *)iter.key); + return 1; + } + numkeys++; + } + + /* Check that the iterator reported all the elements. */ + if (ht->numele != numkeys) { + printf("Fuzz: the iterator reported %lu keys instead of %lu\n", (unsigned long)numkeys, + (unsigned long)ht->numele); + return 1; + } + + raxStop(&iter); + raxFree(rax); + htFree(ht); + return 0; +} + +/* Redis Cluster-like fuzz testing. + * + * This test simulates the radix tree usage made by Redis Cluster in order + * to maintain the hash slot -> keys mapping. The keys are alphanumerical, + * but the first two bytes are binary (they are the hashed key). + * + * In this test there is no comparison with the hash table; the only goal + * is to crash the radix tree implementation, or to trigger Valgrind + * warnings. */ +int fuzzTestCluster(size_t count, double addprob, double remprob) { + unsigned char key[128]; + int keylen = 0; + + printf("Cluster Fuzz test [keys:%zu keylen:%d]: ", count, keylen); + fflush(stdout); + + rax *rax = raxNew(); + + /* This is our template to generate keys. The first two bytes will + * be replaced with the binary redis cluster hash slot. */ + keylen = snprintf((char *)key, sizeof(key), "__geocode:2e68e5df3624"); + char *cset = "0123456789abcdef"; + + for (unsigned long j = 0; j < count; j++) { + /* Generate a random key by altering our template key. */ + + /* With a given probability, let's use a common prefix so that there + * is a subset of keys that have a higher percentage of probability + * of being hit again and again.
*/ + size_t commonprefix = genrand64_int64() & 0xf; + if (commonprefix == 0) memcpy(key + 10, "2e68e5", 6); + + /* Alter a random char in the key. */ + int pos = 10 + genrand64_int64() % 12; + key[pos] = cset[genrand64_int64() % 16]; + + /* Compute the Redis Cluster hash slot to set the first two + * binary bytes of the key. */ + int hashslot = crc16((char *)key, keylen) & 0x3FFF; + key[0] = (hashslot >> 8) & 0xff; + key[1] = hashslot & 0xff; + + /* Insert element. */ + if ((double)genrand64_int64() / RAND_MAX < addprob) { + raxInsert(rax, key, keylen, NULL, NULL); + TEST_ASSERT(raxAllocSize(rax) == zmalloc_used_memory()); + } + + /* Remove element. */ + if ((double)genrand64_int64() / RAND_MAX < remprob) { + raxRemove(rax, key, keylen, NULL); + TEST_ASSERT(raxAllocSize(rax) == zmalloc_used_memory()); + } + } + size_t finalkeys = raxSize(rax); + raxFree(rax); + printf("ok with %zu final keys\n", finalkeys); + return 0; +} + +/* Iterator fuzz testing. Compares the items returned by the Rax iterator with + * a C implementation obtained by sorting the inserted strings in a linear + * array. */ +typedef struct arrayItem { + unsigned char *key; + size_t key_len; +} arrayItem; + +/* Utility functions used with qsort() in order to sort the array of strings + * in the same way Rax sorts keys (which is, lexicographically considering + * every byte an unsigned integer). */ +int compareAB(const unsigned char *keya, size_t lena, const unsigned char *keyb, size_t lenb) { + size_t minlen = (lena <= lenb) ? lena : lenb; + int retval = memcmp(keya, keyb, minlen); + if (lena == lenb || retval != 0) return retval; + return (lena > lenb) ? 1 : -1; +} + +int compareArrayItems(const void *aptr, const void *bptr) { + const arrayItem *a = aptr; + const arrayItem *b = bptr; + return compareAB(a->key, a->key_len, b->key, b->key_len); +} + +/* Seek an element in the array, returning the seek index (the index inside the + * array). If the seek is not possible (== operator and key not found or empty + * array) -1 is returned. */ +int arraySeek(arrayItem *array, int count, unsigned char *key, size_t len, char *op) { + if (count == 0) return -1; + if (op[0] == '^') return 0; + if (op[0] == '$') return count - 1; + + int eq = 0, lt = 0, gt = 0; + if (op[1] == '=') eq = 1; + if (op[0] == '<') lt = 1; + if (op[0] == '>') gt = 1; + + int i; + for (i = 0; i < count; i++) { + int cmp = compareAB(array[i].key, array[i].key_len, key, len); + if (eq && !cmp) return i; + if (cmp > 0 && gt) return i; + if (cmp >= 0 && lt) { + i--; + break; + } + } + if (lt && i == count) return count - 1; + if (i < 0 || i >= count) return -1; + return i; +} + +int iteratorFuzzTest(int keymode, size_t count) { + count = genrand64_int64() % count; + rax *rax = raxNew(); + arrayItem *array = zmalloc(sizeof(arrayItem) * count); + + /* Fill a radix tree and a linear array with some data. */ + unsigned char key[1024]; + size_t j = 0; + for (size_t i = 0; i < count; i++) { + uint32_t keylen = int2key((char *)key, sizeof(key), i, keymode); + void *val = (void *)(unsigned long)htHash(key, keylen); + + if (raxInsert(rax, key, keylen, val, NULL)) { + array[j].key = zmalloc(keylen); + array[j].key_len = keylen; + memcpy(array[j].key, key, keylen); + j++; + } + } + count = raxSize(rax); + + /* Sort the array. */ + qsort(array, count, sizeof(arrayItem), compareArrayItems); + + /* Perform a random seek operation. */ + uint32_t keylen = int2key((char *)key, sizeof(key), genrand64_int64() % (count ?
count : 1), keymode); + raxIterator iter; + raxStart(&iter, rax); + char *seekops[] = {"==", ">=", "<=", ">", "<", "^", "$"}; + char *seekop = seekops[genrand64_int64() % 7]; + raxSeek(&iter, seekop, key, keylen); + int seekidx = arraySeek(array, count, key, keylen, seekop); + + int next = genrand64_int64() % 2; + int iteration = 0; + while (1) { + int rax_res; + int array_res; + unsigned char *array_key = NULL; + size_t array_key_len = 0; + + array_res = (seekidx == -1) ? 0 : 1; + if (array_res) { + if (next && seekidx == (signed)count) array_res = 0; + if (!next && seekidx == -1) array_res = 0; + if (array_res != 0) { + array_key = array[seekidx].key; + array_key_len = array[seekidx].key_len; + } + } + + if (next) { + rax_res = raxNext(&iter); + if (array_res) seekidx++; + } else { + rax_res = raxPrev(&iter); + if (array_res) seekidx--; + } + + /* Both the iterators should agree about EOF. */ + if (array_res != rax_res) { + printf("Iter fuzz: iterators do not agree about EOF " + "at iteration %d: " + "array_more=%d rax_more=%d next=%d\n", + iteration, array_res, rax_res, next); + return 1; + } + if (array_res == 0) break; /* End of iteration reached. */ + + /* Check that the returned keys are the same. */ + if (iter.key_len != array_key_len || memcmp(iter.key, array_key, iter.key_len)) { + printf("Iter fuzz: returned element %d mismatch\n", iteration); + printf("SEEKOP was %s\n", seekop); + if (keymode != KEY_RANDOM) { + printf("\n"); + printf("BUG SEEKING: %s %.*s\n", seekop, keylen, key); + printf("%.*s (iter) VS %.*s (array) next=%d idx=%d " + "count=%lu keymode=%d\n", + (int)iter.key_len, (char *)iter.key, (int)array_key_len, (char *)array_key, next, seekidx, + (unsigned long)count, keymode); + if (count < 500) { + printf("\n"); + for (unsigned int j = 0; j < count; j++) { + printf("%d) '%.*s'\n", j, (int)array[j].key_len, array[j].key); + } + } + exit(1); + } + return 1; + } + iteration++; + } + + for (unsigned int i = 0; i < count; i++) zfree(array[i].key); + zfree(array); + raxStop(&iter); + raxFree(rax); + return 0; +} + +/* Test the random walk function.
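 + * raxRandomWalk() is expected to eventually visit every key inserted below; + * the test fails if 100k walk steps are not enough to report them all.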
*/ +int test_raxRandomWalk(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + + rax *t = raxNew(); + char *toadd[] = {"alligator", "alien", "byword", "chromodynamic", "romane", "romanus", "romulus", "rubens", + "ruber", "rubicon", "rubicundus", "all", "rub", "by", NULL}; + + long numele; + for (numele = 0; toadd[numele] != NULL; numele++) { + raxInsert(t, (unsigned char *)toadd[numele], strlen(toadd[numele]), (void *)numele, NULL); + TEST_ASSERT(raxAllocSize(t) == zmalloc_used_memory()); + } + + raxIterator iter; + raxStart(&iter, t); + raxSeek(&iter, "^", NULL, 0); + int maxloops = 100000; + while (raxRandomWalk(&iter, 0) && maxloops--) { + int nulls = 0; + for (long i = 0; i < numele; i++) { + if (toadd[i] == NULL) { + nulls++; + continue; + } + if (strlen(toadd[i]) == iter.key_len && memcmp(toadd[i], iter.key, iter.key_len) == 0) { + toadd[i] = NULL; + nulls++; + } + } + if (nulls == numele) break; + } + if (maxloops == 0) { + printf("randomWalkTest() is unable to report all the elements " + "after 100k iterations!\n"); + return 1; + } + raxStop(&iter); + raxFree(t); + return 0; +} + +int test_raxIteratorUnitTests(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + + rax *t = raxNew(); + char *toadd[] = {"alligator", "alien", "byword", "chromodynamic", "romane", "romanus", "romulus", "rubens", + "ruber", "rubicon", "rubicundus", "all", "rub", "by", NULL}; + + for (int x = 0; x < 10000; x++) genrand64_int64(); + + long items = 0; + while (toadd[items] != NULL) items++; + + for (long i = 0; i < items; i++) { + raxInsert(t, (unsigned char *)toadd[i], strlen(toadd[i]), (void *)i, NULL); + TEST_ASSERT(raxAllocSize(t) == zmalloc_used_memory()); + } + + raxIterator iter; + raxStart(&iter, t); + + struct { + char *seek; + size_t seeklen; + char *seekop; + char *expected; + } tests[] = {/* Seek value. */ /* Expected result. */ + {"rpxxx", 5, "<=", "romulus"}, + {"rom", 3, ">=", "romane"}, + {"rub", 3, ">=", "rub"}, + {"rub", 3, ">", "rubens"}, + {"rub", 3, "<", "romulus"}, + {"rom", 3, ">", "romane"}, + {"chro", 4, ">", "chromodynamic"}, + {"chro", 4, "<", "byword"}, + {"chromz", 6, "<", "chromodynamic"}, + {"", 0, "^", "alien"}, + {"zorro", 5, "<=", "rubicundus"}, + {"zorro", 5, "<", "rubicundus"}, + {"zorro", 5, "<", "rubicundus"}, + {"", 0, "$", "rubicundus"}, + {"ro", 2, ">=", "romane"}, + {"zo", 2, ">", NULL}, + {"zo", 2, "==", NULL}, + {"romane", 6, "==", "romane"}}; + + for (int i = 0; tests[i].expected != NULL; i++) { + raxSeek(&iter, tests[i].seekop, (unsigned char *)tests[i].seek, tests[i].seeklen); + int retval = raxNext(&iter); + + if (tests[i].expected != NULL) { + if (strlen(tests[i].expected) != iter.key_len || memcmp(tests[i].expected, iter.key, iter.key_len) != 0) { + printf("Iterator unit test error: " + "test %d, %s expected, %.*s reported\n", + i, tests[i].expected, (int)iter.key_len, (char *)iter.key); + return 1; + } + } else { + if (retval != 0) { + printf("Iterator unit test error: " + "EOF expected in test %d\n", + i); + return 1; + } + } + } + raxStop(&iter); + raxFree(t); + return 0; +} + +/* Test that raxInsert() / raxTryInsert() overwrite semantic + * works as expected. 
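 + * raxTryInsert() must not overwrite an existing key (it returns the old + * value through its last argument), while a plain raxInsert() must.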
*/ +int test_raxTryInsertUnitTests(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + + rax *t = raxNew(); + raxInsert(t, (unsigned char *)"FOO", 3, (void *)(long)1, NULL); + void *old, *val; + raxTryInsert(t, (unsigned char *)"FOO", 3, (void *)(long)2, &old); + if (old != (void *)(long)1) { + printf("Old value not returned correctly by raxTryInsert(): %p", old); + return 1; + } + + val = NULL; + raxFind(t, (unsigned char *)"FOO", 3, &val); + if (val != (void *)(long)1) { + printf("FOO value mismatch: is %p instead of 1", val); + return 1; + } + + raxInsert(t, (unsigned char *)"FOO", 3, (void *)(long)2, NULL); + val = NULL; + raxFind(t, (unsigned char *)"FOO", 3, &val); + if (val != (void *)(long)2) { + printf("FOO value mismatch: is %p instead of 2", val); + return 1; + } + + raxFree(t); + return 0; +} + +/* Regression test #1: Iterator wrong element returned after seek. */ +int test_raxRegressionTest1(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + + rax *rax = raxNew(); + raxInsert(rax, (unsigned char *)"LKE", 3, (void *)(long)1, NULL); + raxInsert(rax, (unsigned char *)"TQ", 2, (void *)(long)2, NULL); + raxInsert(rax, (unsigned char *)"B", 1, (void *)(long)3, NULL); + raxInsert(rax, (unsigned char *)"FY", 2, (void *)(long)4, NULL); + raxInsert(rax, (unsigned char *)"WI", 2, (void *)(long)5, NULL); + + raxIterator iter; + raxStart(&iter, rax); + raxSeek(&iter, ">", (unsigned char *)"FMP", 3); + if (raxNext(&iter)) { + if (iter.key_len != 2 || memcmp(iter.key, "FY", 2)) { + printf("Regression test 1 failed: 'FY' expected, got: '%.*s'\n", (int)iter.key_len, (char *)iter.key); + return 1; + } + } + + raxStop(&iter); + raxFree(rax); + return 0; +} + +/* Regression test #2: Crash when mixing NULL and not NULL values. */ +int test_raxRegressionTest2(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + + rax *rt = raxNew(); + raxInsert(rt, (unsigned char *)"a", 1, (void *)100, NULL); + raxInsert(rt, (unsigned char *)"ab", 2, (void *)101, NULL); + raxInsert(rt, (unsigned char *)"abc", 3, (void *)NULL, NULL); + raxInsert(rt, (unsigned char *)"abcd", 4, (void *)NULL, NULL); + raxInsert(rt, (unsigned char *)"abc", 3, (void *)102, NULL); + raxFree(rt); + return 0; +} + +/* Regression test #3: Wrong access at node value in raxRemoveChild() + * when iskey == 1 and isnull == 1: the memmove() was performed including + * the value length regardless of the fact there was no actual value. + * + * Note that this test always returns success but will trigger a + * Valgrind error. */ +int test_raxRegressionTest3(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + + rax *rt = raxNew(); + raxInsert(rt, (unsigned char *)"D", 1, (void *)1, NULL); + raxInsert(rt, (unsigned char *)"", 0, NULL, NULL); + raxRemove(rt, (unsigned char *)"D", 1, NULL); + raxFree(rt); + return 0; +} + +/* Regression test #4: Github issue #8, iterator does not populate the + * data field after seek in case of exact match. The test case looks odd + * because it is quite indirect: Seeking "^" will result in seeking + * the element >= "", and since we just added "" an exact match happens, + * however we are using the original one from the bug report, since this + * is quite odd and may later protect against different bugs related to + * storing and fetching the empty string key.
*/ +int test_raxRegressionTest4(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + + rax *rt = raxNew(); + raxIterator iter; + raxInsert(rt, (unsigned char *)"", 0, (void *)-1, NULL); + void *val = NULL; + raxFind(rt, (unsigned char *)"", 0, &val); + if (val != (void *)-1) { + printf("Regression test 4 failed. Key value mismatch in raxFind()\n"); + return 1; + } + raxStart(&iter, rt); + raxSeek(&iter, "^", NULL, 0); + raxNext(&iter); + if (iter.data != (void *)-1) { + printf("Regression test 4 failed. Key value mismatch in raxNext()\n"); + return 1; + } + raxStop(&iter); + raxFree(rt); + return 0; +} + +/* Less than seek bug when stopping in the middle of a compressed node. */ +int test_raxRegressionTest5(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + + rax *rax = raxNew(); + + raxInsert(rax, (unsigned char *)"b", 1, (void *)(long)1, NULL); + raxInsert(rax, (unsigned char *)"by", 2, (void *)(long)2, NULL); + raxInsert(rax, (unsigned char *)"byword", 6, (void *)(long)3, NULL); + + raxInsert(rax, (unsigned char *)"f", 1, (void *)(long)4, NULL); + raxInsert(rax, (unsigned char *)"foobar", 6, (void *)(long)5, NULL); + raxInsert(rax, (unsigned char *)"foobar123", 9, (void *)(long)6, NULL); + + raxIterator ri; + raxStart(&ri, rax); + + raxSeek(&ri, "<", (unsigned char *)"foo", 3); + raxNext(&ri); + if (ri.key_len != 1 || ri.key[0] != 'f') { + printf("Regression test 4 failed. Key value mismatch in raxNext()\n"); + return 1; + } + + raxStop(&ri); + raxFree(rax); + return 0; +} + +/* Seek may not populate iterator data. See issue #25. */ +int test_raxRegressionTest6(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + + rax *rax = raxNew(); + + char *key1 = "172.17.141.2/adminguide/v5.0/"; + char *key2 = "172.17.141.2/adminguide/v5.0/entitlements-configure.html"; + char *seekpoint = "172.17.141.2/adminguide/v5.0/entitlements"; + + raxInsert(rax, (unsigned char *)key1, strlen(key1), (void *)(long)1234, NULL); + raxInsert(rax, (unsigned char *)key2, strlen(key2), (void *)(long)5678, NULL); + + raxIterator ri; + raxStart(&ri, rax); + raxSeek(&ri, "<=", (unsigned char *)seekpoint, strlen(seekpoint)); + raxPrev(&ri); + if ((long)ri.data != 1234) { + printf("Regression test 6 failed. Key data not populated.\n"); + return 1; + } + + raxStop(&ri); + raxFree(rax); + return 0; +} + +int test_raxBenchmark(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + + if (!(flags & UNIT_TEST_SINGLE)) return 0; + + for (int mode = 0; mode < 2; mode++) { + printf("Benchmark with %s keys:\n", (mode == 0) ? 
"integer" : "alphanumerical"); + rax *t = raxNew(); + long long start = _ustime(); + for (int i = 0; i < 5000000; i++) { + char buf[64]; + int len = int2key(buf, sizeof(buf), i, mode); + raxInsert(t, (unsigned char *)buf, len, (void *)(long)i, NULL); + TEST_ASSERT(raxAllocSize(t) == zmalloc_used_memory()); + } + printf("Insert: %f\n", (double)(_ustime() - start) / 1000000); + printf("%llu total nodes\n", (unsigned long long)t->numnodes); + printf("%llu total elements\n", (unsigned long long)t->numele); + + start = _ustime(); + for (int i = 0; i < 5000000; i++) { + char buf[64]; + int len = int2key(buf, sizeof(buf), i, mode); + void *data; + if (!raxFind(t, (unsigned char *)buf, len, &data) || data != (void *)(long)i) { + printf("Issue with %s: %p instead of %p\n", buf, data, (void *)(long)i); + } + } + printf("Linear lookup: %f\n", (double)(_ustime() - start) / 1000000); + + start = _ustime(); + for (int i = 0; i < 5000000; i++) { + char buf[64]; + int r = genrand64_int64() % 5000000; + int len = int2key(buf, sizeof(buf), r, mode); + void *data; + if (!raxFind(t, (unsigned char *)buf, len, &data) || data != (void *)(long)r) { + printf("Issue with %s: %p instead of %p\n", buf, data, (void *)(long)r); + } + } + printf("Random lookup: %f\n", (double)(_ustime() - start) / 1000000); + + start = _ustime(); + for (int i = 0; i < 5000000; i++) { + char buf[64]; + int len = int2key(buf, sizeof(buf), i, mode); + buf[i % len] = '!'; /* "!" is never set into keys. */ + TEST_ASSERT_MESSAGE("Lookup should have failed", !raxFind(t, (unsigned char *)buf, len, NULL)); + } + printf("Failed lookup: %f\n", (double)(_ustime() - start) / 1000000); + + start = _ustime(); + raxIterator ri; + raxStart(&ri, t); + raxSeek(&ri, "^", NULL, 0); + int iter = 0; + while (raxNext(&ri)) iter++; + TEST_ASSERT_MESSAGE("Iteration is incomplete", iter == 5000000); + raxStop(&ri); + printf("Full iteration: %f\n", (double)(_ustime() - start) / 1000000); + + start = _ustime(); + for (int i = 0; i < 5000000; i++) { + char buf[64]; + int len = int2key(buf, sizeof(buf), i, mode); + int retval = raxRemove(t, (unsigned char *)buf, len, NULL); + TEST_ASSERT(retval == 1); + TEST_ASSERT(raxAllocSize(t) == zmalloc_used_memory()); + } + printf("Deletion: %f\n", (double)(_ustime() - start) / 1000000); + + printf("%llu total nodes\n", (unsigned long long)t->numnodes); + printf("%llu total elements\n", (unsigned long long)t->numele); + raxFree(t); + } + + return 0; +} + +/* Compressed nodes can only hold (2^29)-1 characters, so it is important + * to test for keys bigger than this amount, in order to make sure that + * the code to handle this edge case works as expected. + * + * This test is disabled by default because it uses a lot of memory. 
*/ +int test_raxHugeKey(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + + if (!(flags & UNIT_TEST_LARGE_MEMORY)) return 0; + + size_t max_keylen = ((1 << 29) - 1) + 100; + unsigned char *key = zmalloc(max_keylen); + if (key == NULL) goto oom; + + memset(key, 'a', max_keylen); + key[10] = 'X'; + key[max_keylen - 1] = 'Y'; + rax *rax = raxNew(); + int retval = raxInsert(rax, (unsigned char *)"aaabbb", 6, (void *)5678L, NULL); + if (retval == 0 && errno == ENOMEM) goto oom; + retval = raxInsert(rax, key, max_keylen, (void *)1234L, NULL); + if (retval == 0 && errno == ENOMEM) goto oom; + void *value1, *value2; + int found1 = raxFind(rax, (unsigned char *)"aaabbb", 6, &value1); + int found2 = raxFind(rax, key, max_keylen, &value2); + zfree(key); + if (!found1 || !found2) { + printf("Huge key test failed on elementhood\n"); + return 1; + } + if (value1 != (void *)5678L || value2 != (void *)1234L) { + printf("Huge key test failed\n"); + return 1; + } + raxFree(rax); + return 0; + +oom: + fprintf(stderr, "Sorry, not enough memory to execute --hugekey test."); + exit(1); +} + +int test_raxFuzz(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + + if (!(flags & UNIT_TEST_ACCURATE)) return 0; + + int errors = 0; + + init_genrand64(1234); + + for (int i = 0; i < 10; i++) { + double alpha = (double)genrand64_int64() / RAND_MAX; + double beta = 1 - alpha; + if (fuzzTestCluster(genrand64_int64() % 100000000, alpha, beta)) errors++; + } + + for (int i = 0; i < 10; i++) { + double alpha = (double)genrand64_int64() / RAND_MAX; + double beta = 1 - alpha; + if (fuzzTest(KEY_INT, genrand64_int64() % 10000, alpha, beta)) errors++; + if (fuzzTest(KEY_UNIQUE_ALPHA, genrand64_int64() % 10000, alpha, beta)) errors++; + if (fuzzTest(KEY_RANDOM, genrand64_int64() % 10000, alpha, beta)) errors++; + if (fuzzTest(KEY_RANDOM_ALPHA, genrand64_int64() % 10000, alpha, beta)) errors++; + if (fuzzTest(KEY_RANDOM_SMALL_CSET, genrand64_int64() % 10000, alpha, beta)) errors++; + } + + size_t numops = 100000, cycles = 3; + while (cycles--) { + if (fuzzTest(KEY_INT, numops, .7, .3)) errors++; + if (fuzzTest(KEY_UNIQUE_ALPHA, numops, .7, .3)) errors++; + if (fuzzTest(KEY_RANDOM, numops, .7, .3)) errors++; + if (fuzzTest(KEY_RANDOM_ALPHA, numops, .7, .3)) errors++; + if (fuzzTest(KEY_RANDOM_SMALL_CSET, numops, .7, .3)) errors++; + numops *= 10; + } + + if (fuzzTest(KEY_CHAIN, 1000, .7, .3)) errors++; + printf("Iterator fuzz test: "); + fflush(stdout); + for (int i = 0; i < 100000; i++) { + if (iteratorFuzzTest(KEY_INT, 100)) errors++; + if (iteratorFuzzTest(KEY_UNIQUE_ALPHA, 100)) errors++; + if (iteratorFuzzTest(KEY_RANDOM_ALPHA, 1000)) errors++; + if (iteratorFuzzTest(KEY_RANDOM, 1000)) errors++; + if (i && !(i % 100)) { + printf("."); + if (!(i % 1000)) { + printf("%d%% done", i / 1000); + } + fflush(stdout); + } + } + printf("\n"); + + if (errors) { + printf("!!! WARNING !!!: %d errors found\n", errors); + } else { + printf("OK! \\o/\n"); + } + return !!errors; +} From 3a748a3ff206d8042572e3fcdb63bb0e0a224163 Mon Sep 17 00:00:00 2001 From: Melroy van den Berg Date: Wed, 2 Oct 2024 19:43:34 +0200 Subject: [PATCH 10/51] Avoid .c, .d and .o files from being copied to the binary tar.gz releases (#1106) As discussed here: https://github.com/orgs/valkey-io/discussions/1103#discussioncomment-10814006 `cp` can't be used anymore, `rsync` is more powerful and allow to exclude files. Alternatively: 1. Remove the c, d and o files. Which isn't ideal either. 2. Improve the build. Eg. 
by building inside a `build` directory instead of in the src folder. P.S. I know these workflows aren't triggered by this PR; they only run via the "Build Release Packages" workflow action: https://github.com/valkey-io/valkey/actions/workflows/build-release-packages.yml. So I can't fully test them in this PR, but it should work ^^ P.P.S. I did test the `rsync -av --exclude='*.c' --exclude='*.d' --exclude='*.o' src/valkey-*` command in isolation and it works as expected! --------- Signed-off-by: Melroy van den Berg --- .github/workflows/call-build-linux-arm-packages.yml | 6 +++--- .github/workflows/call-build-linux-x86-packages.yml | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/call-build-linux-arm-packages.yml b/.github/workflows/call-build-linux-arm-packages.yml
index 80416806c9..8862f19736 100644
--- a/.github/workflows/call-build-linux-arm-packages.yml
+++ b/.github/workflows/call-build-linux-arm-packages.yml
@@ -57,9 +57,9 @@ jobs: - name: Create Tarball and SHA256sums run: | TAR_FILE_NAME=valkey-${{inputs.version}}-${{matrix.distro.platform}}-${{ matrix.distro.arch}} - mkdir -p $TAR_FILE_NAME/bin $TAR_FILE_NAME/share - cp -rfv src/valkey-* $TAR_FILE_NAME/bin - cp -v /home/runner/work/valkey/valkey/COPYING $TAR_FILE_NAME/share/LICENSE + mkdir -p "$TAR_FILE_NAME/bin" "$TAR_FILE_NAME/share" + rsync -av --exclude='*.c' --exclude='*.d' --exclude='*.o' src/valkey-* "$TAR_FILE_NAME/bin/" + cp -v /home/runner/work/valkey/valkey/COPYING "$TAR_FILE_NAME/share/LICENSE" tar -czvf $TAR_FILE_NAME.tar.gz $TAR_FILE_NAME sha256sum $TAR_FILE_NAME.tar.gz > $TAR_FILE_NAME.tar.gz.sha256 mkdir -p packages-files

diff --git a/.github/workflows/call-build-linux-x86-packages.yml b/.github/workflows/call-build-linux-x86-packages.yml
index b037f9c507..42b9146ea4 100644
--- a/.github/workflows/call-build-linux-x86-packages.yml
+++ b/.github/workflows/call-build-linux-x86-packages.yml
@@ -55,9 +55,9 @@ jobs: - name: Create Tarball and SHA256sums run: | TAR_FILE_NAME=valkey-${{inputs.version}}-${{matrix.distro.platform}}-${{ matrix.distro.arch}} - mkdir -p $TAR_FILE_NAME/bin $TAR_FILE_NAME/share - cp -rfv src/valkey-* $TAR_FILE_NAME/bin - cp -v /home/runner/work/valkey/valkey/COPYING $TAR_FILE_NAME/share/LICENSE + mkdir -p "$TAR_FILE_NAME/bin" "$TAR_FILE_NAME/share" + rsync -av --exclude='*.c' --exclude='*.d' --exclude='*.o' src/valkey-* "$TAR_FILE_NAME/bin/" + cp -v /home/runner/work/valkey/valkey/COPYING "$TAR_FILE_NAME/share/LICENSE" tar -czvf $TAR_FILE_NAME.tar.gz $TAR_FILE_NAME sha256sum $TAR_FILE_NAME.tar.gz > $TAR_FILE_NAME.tar.gz.sha256 mkdir -p packages-files

From a120069e1c700e958ba47e7fe439610c25cedfaa Mon Sep 17 00:00:00 2001 From: Melroy van den Berg Date: Wed, 2 Oct 2024 19:48:54 +0200 Subject: [PATCH 11/51] Build binary releases with systemd support (#1107)

- Add systemd support to the build artifact tarballs, so people can use them under systemd-compatible distros. As discussed here: https://github.com/orgs/valkey-io/discussions/1103#discussioncomment-10815549. This adds `libsystemd-dev` to the installed packages and `USE_SYSTEMD=yes` to the build flags.
- Clean up & bring the arm & x86 workflow files in sync.
It was a bit of a mess ;) (removing `jq wget awscli` from the 'Tarball' step) Signed-off-by: Melroy van den Berg --- .github/workflows/call-build-linux-arm-packages.yml | 6 +++--- .github/workflows/call-build-linux-x86-packages.yml | 10 +++++++--- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/.github/workflows/call-build-linux-arm-packages.yml b/.github/workflows/call-build-linux-arm-packages.yml index 8862f19736..2a7bcc533f 100644 --- a/.github/workflows/call-build-linux-arm-packages.yml +++ b/.github/workflows/call-build-linux-arm-packages.yml @@ -36,7 +36,7 @@ jobs: build-valkey: # Capture source tarball and generate checksum for it name: Build package ${{ matrix.distro.target }} ${{ matrix.distro.arch }} - runs-on: 'ubuntu-latest' + runs-on: "ubuntu-latest" strategy: fail-fast: false matrix: ${{ fromJSON(inputs.build_matrix) }} @@ -51,8 +51,8 @@ jobs: with: arch: aarch64 distro: ${{matrix.distro.target}} - install: apt-get update && apt-get install -y build-essential libssl-dev - run: make -C src all BUILD_TLS=yes + install: apt-get update && apt-get install -y build-essential libssl-dev libsystemd-dev + run: make -C src all BUILD_TLS=yes USE_SYSTEMD=yes - name: Create Tarball and SHA256sums run: | diff --git a/.github/workflows/call-build-linux-x86-packages.yml b/.github/workflows/call-build-linux-x86-packages.yml index 42b9146ea4..9e438fa61a 100644 --- a/.github/workflows/call-build-linux-x86-packages.yml +++ b/.github/workflows/call-build-linux-x86-packages.yml @@ -36,7 +36,7 @@ jobs: build-valkey: # Capture source tarball and generate checksum for it name: Build package ${{ matrix.distro.target }} ${{ matrix.distro.arch }} - runs-on: 'ubuntu-latest' + runs-on: "ubuntu-latest" strategy: fail-fast: false matrix: ${{ fromJSON(inputs.build_matrix) }} @@ -47,10 +47,10 @@ jobs: ref: ${{ inputs.version }} - name: Install dependencies - run: sudo apt-get update && sudo apt-get install -y build-essential libssl-dev jq wget awscli + run: sudo apt-get update && sudo apt-get install -y build-essential libssl-dev libsystemd-dev - name: Make Valkey - run: make -C src all BUILD_TLS=yes + run: make -C src all BUILD_TLS=yes USE_SYSTEMD=yes - name: Create Tarball and SHA256sums run: | @@ -63,6 +63,10 @@ jobs: mkdir -p packages-files cp -rfv $TAR_FILE_NAME.tar* packages-files/ + - name: Install AWS cli. + run: | + sudo apt-get install -y awscli + - name: Configure AWS credentials run: | aws configure set region us-west-2 From 245340baa88aa419ba542660c3c0e7e7c751a8bf Mon Sep 17 00:00:00 2001 From: Madelyn Olson Date: Wed, 2 Oct 2024 16:22:09 -0700 Subject: [PATCH 12/51] Apply CVE patches for CVE-2024-31449, CVE-2024-31227, CVE-2024-31228 (#1115) Applying the CVEs against mainline. (CVE-2024-31449) Lua library commands may lead to stack overflow and potential RCE. (CVE-2024-31227) Potential Denial-of-service due to malformed ACL selectors. (CVE-2024-31228) Potential Denial-of-service due to unbounded pattern matching. 
Signed-off-by: Madelyn Olson --- deps/lua/src/lua_bit.c | 1 + src/acl.c | 2 +- src/util.c | 11 ++++++++--- tests/unit/acl-v2.tcl | 5 +++++ tests/unit/keyspace.tcl | 6 ++++++ tests/unit/scripting.tcl | 6 ++++++ 6 files changed, 27 insertions(+), 4 deletions(-) diff --git a/deps/lua/src/lua_bit.c b/deps/lua/src/lua_bit.c index 9f83b8594b..7e43faea47 100644 --- a/deps/lua/src/lua_bit.c +++ b/deps/lua/src/lua_bit.c @@ -132,6 +132,7 @@ static int bit_tohex(lua_State *L) const char *hexdigits = "0123456789abcdef"; char buf[8]; int i; + if (n == INT32_MIN) n = INT32_MIN+1; if (n < 0) { n = -n; hexdigits = "0123456789ABCDEF"; } if (n > 8) n = 8; for (i = (int)n; --i >= 0; ) { buf[i] = hexdigits[b & 15]; b >>= 4; } diff --git a/src/acl.c b/src/acl.c index 17632df0d4..14d9c6e6f3 100644 --- a/src/acl.c +++ b/src/acl.c @@ -1079,7 +1079,7 @@ int ACLSetSelector(aclSelector *selector, const char *op, size_t oplen) { flags |= ACL_READ_PERMISSION; } else if (toupper(op[offset]) == 'W' && !(flags & ACL_WRITE_PERMISSION)) { flags |= ACL_WRITE_PERMISSION; - } else if (op[offset] == '~') { + } else if (op[offset] == '~' && flags) { offset++; break; } else { diff --git a/src/util.c b/src/util.c index 66f62c9001..b1235c2822 100644 --- a/src/util.c +++ b/src/util.c @@ -59,7 +59,11 @@ static int stringmatchlen_impl(const char *pattern, const char *string, int stringLen, int nocase, - int *skipLongerMatches) { + int *skipLongerMatches, + int nesting) { + /* Protection against abusive patterns. */ + if (nesting > 1000) return 0; + while (patternLen && stringLen) { switch (pattern[0]) { case '*': @@ -69,7 +73,8 @@ static int stringmatchlen_impl(const char *pattern, } if (patternLen == 1) return 1; /* match */ while (stringLen) { - if (stringmatchlen_impl(pattern + 1, patternLen - 1, string, stringLen, nocase, skipLongerMatches)) + if (stringmatchlen_impl(pattern + 1, patternLen - 1, string, stringLen, nocase, skipLongerMatches, + nesting + 1)) return 1; /* match */ if (*skipLongerMatches) return 0; /* no match */ string++; @@ -179,7 +184,7 @@ static int stringmatchlen_impl(const char *pattern, int stringmatchlen(const char *pattern, int patternLen, const char *string, int stringLen, int nocase) { int skipLongerMatches = 0; - return stringmatchlen_impl(pattern, patternLen, string, stringLen, nocase, &skipLongerMatches); + return stringmatchlen_impl(pattern, patternLen, string, stringLen, nocase, &skipLongerMatches, 0); } int stringmatch(const char *pattern, const char *string, int nocase) { diff --git a/tests/unit/acl-v2.tcl b/tests/unit/acl-v2.tcl index 9827c0ba02..a509bc0ff6 100644 --- a/tests/unit/acl-v2.tcl +++ b/tests/unit/acl-v2.tcl @@ -116,6 +116,11 @@ start_server {tags {"acl external:skip"}} { assert_match "*NOPERM*key*" $err } + test {Validate read and write permissions format} { + catch {r ACL SETUSER key-permission-RW %~} err + set err + } {ERR Error in ACL SETUSER modifier '%~': Syntax error} + test {Test separate read and write permissions on different selectors are not additive} { r ACL SETUSER key-permission-RW-selector on nopass "(%R~read* +@all)" "(%W~write* +@all)" $r2 auth key-permission-RW-selector password diff --git a/tests/unit/keyspace.tcl b/tests/unit/keyspace.tcl index c8bdcac984..ba55c1b8ea 100644 --- a/tests/unit/keyspace.tcl +++ b/tests/unit/keyspace.tcl @@ -547,4 +547,10 @@ foreach {type large} [array get largevalue] { assert_no_match "*db2:keys=*" [r info keyspace] r flushall } {OK} {singledb:skip} + + test {Regression for pattern matching very long nested loops} { + r flushdb + r SET 
[string repeat "a" 50000] 1 + r KEYS [string repeat "*?" 50000] + } {} } diff --git a/tests/unit/scripting.tcl b/tests/unit/scripting.tcl index 1a73b9d1ab..db37129103 100644 --- a/tests/unit/scripting.tcl +++ b/tests/unit/scripting.tcl @@ -733,6 +733,12 @@ start_server {tags {"scripting"}} { set e } {ERR *Attempt to modify a readonly table*} + test {lua bit.tohex bug} { + set res [run_script {return bit.tohex(65535, -2147483648)} 0] + r ping + set res + } {0000FFFF} + test {Test an example script DECR_IF_GT} { set decr_if_gt { local current From 6dce17218436c1ac4402c8d126494f3b41075149 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Viktor=20S=C3=B6derqvist?= Date: Thu, 3 Oct 2024 17:34:03 +0200 Subject: [PATCH 13/51] Fix undefined-santitizer warning in rax test (#1122) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix the warning introduced in #688: ``` unit/test_rax.c:168:15: runtime error: left shift of 36625 by 16 places cannot be represented in type 'int' SUMMARY: UndefinedBehaviorSanitizer: undefined-behavior unit/test_rax.c:168:15 in Fuzz test in mode 1 [7504]: ``` Signed-off-by: Viktor Söderqvist --- src/unit/test_rax.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/unit/test_rax.c b/src/unit/test_rax.c index 5f346b4115..d6c7f0bef8 100644 --- a/src/unit/test_rax.c +++ b/src/unit/test_rax.c @@ -165,7 +165,7 @@ static uint32_t int2int(uint32_t input) { r = l ^ F; l = nl; } - return (r << 16) | l; + return ((uint32_t)r << 16) | l; } /* Turn an uint32_t integer into an alphanumerical key and return its From d0fa9efa75c5bf18f7c3cdc27172d3a6aa64b134 Mon Sep 17 00:00:00 2001 From: Ricardo Dias Date: Fri, 4 Oct 2024 17:17:49 +0100 Subject: [PATCH 14/51] Fix some unitialized fields in `client` struct (#1126) This commit adds initialization code for the fields `io_last_reply_block` and `io_last_bufpos` of the `client` struct. While in the current code flow, these fields are only accessed after being written in the `trySendWriteToIOThreads`, I discovered that they were not being initialized while doing some changes to the code flow of IO threads. I believe it's good pratice to initialize all fields of a struct upon creation, and will avoid future bugs which are usually hard to debug. Signed-off-by: Ricardo Dias --- src/networking.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/networking.c b/src/networking.c index dda86e4d66..b18b798eaa 100644 --- a/src/networking.c +++ b/src/networking.c @@ -232,6 +232,8 @@ client *createClient(connection *conn) { c->net_output_bytes = 0; c->net_output_bytes_curr_cmd = 0; c->commands_processed = 0; + c->io_last_reply_block = NULL; + c->io_last_bufpos = 0; return c; } From 7df88bcaf0d7c5391402d682d5e8c8a3b53c15d3 Mon Sep 17 00:00:00 2001 From: Shivshankar Date: Fri, 4 Oct 2024 13:19:44 -0400 Subject: [PATCH 15/51] Include second solo test execution in total test count (#1071) This change counts both solo test executions to give an accurate total number of tests being run. --------- Signed-off-by: Shivshankar-Reddy --- tests/test_helper.tcl | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tests/test_helper.tcl b/tests/test_helper.tcl index a9640b4f22..7c15413806 100644 --- a/tests/test_helper.tcl +++ b/tests/test_helper.tcl @@ -91,6 +91,7 @@ set ::ignoredigest 0 set ::large_memory 0 set ::log_req_res 0 set ::force_resp3 0 +set ::solo_tests_count 0 # Set to 1 when we are running in client mode. The server test uses a # server-client model to run tests simultaneously. 
The server instance @@ -373,9 +374,10 @@ proc read_from_test_client fd { signal_idle_client $fd } elseif {$status eq {done}} { set elapsed [expr {[clock seconds]-$::clients_start_time($fd)}] - set all_tests_count [llength $::all_tests] + set all_tests_count [expr {[llength $::all_tests]+$::solo_tests_count}] set running_tests_count [expr {[llength $::active_clients]-1}] - set completed_tests_count [expr {$::next_test-$running_tests_count}] + set completed_solo_tests_count [expr {$::solo_tests_count-[llength $::run_solo_tests]}] + set completed_tests_count [expr {$::next_test-$running_tests_count+$completed_solo_tests_count}] puts "\[$completed_tests_count/$all_tests_count [colorstr yellow $status]\]: $data ($elapsed seconds)" lappend ::clients_time_history $elapsed $data signal_idle_client $fd @@ -427,6 +429,7 @@ proc read_from_test_client fd { set ::active_clients_task($fd) "(KILLED SERVER) pid:$data" } elseif {$status eq {run_solo}} { lappend ::run_solo_tests $data + incr ::solo_tests_count } else { if {!$::quiet} { puts "\[$status\]: $data" From 7eeeb45f57291b86ea0da302be7ca0a53133e0de Mon Sep 17 00:00:00 2001 From: Parth <661497+parthpatel@users.noreply.github.com> Date: Fri, 4 Oct 2024 12:58:42 -0700 Subject: [PATCH 16/51] Removing Redis from internal lua function names and comments (#1102) Improved documentation and readability of lua code as well as removed references to Redis. --------- Signed-off-by: Parth Patel <661497+parthpatel@users.noreply.github.com> --- src/eval.c | 19 +++++--- src/function_lua.c | 2 +- src/rand.h | 2 +- src/script_lua.c | 116 ++++++++++++++++++++++++++------------------- src/server.c | 2 + 5 files changed, 84 insertions(+), 57 deletions(-) diff --git a/src/eval.c b/src/eval.c index 302d515653..fd12e40ad2 100644 --- a/src/eval.c +++ b/src/eval.c @@ -136,7 +136,7 @@ void sha1hex(char *digest, char *script, size_t len) { digest[40] = '\0'; } -/* server.breakpoint() +/* Adds server.breakpoint() function used by lua debugger. * * Allows to stop execution during a debugging session from within * the Lua code implementation, like if a breakpoint was set in the code @@ -151,7 +151,7 @@ int luaServerBreakpointCommand(lua_State *lua) { return 1; } -/* server.debug() +/* Adds server.debug() function used by lua debugger * * Log a string message into the output console. * Can take multiple arguments that will be separated by commas. @@ -168,7 +168,7 @@ int luaServerDebugCommand(lua_State *lua) { return 0; } -/* server.replicate_commands() +/* Adds server.replicate_commands() * * DEPRECATED: Now do nothing and always return true. * Turn on single commands replication if the script never called @@ -208,7 +208,8 @@ void scriptingInit(int setup) { luaRegisterServerAPI(lua); - /* register debug commands */ + /* register debug commands. we only need to add it under 'server' as 'redis' is effectively aliased to 'server' + * table at this point. 
*/ lua_getglobal(lua, "server"); /* server.breakpoint */ @@ -235,7 +236,7 @@ void scriptingInit(int setup) { { char *errh_func = "local dbg = debug\n" "debug = nil\n" - "function __redis__err__handler(err)\n" + "function __server__err__handler(err)\n" " local i = dbg.getinfo(2,'nSl')\n" " if i and i.what == 'C' then\n" " i = dbg.getinfo(3,'nSl')\n" @@ -251,6 +252,9 @@ void scriptingInit(int setup) { "end\n"; luaL_loadbuffer(lua, errh_func, strlen(errh_func), "@err_handler_def"); lua_pcall(lua, 0, 0, 0); + /* Duplicate the function with __redis__err_handler name for backwards compatibility */ + lua_getglobal(lua, "__server__err__handler"); + lua_setglobal(lua, "__redis__err__handler"); } /* Create the (non connected) client that we use to execute server commands @@ -577,7 +581,7 @@ void evalGenericCommand(client *c, int evalsha) { evalCalcFunctionName(evalsha, c->argv[1]->ptr, funcname); /* Push the pcall error handler function on the stack. */ - lua_getglobal(lua, "__redis__err__handler"); + lua_getglobal(lua, "__server__err__handler"); /* Try to lookup the Lua function */ lua_getfield(lua, LUA_REGISTRYINDEX, funcname); @@ -1525,7 +1529,8 @@ void ldbServer(lua_State *lua, sds *argv, int argc) { lua_getglobal(lua, "server"); lua_pushstring(lua, "call"); lua_gettable(lua, -2); /* Stack: server, server.call */ - for (j = 1; j < argc; j++) lua_pushlstring(lua, argv[j], sdslen(argv[j])); + for (j = 1; j < argc; j++) + lua_pushlstring(lua, argv[j], sdslen(argv[j])); ldb.step = 1; /* Force server.call() to log. */ lua_pcall(lua, argc - 1, 1, 0); /* Stack: server, result */ ldb.step = 0; /* Disable logging. */ diff --git a/src/function_lua.c b/src/function_lua.c index 685485e37e..fa9983bf7e 100644 --- a/src/function_lua.c +++ b/src/function_lua.c @@ -427,7 +427,7 @@ int luaEngineInitEngine(void) { /* Register the library commands table and fields and store it to registry */ lua_newtable(lua_engine_ctx->lua); /* load library globals */ - lua_newtable(lua_engine_ctx->lua); /* load library `redis` table */ + lua_newtable(lua_engine_ctx->lua); /* load library `server` table */ lua_pushstring(lua_engine_ctx->lua, "register_function"); lua_pushcfunction(lua_engine_ctx->lua, luaRegisterFunction); diff --git a/src/rand.h b/src/rand.h index dec7465084..16b1d26f89 100644 --- a/src/rand.h +++ b/src/rand.h @@ -33,6 +33,6 @@ int32_t serverLrand48(void); void serverSrand48(int32_t seedval); -#define REDIS_LRAND48_MAX INT32_MAX +#define SERVER_LRAND48_MAX INT32_MAX #endif diff --git a/src/script_lua.c b/src/script_lua.c index ce13b71277..e378157d8c 100644 --- a/src/script_lua.c +++ b/src/script_lua.c @@ -56,10 +56,11 @@ static char *libraries_allow_list[] = { }; /* Lua API globals */ -static char *redis_api_allow_list[] = { +static char *server_api_allow_list[] = { SERVER_API_NAME, REDIS_API_NAME, - "__redis__err__handler", /* error handler for eval, currently located on globals. + "__redis__err__handler", /* Backwards compatible error handler */ + "__server__err__handler", /* error handler for eval, currently located on globals. Should move to registry. 
*/ NULL, }; @@ -117,7 +118,7 @@ static char *lua_builtins_removed_after_initialization_allow_list[] = { * after that the global table is locked so not need to check anything.*/ static char **allow_lists[] = { libraries_allow_list, - redis_api_allow_list, + server_api_allow_list, lua_builtins_allow_list, lua_builtins_not_documented_allow_list, lua_builtins_removed_after_initialization_allow_list, @@ -135,8 +136,8 @@ static char *deny_list[] = { NULL, }; -static int redis_math_random(lua_State *L); -static int redis_math_randomseed(lua_State *L); +static int server_math_random(lua_State *L); +static int server_math_randomseed(lua_State *L); static void redisProtocolToLuaType_Int(void *ctx, long long val, const char *proto, size_t proto_len); static void redisProtocolToLuaType_BulkString(void *ctx, const char *str, size_t len, const char *proto, size_t proto_len); @@ -159,7 +160,8 @@ static void redisProtocolToLuaType_VerbatimString(void *ctx, const char *proto, size_t proto_len); static void redisProtocolToLuaType_Attribute(struct ReplyParser *parser, void *ctx, size_t len, const char *proto); -static void luaReplyToRedisReply(client *c, client *script_client, lua_State *lua); + +static void luaReplyToServerReply(client *c, client *script_client, lua_State *lua); /* * Save the give pointer on Lua registry, used to save the Lua context and @@ -204,7 +206,7 @@ void *luaGetFromRegistry(lua_State *lua, const char *name) { /* Take a server reply in the RESP format and convert it into a * Lua type. Thanks to this function, and the introduction of not connected - * clients, it is trivial to implement the redis() lua function. + * clients, it is trivial to implement the server() lua function. * * Basically we take the arguments, execute the command in the context * of a non connected client, then take the generated reply and convert it @@ -610,7 +612,7 @@ int luaError(lua_State *lua) { /* Reply to client 'c' converting the top element in the Lua stack to a * server reply. As a side effect the element is consumed from the stack. */ -static void luaReplyToRedisReply(client *c, client *script_client, lua_State *lua) { +static void luaReplyToServerReply(client *c, client *script_client, lua_State *lua) { int t = lua_type(lua, -1); if (!lua_checkstack(lua, 4)) { @@ -733,9 +735,9 @@ static void luaReplyToRedisReply(client *c, client *script_client, lua_State *lu lua_pushnil(lua); /* Use nil to start iteration. */ while (lua_next(lua, -2)) { /* Stack now: table, key, value */ - lua_pushvalue(lua, -2); /* Dup key before consuming. */ - luaReplyToRedisReply(c, script_client, lua); /* Return key. */ - luaReplyToRedisReply(c, script_client, lua); /* Return value. */ + lua_pushvalue(lua, -2); /* Dup key before consuming. */ + luaReplyToServerReply(c, script_client, lua); /* Return key. */ + luaReplyToServerReply(c, script_client, lua); /* Return value. */ /* Stack now: table, key. */ maplen++; } @@ -756,9 +758,9 @@ static void luaReplyToRedisReply(client *c, client *script_client, lua_State *lu lua_pushnil(lua); /* Use nil to start iteration. */ while (lua_next(lua, -2)) { /* Stack now: table, key, true */ - lua_pop(lua, 1); /* Discard the boolean value. */ - lua_pushvalue(lua, -1); /* Dup key before consuming. */ - luaReplyToRedisReply(c, script_client, lua); /* Return key. */ + lua_pop(lua, 1); /* Discard the boolean value. */ + lua_pushvalue(lua, -1); /* Dup key before consuming. */ + luaReplyToServerReply(c, script_client, lua); /* Return key. */ /* Stack now: table, key. 
*/ setlen++; } @@ -780,7 +782,7 @@ static void luaReplyToRedisReply(client *c, client *script_client, lua_State *lu lua_pop(lua, 1); break; } - luaReplyToRedisReply(c, script_client, lua); + luaReplyToServerReply(c, script_client, lua); mbulklen++; } setDeferredArrayLen(c, replylen, mbulklen); @@ -793,7 +795,7 @@ static void luaReplyToRedisReply(client *c, client *script_client, lua_State *lu /* --------------------------------------------------------------------------- * Lua server.* functions implementations. * ------------------------------------------------------------------------- */ -void freeLuaRedisArgv(robj **argv, int argc, int argv_len); +void freeLuaServerArgv(robj **argv, int argc, int argv_len); /* Cached argv array across calls. */ static robj **lua_argv = NULL; @@ -803,7 +805,7 @@ static int lua_argv_size = 0; static robj *lua_args_cached_objects[LUA_CMD_OBJCACHE_SIZE]; static size_t lua_args_cached_objects_len[LUA_CMD_OBJCACHE_SIZE]; -static robj **luaArgsToRedisArgv(lua_State *lua, int *argc, int *argv_len) { +static robj **luaArgsToServerArgv(lua_State *lua, int *argc, int *argv_len) { int j; /* Require at least one argument */ *argc = lua_gettop(lua); @@ -864,7 +866,7 @@ static robj **luaArgsToRedisArgv(lua_State *lua, int *argc, int *argv_len) { * is not a string or an integer (lua_isstring() return true for * integers as well). */ if (j != *argc) { - freeLuaRedisArgv(lua_argv, j, lua_argv_size); + freeLuaServerArgv(lua_argv, j, lua_argv_size); luaPushError(lua, "Command arguments must be strings or integers"); return NULL; } @@ -872,7 +874,7 @@ static robj **luaArgsToRedisArgv(lua_State *lua, int *argc, int *argv_len) { return lua_argv; } -void freeLuaRedisArgv(robj **argv, int argc, int argv_len) { +void freeLuaServerArgv(robj **argv, int argc, int argv_len) { int j; for (j = 0; j < argc; j++) { robj *o = argv[j]; @@ -899,7 +901,7 @@ void freeLuaRedisArgv(robj **argv, int argc, int argv_len) { } } -static int luaRedisGenericCommand(lua_State *lua, int raise_error) { +static int luaServerGenericCommand(lua_State *lua, int raise_error) { int j; scriptRunCtx *rctx = luaGetFromRegistry(lua, REGISTRY_RUN_CTX_NAME); serverAssert(rctx); /* Only supported inside script invocation */ @@ -907,7 +909,7 @@ static int luaRedisGenericCommand(lua_State *lua, int raise_error) { client *c = rctx->c; sds reply; - c->argv = luaArgsToRedisArgv(lua, &c->argc, &c->argv_len); + c->argv = luaArgsToServerArgv(lua, &c->argc, &c->argv_len); if (c->argv == NULL) { return raise_error ? luaError(lua) : 1; } @@ -915,7 +917,7 @@ static int luaRedisGenericCommand(lua_State *lua, int raise_error) { static int inuse = 0; /* Recursive calls detection. */ /* By using Lua debug hooks it is possible to trigger a recursive call - * to luaRedisGenericCommand(), which normally should never happen. + * to luaServerGenericCommand(), which normally should never happen. * To make this function reentrant is futile and makes it slower, but * we should at least detect such a misuse, and abort. */ if (inuse) { @@ -978,7 +980,8 @@ static int luaRedisGenericCommand(lua_State *lua, int raise_error) { redisProtocolToLuaType(lua, reply); /* If the debugger is active, log the reply from the server. */ - if (ldbIsEnabled()) ldbLogRespReply(reply); + if (ldbIsEnabled()) + ldbLogRespReply(reply); if (reply != c->buf) sdsfree(reply); c->reply_bytes = 0; @@ -986,7 +989,7 @@ static int luaRedisGenericCommand(lua_State *lua, int raise_error) { cleanup: /* Clean up. 
Command code may have changed argv/argc so we use the * argv/argc of the client instead of the local variables. */ - freeLuaRedisArgv(c->argv, c->argc, c->argv_len); + freeLuaServerArgv(c->argv, c->argc, c->argv_len); c->argc = c->argv_len = 0; c->user = NULL; c->argv = NULL; @@ -1032,12 +1035,12 @@ static int luaRedisPcall(lua_State *lua) { /* server.call() */ static int luaRedisCallCommand(lua_State *lua) { - return luaRedisGenericCommand(lua, 1); + return luaServerGenericCommand(lua, 1); } /* server.pcall() */ static int luaRedisPCallCommand(lua_State *lua) { - return luaRedisGenericCommand(lua, 0); + return luaServerGenericCommand(lua, 0); } /* This adds server.sha1hex(string) to Lua scripts using the same hashing @@ -1137,7 +1140,7 @@ static int luaRedisAclCheckCmdPermissionsCommand(lua_State *lua) { int raise_error = 0; int argc, argv_len; - robj **argv = luaArgsToRedisArgv(lua, &argc, &argv_len); + robj **argv = luaArgsToServerArgv(lua, &argc, &argv_len); /* Require at least one argument */ if (argv == NULL) return luaError(lua); @@ -1156,7 +1159,7 @@ static int luaRedisAclCheckCmdPermissionsCommand(lua_State *lua) { } } - freeLuaRedisArgv(argv, argc, argv_len); + freeLuaServerArgv(argv, argc, argv_len); if (raise_error) return luaError(lua); else @@ -1432,44 +1435,61 @@ void luaRegisterLogFunction(lua_State *lua) { lua_settable(lua, -3); } +/* + * Adds server.* functions/fields to lua such as server.call etc. + * This function only handles fields common between Functions and LUA scripting. + * scriptingInit() and functionsInit() may add additional fields specific to each. + */ void luaRegisterServerAPI(lua_State *lua) { + /* In addition to registering server.call/pcall API, we will throw a custom message when a script accesses + * undefined global variable. LUA stores global variables in the global table, accessible to us on stack at virtual + * index = LUA_GLOBALSINDEX. We will set __index handler in global table's metatable to a custom C function to + * achieve this - handled by luaSetAllowListProtection. Refer to https://www.lua.org/pil/13.4.1.html for + * documentation on __index and https://www.lua.org/pil/contents.html#13 for documentation on metatables. We need to + * pass global table to lua invocations as parameters. To achieve this, lua_pushvalue invocation brings global + * variable table to the top of the stack by pushing value from global index onto the stack. And lua_pop invocation + * after luaSetAllowListProtection removes it - resetting the stack to its original state. */ lua_pushvalue(lua, LUA_GLOBALSINDEX); luaSetAllowListProtection(lua); lua_pop(lua, 1); + /* Add default C functions provided in deps/lua codebase to handle basic data types such as table, string etc. */ luaLoadLibraries(lua); + /* Before Redis OSS 7, Lua used to return error messages as strings from pcall function. With Valkey (or Redis OSS 7), Lua now returns + * error messages as tables. To keep backwards compatibility, we wrap the Lua pcall function with our own + * implementation of C function that converts table to string. */ lua_pushcfunction(lua, luaRedisPcall); lua_setglobal(lua, "pcall"); - /* Register the commands table and fields */ + /* Create a top level table object on the stack to temporarily hold fields for 'server' table. We will name it as + * 'server' and send it to LUA at the very end. Also add 'call' and 'pcall' functions to the table. 
*/ lua_newtable(lua); - - /* server.call */ lua_pushstring(lua, "call"); lua_pushcfunction(lua, luaRedisCallCommand); lua_settable(lua, -3); - - /* server.pcall */ lua_pushstring(lua, "pcall"); lua_pushcfunction(lua, luaRedisPCallCommand); lua_settable(lua, -3); + /* Add server.log function and debug level constants. LUA scripts use it to print messages to server log. */ luaRegisterLogFunction(lua); + /* Add SERVER_VERSION_NUM, SERVER_VERSION and SERVER_NAME fields with appropriate values. */ luaRegisterVersion(lua); - /* server.setresp */ + /* Add server.setresp function to allow LUA scripts to change the RESP version for server.call and server.pcall + * invocations. */ lua_pushstring(lua, "setresp"); lua_pushcfunction(lua, luaSetResp); lua_settable(lua, -3); - /* server.sha1hex */ + /* Add server.sha1hex function. */ lua_pushstring(lua, "sha1hex"); lua_pushcfunction(lua, luaRedisSha1hexCommand); lua_settable(lua, -3); - /* server.error_reply and server.status_reply */ + /* Add server.error_reply and server.status_reply functions. */ lua_pushstring(lua, "error_reply"); lua_pushcfunction(lua, luaRedisErrorReplyCommand); lua_settable(lua, -3); @@ -1477,7 +1497,7 @@ void luaRegisterServerAPI(lua_State *lua) { lua_pushcfunction(lua, luaRedisStatusReplyCommand); lua_settable(lua, -3); - /* server.set_repl and associated flags. */ + /* Add server.set_repl function and associated flags. */ lua_pushstring(lua, "set_repl"); lua_pushcfunction(lua, luaRedisSetReplCommand); lua_settable(lua, -3); @@ -1502,7 +1522,7 @@ void luaRegisterServerAPI(lua_State *lua) { lua_pushnumber(lua, PROPAGATE_AOF | PROPAGATE_REPL); lua_settable(lua, -3); - /* server.acl_check_cmd */ + /* Add server.acl_check_cmd function. */ lua_pushstring(lua, "acl_check_cmd"); lua_pushcfunction(lua, luaRedisAclCheckCmdPermissionsCommand); lua_settable(lua, -3); @@ -1515,17 +1535,16 @@ void luaRegisterServerAPI(lua_State *lua) { * This is not a deep copy but is enough for our purpose here. */ lua_getglobal(lua, SERVER_API_NAME); lua_setglobal(lua, REDIS_API_NAME); - /* Replace math.random and math.randomseed with our implementations. */ - lua_getglobal(lua, "math"); + /* Replace math.random and math.randomseed with custom implementations. */ + lua_getglobal(lua, "math"); lua_pushstring(lua, "random"); - lua_pushcfunction(lua, redis_math_random); + lua_pushcfunction(lua, server_math_random); lua_settable(lua, -3); - lua_pushstring(lua, "randomseed"); - lua_pushcfunction(lua, redis_math_randomseed); + lua_pushcfunction(lua, server_math_randomseed); lua_settable(lua, -3); - + /* overwrite value of global variable 'math' with this modified math table present on top of stack. */ lua_setglobal(lua, "math"); } @@ -1551,10 +1570,11 @@ static void luaCreateArray(lua_State *lua, robj **elev, int elec) { /* The following implementation is the one shipped with Lua itself but with * rand() replaced by serverLrand48(). */ -static int redis_math_random(lua_State *L) { +static int server_math_random(lua_State *L) { /* the `%' avoids the (rare) case of r==1, and is needed also because on some systems (SunOS!) 
`rand()' may return a value larger than RAND_MAX */ - lua_Number r = (lua_Number)(serverLrand48() % REDIS_LRAND48_MAX) / (lua_Number)REDIS_LRAND48_MAX; + lua_Number r = (lua_Number)(serverLrand48() % SERVER_LRAND48_MAX) / + (lua_Number)SERVER_LRAND48_MAX; switch (lua_gettop(L)) { /* check number of arguments */ case 0: { /* no arguments */ lua_pushnumber(L, r); /* Number between 0 and 1 */ @@ -1578,7 +1598,7 @@ return 1; } -static int redis_math_randomseed(lua_State *L) { +static int server_math_randomseed(lua_State *L) { serverSrand48(luaL_checkint(L, 1)); return 0; } @@ -1741,7 +1761,7 @@ void luaCallFunction(scriptRunCtx *run_ctx, } else { /* On success convert the Lua return value into RESP, and * send it to * the client. */ - luaReplyToRedisReply(c, run_ctx->c, lua); /* Convert and consume the reply. */ + luaReplyToServerReply(c, run_ctx->c, lua); /* Convert and consume the reply. */ } /* Perform some cleanup that we need to do both on error and success. */

diff --git a/src/server.c b/src/server.c
index 866023a455..56a49c85a8 100644
--- a/src/server.c
+++ b/src/server.c
@@ -2756,7 +2756,9 @@ void initServer(void) { server.maxmemory_policy = MAXMEMORY_NO_EVICTION; } + /* Initialize the LUA scripting engine. */ scriptingInit(1); + /* Initialize the functions engine based off of LUA initialization. */ if (functionsInit() == C_ERR) { serverPanic("Functions initialization failed, check the server logs."); exit(1);

From 093cf1009a79cdd2dd705839357652aa5401d226 Mon Sep 17 00:00:00 2001 From: Shivshankar Date: Fri, 4 Oct 2024 16:30:59 -0400 Subject: [PATCH 17/51] Correct the typo in valkey.conf file (#1118)

Correct the typo in the valkey.conf file.

Signed-off-by: Shivshankar-Reddy --- valkey.conf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/valkey.conf b/valkey.conf
index f9d102a95d..56110a52bd 100644
--- a/valkey.conf
+++ b/valkey.conf
@@ -1394,7 +1394,7 @@ oom-score-adj-values 0 200 800 #################### KERNEL transparent hugepage CONTROL ###################### # Usually the kernel Transparent Huge Pages control is set to "madvise" or -# or "never" by default (/sys/kernel/mm/transparent_hugepage/enabled), in which +# "never" by default (/sys/kernel/mm/transparent_hugepage/enabled), in which # case this config has no effect. On systems in which it is set to "always", # the server will attempt to disable it specifically for the server process in order # to avoid latency problems specifically with fork(2) and CoW.

From 993e0a60f183961e949cfbc94dbcd1568e5c1783 Mon Sep 17 00:00:00 2001 From: zhenwei pi Date: Sat, 5 Oct 2024 05:22:48 +0800 Subject: [PATCH 18/51] RDMA: use protected mode for test (#1124)

Since a7cbca40661 ("RDMA: Support .is_local method (#1089)"), valkey-server supports auto-detecting local connections, so we can now use protected mode with a local RDMA device for tests.
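For background, the locality detection that makes this possible can be sketched in a few lines: a transport with no loopback device can classify a connection as local when its own address equals the peer's address. The sketch below is only an illustration of that idea, with invented names, not the actual RDMA connection-type code:

```
#include <stdbool.h>
#include <stdio.h>
#include <string.h>

/* Hypothetical address pair for a connection; 46 bytes is enough
 * for a textual IPv6 address. */
struct conn_addrs {
    char local_ip[46];
    char peer_ip[46];
};

/* With no loopback device to check against, a connection whose
 * source and destination addresses match is treated as local. */
static bool conn_is_local(const struct conn_addrs *a) {
    return strcmp(a->local_ip, a->peer_ip) == 0;
}

int main(void) {
    struct conn_addrs a = {"192.168.1.10", "192.168.1.10"};
    printf("local: %d\n", conn_is_local(&a)); /* prints "local: 1" */
    return 0;
}
```

Because such connections are now recognized as local, the test below can keep protected mode enabled and still reach the server over RDMA from the same host.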
Signed-off-by: zhenwei pi --- tests/rdma/run.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/rdma/run.py b/tests/rdma/run.py index 61cae6f41f..0724c27adc 100755 --- a/tests/rdma/run.py +++ b/tests/rdma/run.py @@ -61,7 +61,7 @@ def test_rdma(ipaddr): # step 2, start server svrpath = valkeydir + "/src/valkey-server" rdmapath = valkeydir + "/src/valkey-rdma.so" - svrcmd = [svrpath, "--port", "0", "--loglevel", "verbose", "--protected-mode", "no", + svrcmd = [svrpath, "--port", "0", "--loglevel", "verbose", "--protected-mode", "yes", "--appendonly", "no", "--daemonize", "no", "--dir", valkeydir + "/tests/rdma/tmp", "--loadmodule", rdmapath, "port=6379", "bind=" + ipaddr] From 024f7779ba685e2033a8e8ca705c015ad5c086c7 Mon Sep 17 00:00:00 2001 From: zhenwei pi Date: Sat, 5 Oct 2024 16:03:57 +0800 Subject: [PATCH 19/51] Add tags into .gitignore (#1125) ctags is used widely on a linux platform, add tags into .gitignore. Signed-off-by: zhenwei pi --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 3175ad4b4f..e448e23f7e 100644 --- a/.gitignore +++ b/.gitignore @@ -48,3 +48,4 @@ redis.code-workspace nodes*.conf tests/cluster/tmp/* tests/rdma/rdma-test +tags From 71ffb577856c81137a5db5e36fad3287a6fc9551 Mon Sep 17 00:00:00 2001 From: Shivshankar Date: Sun, 6 Oct 2024 00:12:07 -0400 Subject: [PATCH 20/51] Adding the "-j" option in ci make commands to parallelize CI builds (#1128) fixes: https://github.com/valkey-io/valkey/issues/1123 As per github documentation below is core information on runners. **Linux:** public repositories: 4 cores private repositories: 2 cores **Macos:** its 3 or 4 based on both and its depends on the Processor. **Reference details for more information:** Discussion in https://github.com/valkey-io/valkey/issues/1123 - Public repo: https://docs.github.com/en/actions/using-github-hosted-runners/using-github-hosted-runners/about-github-hosted-runners#standard-github-hosted-runners-for-public-repositories - Private repo: https://docs.github.com/en/actions/using-github-hosted-runners/using-github-hosted-runners/about-github-hosted-runners#standard-github-hosted-runners-for--private-repositories Suggested-by: zhenwei pi Signed-off-by: Shivshankar-Reddy --- .github/workflows/ci.yml | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 6a1a9460fb..f1d23f40fa 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -17,7 +17,7 @@ jobs: - name: make # Fail build if there are warnings # build with TLS just for compilation coverage - run: make all-with-unit-tests SERVER_CFLAGS='-Werror' BUILD_TLS=yes + run: make -j4 all-with-unit-tests SERVER_CFLAGS='-Werror' BUILD_TLS=yes - name: test run: | sudo apt-get install tcl8.6 tclx @@ -40,7 +40,7 @@ jobs: - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 - name: make # build with TLS module just for compilation coverage - run: make SANITIZER=address SERVER_CFLAGS='-Werror' BUILD_TLS=module + run: make -j4 SANITIZER=address SERVER_CFLAGS='-Werror' BUILD_TLS=module - name: testprep run: sudo apt-get install tcl8.6 tclx -y - name: test @@ -55,7 +55,7 @@ jobs: - name: make run: | sudo apt-get install librdmacm-dev libibverbs-dev - make BUILD_RDMA=module + make -j4 BUILD_RDMA=module - name: clone-rxe-kmod run: | mkdir -p tests/rdma/rxe @@ -76,14 +76,14 @@ jobs: - name: make run: | apt-get update && apt-get install -y build-essential - make 
SERVER_CFLAGS='-Werror' + make -j4 SERVER_CFLAGS='-Werror' build-macos-latest: runs-on: macos-latest steps: - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 - name: make - run: make SERVER_CFLAGS='-Werror' + run: make -j3 SERVER_CFLAGS='-Werror' build-32bit: runs-on: ubuntu-latest @@ -92,14 +92,14 @@ jobs: - name: make run: | sudo apt-get update && sudo apt-get install libc6-dev-i386 - make SERVER_CFLAGS='-Werror' 32bit + make -j4 SERVER_CFLAGS='-Werror' 32bit build-libc-malloc: runs-on: ubuntu-latest steps: - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 - name: make - run: make SERVER_CFLAGS='-Werror' MALLOC=libc + run: make -j4 SERVER_CFLAGS='-Werror' MALLOC=libc build-almalinux8-jemalloc: runs-on: ubuntu-latest @@ -110,7 +110,7 @@ jobs: - name: make run: | dnf -y install epel-release gcc make procps-ng which - make -j SERVER_CFLAGS='-Werror' + make -j4 SERVER_CFLAGS='-Werror' format-yaml: runs-on: ubuntu-latest From aedfb04197da11c79bc24afab6126202e3c949c0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Viktor=20S=C3=B6derqvist?= Date: Sun, 6 Oct 2024 19:40:36 +0200 Subject: [PATCH 21/51] Make ./runtest --dump-logs dump logs on crash (#1117) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Until now, this flag only dumped logs on a failed assert in test case. It is useful that this flag dumps logs on a crash as well. Signed-off-by: Viktor Söderqvist --- tests/support/server.tcl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/support/server.tcl b/tests/support/server.tcl index e8f9f8fb44..7257339042 100644 --- a/tests/support/server.tcl +++ b/tests/support/server.tcl @@ -697,8 +697,8 @@ proc start_server {options {code undefined}} { dict set srv "skipleaks" 1 kill_server $srv - if {$::dump_logs && $assertion} { - # if we caught an assertion ($::num_failed isn't incremented yet) + if {$::dump_logs} { + # crash or assertion ($::num_failed isn't incremented yet) # this happens when the test spawns a server and not the other way around dump_server_log $srv } else { From f1fbd7d6f5172ba8e3d9a4e17e40a9374629d5e2 Mon Sep 17 00:00:00 2001 From: otheng Date: Mon, 7 Oct 2024 02:40:58 +0900 Subject: [PATCH 22/51] Reuse `obey_client` variable in `processCommand()` function (#1101) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit I’ve prepared a minor fix for `processCommand()` function. In `processCommand()`, the `obey_client` variable is created, but some conditional statements call the `mustObeyClient()` function instead of reusing `obey_client`. I’ve modified these statements to `reuse obey_client`. Since I’m relatively new to Redis, please let me know if there are any reasons why the conditional statements need to call `mustObeyClient()` again. Thank you for taking the time to review my PR. 
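The change itself is a small instance of a common pattern: evaluate a per-command predicate once, store it in a `const` local, and reuse that everywhere in the function. A minimal sketch, with illustrative stand-in names rather than Valkey's real types:

```
#include <stdio.h>

struct client {
    int from_primary; /* hypothetical stand-in for the real client state */
};

/* Stand-in for mustObeyClient(): cheap here, but the point is to
 * evaluate it once per command instead of at every check site. */
static int must_obey(const struct client *c) {
    return c->from_primary;
}

static void process_command(struct client *c) {
    const int obey_client = must_obey(c); /* evaluated exactly once */
    if (!obey_client) printf("cluster redirection checks run\n");
    if (!obey_client) printf("standalone redirect checks run\n");
}

int main(void) {
    struct client c = {0};
    process_command(&c);
    return 0;
}
```

Besides dropping the redundant calls, the `const` makes the intent explicit: the value is computed once and not expected to change while the command is processed.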
Signed-off-by: otheng03 <07c00h@gmail.com> --- src/server.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/server.c b/src/server.c index 56a49c85a8..d9f7d73843 100644 --- a/src/server.c +++ b/src/server.c @@ -3891,7 +3891,7 @@ int processCommand(client *c) { (c->cmd->proc == execCommand && (c->mstate.cmd_flags & (CMD_WRITE | CMD_MAY_REPLICATE))); int is_deny_async_loading_command = (cmd_flags & CMD_NO_ASYNC_LOADING) || (c->cmd->proc == execCommand && (c->mstate.cmd_flags & CMD_NO_ASYNC_LOADING)); - int obey_client = mustObeyClient(c); + const int obey_client = mustObeyClient(c); if (authRequired(c)) { /* AUTH and HELLO and no auth commands are valid even in @@ -3924,7 +3924,7 @@ int processCommand(client *c) { * However we don't perform the redirection if: * 1) The sender of this command is our primary. * 2) The command has no key arguments. */ - if (server.cluster_enabled && !mustObeyClient(c) && + if (server.cluster_enabled && !obey_client && !(!(c->cmd->flags & CMD_MOVABLE_KEYS) && c->cmd->key_specs_num == 0 && c->cmd->proc != execCommand)) { int error_code; clusterNode *n = getNodeByQuery(c, c->cmd, c->argv, c->argc, &c->slot, &error_code); @@ -3941,7 +3941,7 @@ int processCommand(client *c) { } } - if (!server.cluster_enabled && c->capa & CLIENT_CAPA_REDIRECT && server.primary_host && !mustObeyClient(c) && + if (!server.cluster_enabled && c->capa & CLIENT_CAPA_REDIRECT && server.primary_host && !obey_client && (is_write_command || (is_read_command && !c->flag.readonly))) { if (server.failover_state == FAILOVER_IN_PROGRESS) { /* During the FAILOVER process, when conditions are met (such as From 219e64676357c17f8f295151020b95c82372bf87 Mon Sep 17 00:00:00 2001 From: Masahiro Ide Date: Mon, 7 Oct 2024 04:34:45 +0900 Subject: [PATCH 23/51] Eliminate hashTypeIterator memory allocation by assigning it on stack (#1105) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Masahiro Ide Signed-off-by: Masahiro Ide Co-authored-by: Viktor Söderqvist Co-authored-by: Masahiro Ide --- src/aof.c | 14 +++++----- src/debug.c | 11 ++++---- src/server.h | 6 ++--- src/t_hash.c | 75 ++++++++++++++++++++++++++-------------------------- 4 files changed, 53 insertions(+), 53 deletions(-) diff --git a/src/aof.c b/src/aof.c index e712295127..bc29bb0d9e 100644 --- a/src/aof.c +++ b/src/aof.c @@ -1947,30 +1947,30 @@ static int rioWriteHashIteratorCursor(rio *r, hashTypeIterator *hi, int what) { /* Emit the commands needed to rebuild a hash object. * The function returns 0 on error, 1 on success. */ int rewriteHashObject(rio *r, robj *key, robj *o) { - hashTypeIterator *hi; + hashTypeIterator hi; long long count = 0, items = hashTypeLength(o); - hi = hashTypeInitIterator(o); - while (hashTypeNext(hi) != C_ERR) { + hashTypeInitIterator(o, &hi); + while (hashTypeNext(&hi) != C_ERR) { if (count == 0) { int cmd_items = (items > AOF_REWRITE_ITEMS_PER_CMD) ? 
AOF_REWRITE_ITEMS_PER_CMD : items; if (!rioWriteBulkCount(r, '*', 2 + cmd_items * 2) || !rioWriteBulkString(r, "HMSET", 5) || !rioWriteBulkObject(r, key)) { - hashTypeReleaseIterator(hi); + hashTypeResetIterator(&hi); return 0; } } - if (!rioWriteHashIteratorCursor(r, hi, OBJ_HASH_KEY) || !rioWriteHashIteratorCursor(r, hi, OBJ_HASH_VALUE)) { - hashTypeReleaseIterator(hi); + if (!rioWriteHashIteratorCursor(r, &hi, OBJ_HASH_KEY) || !rioWriteHashIteratorCursor(r, &hi, OBJ_HASH_VALUE)) { + hashTypeResetIterator(&hi); return 0; } if (++count == AOF_REWRITE_ITEMS_PER_CMD) count = 0; items--; } - hashTypeReleaseIterator(hi); + hashTypeResetIterator(&hi); return 1; } diff --git a/src/debug.c b/src/debug.c index be1abf46b4..98512fd436 100644 --- a/src/debug.c +++ b/src/debug.c @@ -221,21 +221,22 @@ void xorObjectDigest(serverDb *db, robj *keyobj, unsigned char *digest, robj *o) serverPanic("Unknown sorted set encoding"); } } else if (o->type == OBJ_HASH) { - hashTypeIterator *hi = hashTypeInitIterator(o); - while (hashTypeNext(hi) != C_ERR) { + hashTypeIterator hi; + hashTypeInitIterator(o, &hi); + while (hashTypeNext(&hi) != C_ERR) { unsigned char eledigest[20]; sds sdsele; memset(eledigest, 0, 20); - sdsele = hashTypeCurrentObjectNewSds(hi, OBJ_HASH_KEY); + sdsele = hashTypeCurrentObjectNewSds(&hi, OBJ_HASH_KEY); mixDigest(eledigest, sdsele, sdslen(sdsele)); sdsfree(sdsele); - sdsele = hashTypeCurrentObjectNewSds(hi, OBJ_HASH_VALUE); + sdsele = hashTypeCurrentObjectNewSds(&hi, OBJ_HASH_VALUE); mixDigest(eledigest, sdsele, sdslen(sdsele)); sdsfree(sdsele); xorDigest(digest, eledigest, 20); } - hashTypeReleaseIterator(hi); + hashTypeResetIterator(&hi); } else if (o->type == OBJ_STREAM) { streamIterator si; streamIteratorStart(&si, o->ptr, NULL, NULL, 0); diff --git a/src/server.h b/src/server.h index 1b8f08833f..84a282b6f5 100644 --- a/src/server.h +++ b/src/server.h @@ -2603,7 +2603,7 @@ typedef struct { unsigned char *fptr, *vptr; - dictIterator *di; + dictIterator di; dictEntry *de; } hashTypeIterator; @@ -3370,8 +3370,8 @@ void hashTypeTryConversion(robj *subject, robj **argv, int start, int end); int hashTypeExists(robj *o, sds key); int hashTypeDelete(robj *o, sds key); unsigned long hashTypeLength(const robj *o); -hashTypeIterator *hashTypeInitIterator(robj *subject); -void hashTypeReleaseIterator(hashTypeIterator *hi); +void hashTypeInitIterator(robj *subject, hashTypeIterator *hi); +void hashTypeResetIterator(hashTypeIterator *hi); int hashTypeNext(hashTypeIterator *hi); void hashTypeCurrentFromListpack(hashTypeIterator *hi, int what, diff --git a/src/t_hash.c b/src/t_hash.c index 9398dc3e3f..96252bc39b 100644 --- a/src/t_hash.c +++ b/src/t_hash.c @@ -307,8 +307,7 @@ unsigned long hashTypeLength(const robj *o) { return length; } -hashTypeIterator *hashTypeInitIterator(robj *subject) { - hashTypeIterator *hi = zmalloc(sizeof(hashTypeIterator)); +void hashTypeInitIterator(robj *subject, hashTypeIterator *hi) { hi->subject = subject; hi->encoding = subject->encoding; @@ -316,16 +315,14 @@ hashTypeIterator *hashTypeInitIterator(robj *subject) { hi->fptr = NULL; hi->vptr = NULL; } else if (hi->encoding == OBJ_ENCODING_HT) { - hi->di = dictGetIterator(subject->ptr); + dictInitIterator(&hi->di, subject->ptr); } else { serverPanic("Unknown hash encoding"); } - return hi; } -void hashTypeReleaseIterator(hashTypeIterator *hi) { - if (hi->encoding == OBJ_ENCODING_HT) dictReleaseIterator(hi->di); - zfree(hi); +void hashTypeResetIterator(hashTypeIterator *hi) { + if (hi->encoding == OBJ_ENCODING_HT) 
dictResetIterator(&hi->di); } /* Move to the next entry in the hash. Return C_OK when the next entry @@ -358,7 +355,7 @@ int hashTypeNext(hashTypeIterator *hi) { hi->fptr = fptr; hi->vptr = vptr; } else if (hi->encoding == OBJ_ENCODING_HT) { - if ((hi->de = dictNext(hi->di)) == NULL) return C_ERR; + if ((hi->de = dictNext(&hi->di)) == NULL) return C_ERR; } else { serverPanic("Unknown hash encoding"); } @@ -448,31 +445,31 @@ void hashTypeConvertListpack(robj *o, int enc) { /* Nothing to do... */ } else if (enc == OBJ_ENCODING_HT) { - hashTypeIterator *hi; + hashTypeIterator hi; dict *dict; int ret; - hi = hashTypeInitIterator(o); + hashTypeInitIterator(o, &hi); dict = dictCreate(&hashDictType); /* Presize the dict to avoid rehashing */ dictExpand(dict, hashTypeLength(o)); - while (hashTypeNext(hi) != C_ERR) { + while (hashTypeNext(&hi) != C_ERR) { sds key, value; - key = hashTypeCurrentObjectNewSds(hi, OBJ_HASH_KEY); - value = hashTypeCurrentObjectNewSds(hi, OBJ_HASH_VALUE); + key = hashTypeCurrentObjectNewSds(&hi, OBJ_HASH_KEY); + value = hashTypeCurrentObjectNewSds(&hi, OBJ_HASH_VALUE); ret = dictAdd(dict, key, value); if (ret != DICT_OK) { sdsfree(key); - sdsfree(value); /* Needed for gcc ASAN */ - hashTypeReleaseIterator(hi); /* Needed for gcc ASAN */ + sdsfree(value); /* Needed for gcc ASAN */ + hashTypeResetIterator(&hi); /* Needed for gcc ASAN */ serverLogHexDump(LL_WARNING, "listpack with dup elements dump", o->ptr, lpBytes(o->ptr)); serverPanic("Listpack corruption detected"); } } - hashTypeReleaseIterator(hi); + hashTypeResetIterator(&hi); zfree(o->ptr); o->encoding = OBJ_ENCODING_HT; o->ptr = dict; @@ -498,7 +495,7 @@ void hashTypeConvert(robj *o, int enc) { * The resulting object always has refcount set to 1 */ robj *hashTypeDup(robj *o) { robj *hobj; - hashTypeIterator *hi; + hashTypeIterator hi; serverAssert(o->type == OBJ_HASH); @@ -513,20 +510,20 @@ robj *hashTypeDup(robj *o) { dict *d = dictCreate(&hashDictType); dictExpand(d, dictSize((const dict *)o->ptr)); - hi = hashTypeInitIterator(o); - while (hashTypeNext(hi) != C_ERR) { + hashTypeInitIterator(o, &hi); + while (hashTypeNext(&hi) != C_ERR) { sds field, value; sds newfield, newvalue; /* Extract a field-value pair from an original hash object.*/ - field = hashTypeCurrentFromHashTable(hi, OBJ_HASH_KEY); - value = hashTypeCurrentFromHashTable(hi, OBJ_HASH_VALUE); + field = hashTypeCurrentFromHashTable(&hi, OBJ_HASH_KEY); + value = hashTypeCurrentFromHashTable(&hi, OBJ_HASH_VALUE); newfield = sdsdup(field); newvalue = sdsdup(value); /* Add a field-value pair to a new hash object. */ dictAdd(d, newfield, newvalue); } - hashTypeReleaseIterator(hi); + hashTypeResetIterator(&hi); hobj = createObject(OBJ_HASH, d); hobj->encoding = OBJ_ENCODING_HT; @@ -812,7 +809,7 @@ static void addHashIteratorCursorToReply(client *c, hashTypeIterator *hi, int wh void genericHgetallCommand(client *c, int flags) { robj *o; - hashTypeIterator *hi; + hashTypeIterator hi; int length, count = 0; robj *emptyResp = (flags & OBJ_HASH_KEY && flags & OBJ_HASH_VALUE) ? 
shared.emptymap[c->resp] : shared.emptyarray; @@ -827,19 +824,19 @@ void genericHgetallCommand(client *c, int flags) { addReplyArrayLen(c, length); } - hi = hashTypeInitIterator(o); - while (hashTypeNext(hi) != C_ERR) { + hashTypeInitIterator(o, &hi); + while (hashTypeNext(&hi) != C_ERR) { if (flags & OBJ_HASH_KEY) { - addHashIteratorCursorToReply(c, hi, OBJ_HASH_KEY); + addHashIteratorCursorToReply(c, &hi, OBJ_HASH_KEY); count++; } if (flags & OBJ_HASH_VALUE) { - addHashIteratorCursorToReply(c, hi, OBJ_HASH_VALUE); + addHashIteratorCursorToReply(c, &hi, OBJ_HASH_VALUE); count++; } } - hashTypeReleaseIterator(hi); + hashTypeResetIterator(&hi); /* Make sure we returned the right number of elements. */ if (flags & OBJ_HASH_KEY && flags & OBJ_HASH_VALUE) count /= 2; @@ -973,13 +970,14 @@ void hrandfieldWithCountCommand(client *c, long l, int withvalues) { * The number of requested elements is greater than the number of * elements inside the hash: simply return the whole hash. */ if (count >= size) { - hashTypeIterator *hi = hashTypeInitIterator(hash); - while (hashTypeNext(hi) != C_ERR) { + hashTypeIterator hi; + hashTypeInitIterator(hash, &hi); + while (hashTypeNext(&hi) != C_ERR) { if (withvalues && c->resp > 2) addReplyArrayLen(c, 2); - addHashIteratorCursorToReply(c, hi, OBJ_HASH_KEY); - if (withvalues) addHashIteratorCursorToReply(c, hi, OBJ_HASH_VALUE); + addHashIteratorCursorToReply(c, &hi, OBJ_HASH_KEY); + if (withvalues) addHashIteratorCursorToReply(c, &hi, OBJ_HASH_VALUE); } - hashTypeReleaseIterator(hi); + hashTypeResetIterator(&hi); return; } @@ -1015,21 +1013,22 @@ void hrandfieldWithCountCommand(client *c, long l, int withvalues) { /* Hashtable encoding (generic implementation) */ dict *d = dictCreate(&sdsReplyDictType); dictExpand(d, size); - hashTypeIterator *hi = hashTypeInitIterator(hash); + hashTypeIterator hi; + hashTypeInitIterator(hash, &hi); /* Add all the elements into the temporary dictionary. */ - while ((hashTypeNext(hi)) != C_ERR) { + while ((hashTypeNext(&hi)) != C_ERR) { int ret = DICT_ERR; sds key, value = NULL; - key = hashTypeCurrentObjectNewSds(hi, OBJ_HASH_KEY); - if (withvalues) value = hashTypeCurrentObjectNewSds(hi, OBJ_HASH_VALUE); + key = hashTypeCurrentObjectNewSds(&hi, OBJ_HASH_KEY); + if (withvalues) value = hashTypeCurrentObjectNewSds(&hi, OBJ_HASH_VALUE); ret = dictAdd(d, key, value); serverAssert(ret == DICT_OK); } serverAssert(dictSize(d) == size); - hashTypeReleaseIterator(hi); + hashTypeResetIterator(&hi); /* Remove random elements to reach the right count. */ while (size > count) { From bd2666cf42e14043556d81e9fc95de3711c1acfb Mon Sep 17 00:00:00 2001 From: Madelyn Olson Date: Mon, 7 Oct 2024 11:56:15 -0700 Subject: [PATCH 24/51] Removing incorrect comment about a warning (#1132) There is a lot of bad legacy usage of `default:` with enums, which is an anti-pattern. If you omit the default, the compiler will tell you if a new enum value was added and that it is missing from a switch statement. Someone mentioned on another PR they used `default:` because of this warning, so just removing it, but might create an issue to do a wider cleanup. 
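As a rough illustration of the pattern this change relies on (a self-contained sketch with a hypothetical `iter_type` enum, not the actual cluster iterator code):

```c
#include <stdio.h>
#include <stdlib.h>

typedef enum { ITER_TYPE_LIST, ITER_TYPE_DICT } iter_type;

static int handle(iter_type t) {
    switch (t) {
    case ITER_TYPE_LIST: return 1;
    case ITER_TYPE_DICT: return 2;
        /* No default: with -Wall (-Wswitch), adding a new iter_type value
         * makes the compiler flag this switch as not handling it. */
    }
    /* Unreachable for valid enum values; panic instead of a default case. */
    fprintf(stderr, "Unknown iterator type %d\n", (int)t);
    abort();
}

int main(void) {
    printf("%d\n", handle(ITER_TYPE_LIST));
    return 0;
}
```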
Signed-off-by: Madelyn Olson --- src/cluster_legacy.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c index 120d55501c..14f8a6bd1e 100644 --- a/src/cluster_legacy.c +++ b/src/cluster_legacy.c @@ -227,12 +227,8 @@ static clusterNode *clusterNodeIterNext(ClusterNodeIterator *iter) { /* Return the value associated with the node, or NULL if no more nodes */ return ln ? listNodeValue(ln) : NULL; } - /* This line is unreachable but added to avoid compiler warnings */ - default: { - serverPanic("bad type"); - return NULL; - } } + serverPanic("Unknown iterator type %d", iter->type); } static void clusterNodeIterReset(ClusterNodeIterator *iter) { From a91f913aab1fb5a5965bc44e3cc9a14c4cd31846 Mon Sep 17 00:00:00 2001 From: chx9 Date: Tue, 8 Oct 2024 23:07:51 +0800 Subject: [PATCH 25/51] fix typo (#1136) Signed-off-by: chx9 --- src/object.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/object.c b/src/object.c index 2508f20ab6..8c1cf64892 100644 --- a/src/object.c +++ b/src/object.c @@ -1416,7 +1416,7 @@ int objectSetLRUOrLFU(robj *val, long long lfu_freq, long long lru_idle, long lo lru_idle = lru_idle * lru_multiplier / LRU_CLOCK_RESOLUTION; long lru_abs = lru_clock - lru_idle; /* Absolute access time. */ /* If the LRU field underflows (since lru_clock is a wrapping clock), - * we need to make it positive again. This be handled by the unwrapping + * we need to make it positive again. This will be handled by the unwrapping * code in estimateObjectIdleTime. I.e. imagine a day when lru_clock * wrap arounds (happens once in some 6 months), and becomes a low * value, like 10, an lru_idle of 1000 should be near LRU_CLOCK_MAX. */ From 8994460c0b010d28220176b5c2f0f93a8074411f Mon Sep 17 00:00:00 2001 From: Binbin Date: Wed, 9 Oct 2024 16:10:29 +0800 Subject: [PATCH 26/51] Add server log when module load fails with busy name (#1084) Currently when module loading fails due to a busy name, we don't have a clean way to assist troubleshooting. Case 1: when loading the same module multiple times, we can not determine the cause of its failure without referring to the module list or the earliest module load log. The log may not exist, and sometimes it is difficult for people to make the association with the module list. Case 2: when multiple modules use the same module name, we can not quickly associate the busy name without referring to the module list and the earliest module load log. When different people write modules with the same module name, the name alone is not easy to associate with a specific module. So in this PR, during module onload, we try to print a busy-name log if this happens. Currently we check ctx.module: if it is NULL, it means the Init call failed, and Init currently only fails on a busy name. It's kind of ugly. It would have been nice if we could have had a better way for onload to signal why the load failed. Signed-off-by: Binbin --- src/module.c | 6 +++++- tests/unit/moduleapi/basics.tcl | 7 ++++++- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/src/module.c b/src/module.c index 38d0c2d968..2884239200 100644 --- a/src/module.c +++ b/src/module.c @@ -12194,11 +12194,15 @@ int moduleLoad(const char *path, void **module_argv, int module_argc, int is_loa ValkeyModuleCtx ctx; moduleCreateContext(&ctx, NULL, VALKEYMODULE_CTX_TEMP_CLIENT); /* We pass NULL since we don't have a module yet. */ if (onload((void *)&ctx, module_argv, module_argc) == VALKEYMODULE_ERR) { - serverLog(LL_WARNING, "Module %s initialization failed. 
Module not loaded", path); if (ctx.module) { + serverLog(LL_WARNING, "Module %s initialization failed. Module not loaded.", path); moduleUnregisterCleanup(ctx.module); moduleRemoveCateogires(ctx.module); moduleFreeModuleStructure(ctx.module); + } else { + /* If there is no ctx.module, this means that our ValkeyModule_Init call failed, + * and currently init will only fail on busy name. */ + serverLog(LL_WARNING, "Module %s initialization failed. Module name is busy.", path); } moduleFreeContext(&ctx); dlclose(handle); diff --git a/tests/unit/moduleapi/basics.tcl b/tests/unit/moduleapi/basics.tcl index 042e3474a0..733e8e1962 100644 --- a/tests/unit/moduleapi/basics.tcl +++ b/tests/unit/moduleapi/basics.tcl @@ -34,7 +34,12 @@ start_server {tags {"modules"}} { } } - test "Unload the module - test" { + test "Busy module name" { + assert_error {ERR Error loading the extension. Please check the server logs.} {r module load $testmodule} + verify_log_message 0 "*Module name is busy*" 0 + } + + test "Unload the module - basics" { assert_equal {OK} [r module unload test] } } From 7f953111393a6beed78b98d021421c25bcd12e4a Mon Sep 17 00:00:00 2001 From: kronwerk Date: Wed, 9 Oct 2024 14:11:53 +0300 Subject: [PATCH 27/51] Add flush-before-load option for repl-diskless-load (#909) A new option for diskless replication on the replica side. After a network failure, the replica may need to perform a full sync. The other option for diskless full sync is `swapdb`, but it uses twice as much memory, temporarily. In situations where this is not acceptable, and where losing data is acceptable, the `flush-before-load` can be useful. If the full sync fails, the old data is lost though. Therefore, the new option is marked as "dangerous". --------- Signed-off-by: kronwerk Signed-off-by: kronwerk Co-authored-by: kronwerk --- src/config.c | 1 + src/replication.c | 1 + src/server.h | 1 + tests/integration/replication.tcl | 57 +++++++++++++++++++++++++++++++ valkey.conf | 25 ++++++++------ 5 files changed, 74 insertions(+), 11 deletions(-) diff --git a/src/config.c b/src/config.c index 6c03cbb476..62ed5dd91c 100644 --- a/src/config.c +++ b/src/config.c @@ -107,6 +107,7 @@ configEnum repl_diskless_load_enum[] = { {"disabled", REPL_DISKLESS_LOAD_DISABLED}, {"on-empty-db", REPL_DISKLESS_LOAD_WHEN_DB_EMPTY}, {"swapdb", REPL_DISKLESS_LOAD_SWAPDB}, + {"flush-before-load", REPL_DISKLESS_LOAD_FLUSH_BEFORE_LOAD}, {NULL, 0}}; configEnum tls_auth_clients_enum[] = { diff --git a/src/replication.c b/src/replication.c index 64df85f19a..e2941c6d9c 100644 --- a/src/replication.c +++ b/src/replication.c @@ -1943,6 +1943,7 @@ void restartAOFAfterSYNC(void) { static int useDisklessLoad(void) { /* compute boolean decision to use diskless load */ int enabled = server.repl_diskless_load == REPL_DISKLESS_LOAD_SWAPDB || + server.repl_diskless_load == REPL_DISKLESS_LOAD_FLUSH_BEFORE_LOAD || (server.repl_diskless_load == REPL_DISKLESS_LOAD_WHEN_DB_EMPTY && dbTotalServerKeyCount() == 0); if (enabled) { diff --git a/src/server.h b/src/server.h index 84a282b6f5..84ce96a49a 100644 --- a/src/server.h +++ b/src/server.h @@ -490,6 +490,7 @@ typedef enum { #define REPL_DISKLESS_LOAD_DISABLED 0 #define REPL_DISKLESS_LOAD_WHEN_DB_EMPTY 1 #define REPL_DISKLESS_LOAD_SWAPDB 2 +#define REPL_DISKLESS_LOAD_FLUSH_BEFORE_LOAD 3 /* TLS Client Authentication */ #define TLS_CLIENT_AUTH_NO 0 diff --git a/tests/integration/replication.tcl b/tests/integration/replication.tcl index 203574e391..1b5b0c030a 100644 --- a/tests/integration/replication.tcl +++ 
b/tests/integration/replication.tcl @@ -1478,6 +1478,63 @@ start_server {tags {"repl external:skip"}} { } } +foreach dualchannel {yes no} { + test "replica actually flushes db if use diskless load with flush-before-load dual-channel-replication-enabled=$dualchannel" { + start_server {tags {"repl"}} { + set replica [srv 0 client] + set replica_log [srv 0 stdout] + start_server {} { + set master [srv 0 client] + set master_host [srv 0 host] + set master_port [srv 0 port] + + # Fill both replica and master with data + $master debug populate 100 master 100000 + $replica debug populate 201 replica 100000 + assert_equal [$replica dbsize] 201 + # Set up master + $master config set save "" + $master config set rdbcompression no + $master config set repl-diskless-sync yes + $master config set repl-diskless-sync-delay 0 + $master config set dual-channel-replication-enabled $dualchannel + # Set master with a slow rdb generation, so that we can easily intercept loading + # 10ms per key, with 1000 keys is 10 seconds + $master config set rdb-key-save-delay 10000 + # Set up replica + $replica config set save "" + $replica config set repl-diskless-load flush-before-load + $replica config set dual-channel-replication-enabled $dualchannel + # Start the replication process... + $replica replicaof $master_host $master_port + + wait_for_condition 100 100 { + [s -1 loading] eq 1 + } else { + fail "Replica didn't start loading" + } + + # Make sure that next sync will not start immediately so that we can catch the replica in between syncs + $master config set repl-diskless-sync-delay 5 + + # Kill the replica connection on the master + set killed [$master client kill type replica] + + wait_for_condition 100 100 { + [s -1 loading] eq 0 + } else { + fail "Replica didn't disconnect" + } + + assert_equal [$replica dbsize] 0 + + # Speed up shutdown + $master config set rdb-key-save-delay 0 + } + } + } {} {external:skip} +} + start_server {tags {"repl external:skip"}} { set replica [srv 0 client] $replica set replica_key replica_value diff --git a/valkey.conf b/valkey.conf index 56110a52bd..9793d7825e 100644 --- a/valkey.conf +++ b/valkey.conf @@ -667,17 +667,20 @@ repl-diskless-sync-max-replicas 0 # fully loaded in memory, resulting in higher memory usage. # For this reason we have the following options: # -# "disabled" - Don't use diskless load (store the rdb file to the disk first) -# "swapdb" - Keep current db contents in RAM while parsing the data directly -# from the socket. Replicas in this mode can keep serving current -# dataset while replication is in progress, except for cases where -# they can't recognize primary as having a data set from same -# replication history. -# Note that this requires sufficient memory, if you don't have it, -# you risk an OOM kill. -# "on-empty-db" - Use diskless load only when current dataset is empty. This is -# safer and avoid having old and new dataset loaded side by side -# during replication. +# "disabled" - Don't use diskless load (store the rdb file to the disk first) +# "swapdb" - Keep current db contents in RAM while parsing the data directly +# from the socket. Replicas in this mode can keep serving current +# dataset while replication is in progress, except for cases where +# they can't recognize primary as having a data set from same +# replication history. +# Note that this requires sufficient memory, if you don't have it, +# you risk an OOM kill. +# "on-empty-db" - Use diskless load only when current dataset is empty. 
This is +# safer and avoid having old and new dataset loaded side by side +# during replication. +# "flush-before-load" - [dangerous] Flush all data before parsing. Note that if +# there's a problem before the replication succeeded you may +# lose all your data. repl-diskless-load disabled # This dual channel replication sync feature optimizes the full synchronization process From 2389b2ce013ed99f4c5e268db0ceb82765190911 Mon Sep 17 00:00:00 2001 From: Roshan Khatri <117414976+roshkhatri@users.noreply.github.com> Date: Wed, 9 Oct 2024 21:20:47 -0700 Subject: [PATCH 28/51] Fix empty response for ACL CAT category subcommand for module defined categories (#1140) The module commands which were added to acl categories were getting skipped when `ACL CAT category` command was executed. This PR fixes the bug. Before: ``` 127.0.0.1:6379> ACL CAT foocategory (empty array) ``` After: ``` 127.0.0.1:6379> ACL CAT foocategory aclcheck.module.command.test.add.new.aclcategories ``` --------- Signed-off-by: Roshan Khatri Co-authored-by: Harkrishn Patro --- src/acl.c | 1 - tests/unit/moduleapi/aclcheck.tcl | 4 ++++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/src/acl.c b/src/acl.c index 14d9c6e6f3..688820fd89 100644 --- a/src/acl.c +++ b/src/acl.c @@ -2760,7 +2760,6 @@ void aclCatWithFlags(client *c, dict *commands, uint64_t cflag, int *arraylen) { while ((de = dictNext(di)) != NULL) { struct serverCommand *cmd = dictGetVal(de); - if (cmd->flags & CMD_MODULE) continue; if (cmd->acl_categories & cflag) { addReplyBulkCBuffer(c, cmd->fullname, sdslen(cmd->fullname)); (*arraylen)++; diff --git a/tests/unit/moduleapi/aclcheck.tcl b/tests/unit/moduleapi/aclcheck.tcl index 1ea09a2324..4c5a8380eb 100644 --- a/tests/unit/moduleapi/aclcheck.tcl +++ b/tests/unit/moduleapi/aclcheck.tcl @@ -143,6 +143,10 @@ start_server {tags {"modules acl"}} { assert_equal [r acl DRYRUN j8 aclcheck.module.command.test.add.new.aclcategories] OK } + test {test if ACL CAT output for the new category is correct} { + assert_equal [r ACL CAT foocategory] aclcheck.module.command.test.add.new.aclcategories + } + test {test permission compaction and simplification for categories added by a module} { r acl SETUSER j9 on >password -@all +@foocategory -@foocategory catch {r ACL GETUSER j9} res From 7ea3a972c7830656c47c43878a25e4c4b44fa33a Mon Sep 17 00:00:00 2001 From: Shivshankar Date: Thu, 10 Oct 2024 11:46:09 -0400 Subject: [PATCH 29/51] Add io-threads-do-reads config to deprecated config table to have no effect. (#1138) This fixes: https://github.com/valkey-io/valkey/issues/1116 _Issue details from #1116 by @zuiderkwast_ > This config is undocumented since #758. The default was changed to "yes" and it is quite useless to set it to "no". Yet, it can happen that some user has an old config file where it is explicitly set to "no". The result will be bad performance, since I/O threads will not do all the I/O. > > It's indeed confusing. > > 1. Either remove the whole option from the code. And thus no need for documentation. _OR:_ > 2. Introduce the option back in the configuration, just as a comment is fine. And showing the default value "yes": `# io-threads-do-reads yes` with additional text. 
> > _Originally posted by @melroy89 in [#1019 (reply in thread)](https://github.com/orgs/valkey-io/discussions/1019#discussioncomment-10824778)_ --------- Signed-off-by: Shivshankar-Reddy --- src/config.c | 2 +- src/io_threads.c | 1 - src/server.h | 1 - tests/unit/introspection.tcl | 1 - valkey.conf | 6 +++++- 5 files changed, 6 insertions(+), 5 deletions(-) diff --git a/src/config.c b/src/config.c index 62ed5dd91c..663cf5da38 100644 --- a/src/config.c +++ b/src/config.c @@ -445,6 +445,7 @@ void loadServerConfigFromString(char *config) { {"list-max-ziplist-entries", 2, 2}, {"list-max-ziplist-value", 2, 2}, {"lua-replicate-commands", 2, 2}, + {"io-threads-do-reads", 2, 2}, {NULL, 0}, }; char buf[1024]; @@ -3087,7 +3088,6 @@ standardConfig static_configs[] = { /* Bool configs */ createBoolConfig("rdbchecksum", NULL, IMMUTABLE_CONFIG, server.rdb_checksum, 1, NULL, NULL), createBoolConfig("daemonize", NULL, IMMUTABLE_CONFIG, server.daemonize, 0, NULL, NULL), - createBoolConfig("io-threads-do-reads", NULL, DEBUG_CONFIG | IMMUTABLE_CONFIG, server.io_threads_do_reads, 1, NULL, NULL), /* Read + parse from threads */ createBoolConfig("always-show-logo", NULL, IMMUTABLE_CONFIG, server.always_show_logo, 0, NULL, NULL), createBoolConfig("protected-mode", NULL, MODIFIABLE_CONFIG, server.protected_mode, 1, NULL, NULL), createBoolConfig("rdbcompression", NULL, MODIFIABLE_CONFIG, server.rdb_compression, 1, NULL, NULL), diff --git a/src/io_threads.c b/src/io_threads.c index 5b2230f635..b0368cf07b 100644 --- a/src/io_threads.c +++ b/src/io_threads.c @@ -319,7 +319,6 @@ void initIOThreads(void) { int trySendReadToIOThreads(client *c) { if (server.active_io_threads_num <= 1) return C_ERR; - if (!server.io_threads_do_reads) return C_ERR; /* If IO thread is areadty reading, return C_OK to make sure the main thread will not handle it. */ if (c->io_read_state != CLIENT_IDLE) return C_OK; /* Currently, replica/master writes are not offloaded and are processed synchronously. */ diff --git a/src/server.h b/src/server.h index 84ce96a49a..29d675bb18 100644 --- a/src/server.h +++ b/src/server.h @@ -1745,7 +1745,6 @@ struct valkeyServer { _Atomic uint64_t next_client_id; /* Next client unique ID. Incremental. */ int protected_mode; /* Don't accept external connections. */ int io_threads_num; /* Number of IO threads to use. */ - int io_threads_do_reads; /* Read and parse from IO threads? */ int active_io_threads_num; /* Current number of active IO threads, includes main thread. */ int events_per_io_thread; /* Number of events on the event loop to trigger IO threads activation. */ int prefetch_batch_max_size; /* Maximum number of keys to prefetch in a single batch */ diff --git a/tests/unit/introspection.tcl b/tests/unit/introspection.tcl index 286b02b7d0..352f5f183e 100644 --- a/tests/unit/introspection.tcl +++ b/tests/unit/introspection.tcl @@ -514,7 +514,6 @@ start_server {tags {"introspection"}} { set skip_configs { rdbchecksum daemonize - io-threads-do-reads tcp-backlog always-show-logo syslog-enabled diff --git a/valkey.conf b/valkey.conf index 9793d7825e..07cd293a0c 100644 --- a/valkey.conf +++ b/valkey.conf @@ -1357,7 +1357,11 @@ lazyfree-lazy-user-flush yes # # prefetch-batch-max-size 16 # -# NOTE: If you want to test the server speedup using valkey-benchmark, make +# NOTE: +# 1. The 'io-threads-do-reads' config is deprecated and has no effect. +# It will be removed in the future. Please avoid using this option if possible. +# +# 2. 
If you want to test the server speedup using valkey-benchmark, make # sure you also run the benchmark itself in threaded mode, using the # --threads option to match the number of server threads, otherwise you'll not # be able to notice the improvements. From cfca78a796119bae95367e020c0da41dc58c9124 Mon Sep 17 00:00:00 2001 From: Binbin Date: Fri, 11 Oct 2024 00:09:22 +0800 Subject: [PATCH 30/51] Fix typo last_procssed -> last_processed (#1142) Minor typo. Signed-off-by: Binbin --- src/server.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/server.c b/src/server.c index d9f7d73843..653cfa7794 100644 --- a/src/server.c +++ b/src/server.c @@ -1612,12 +1612,12 @@ void beforeSleep(struct aeEventLoop *eventLoop) { processed += connTypeProcessPendingData(); if (server.aof_state == AOF_ON || server.aof_state == AOF_WAIT_REWRITE) flushAppendOnlyFile(0); processed += handleClientsWithPendingWrites(); - int last_procssed = 0; + int last_processed = 0; do { /* Try to process all the pending IO events. */ - last_procssed = processIOThreadsReadDone() + processIOThreadsWriteDone(); - processed += last_procssed; - } while (last_procssed != 0); + last_processed = processIOThreadsReadDone() + processIOThreadsWriteDone(); + processed += last_processed; + } while (last_processed != 0); processed += freeClientsInAsyncFreeQueue(); server.events_processed_while_blocked += processed; return; From c86cdd695458d9c3043d4502f4d70cb458fbe80b Mon Sep 17 00:00:00 2001 From: Shivshankar Date: Sat, 12 Oct 2024 00:21:09 -0400 Subject: [PATCH 31/51] Correct the note details for deprecated config 'io-threads-do-reads' (#1150) Remove explicit reference to removal and just indicate to avoid using it. Signed-off-by: Shivshankar-Reddy --- valkey.conf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/valkey.conf b/valkey.conf index 07cd293a0c..eabb5aa5d1 100644 --- a/valkey.conf +++ b/valkey.conf @@ -1358,8 +1358,8 @@ lazyfree-lazy-user-flush yes # prefetch-batch-max-size 16 # # NOTE: -# 1. The 'io-threads-do-reads' config is deprecated and has no effect. -# It will be removed in the future. Please avoid using this option if possible. +# 1. The 'io-threads-do-reads' config is deprecated and has no effect. Please +# avoid using this config if possible. # # 2. If you want to test the server speedup using valkey-benchmark, make # sure you also run the benchmark itself in threaded mode, using the From b37f749c52861239cf8d35b15ef0fe6f487ccd6a Mon Sep 17 00:00:00 2001 From: Masahiro Ide Date: Sat, 12 Oct 2024 13:28:42 +0900 Subject: [PATCH 32/51] Move prepareClientToWrite out of loop for HGETALL command (#1119) Similar to #860 but this is for HGETALL families (HGETALL/HKEYS/HVALS). This patch moves `prepareClientToWrite` out of the loop to reduce the function overhead. 
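As a minimal self-contained sketch of the idea (hypothetical `prepare_for_write`/`emit_bulk` stand-ins, not the actual reply API):

```c
#include <stdio.h>
#include <stddef.h>

typedef struct { int ok; } client;

/* Hypothetical stand-ins for prepareClientToWrite()/addReply*(): the
 * validity check is cheap, but it adds up when run once per element. */
static int prepare_for_write(client *c) { return c->ok; }
static void emit_bulk(client *c, int v) { (void)c; printf("%d\n", v); }

static void reply_loop_hoisted(client *c, const int *items, size_t n) {
    /* Check the client once up front instead of on every emit call. */
    if (!prepare_for_write(c)) return;
    for (size_t i = 0; i < n; i++) emit_bulk(c, items[i]);
}

int main(void) {
    client c = {1};
    int items[] = {10, 20, 30};
    reply_loop_hoisted(&c, items, 3);
    return 0;
}
```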
Signed-off-by: Masahiro Ide Co-authored-by: Madelyn Olson --- src/networking.c | 25 ++++++++++++++++--- src/server.h | 4 ++- src/t_hash.c | 65 ++++++++++++++++++++++++++---------------------- 3 files changed, 59 insertions(+), 35 deletions(-) diff --git a/src/networking.c b/src/networking.c index b18b798eaa..c24a95019b 100644 --- a/src/networking.c +++ b/src/networking.c @@ -1043,7 +1043,8 @@ void addReplyArrayLen(client *c, long length) { addReplyAggregateLen(c, length, '*'); } -void addWritePreparedReplyArrayLen(writePreparedClient *c, long length) { +void addWritePreparedReplyArrayLen(writePreparedClient *wpc, long length) { + client *c = (client *)wpc; serverAssert(length >= 0); _addReplyLongLongWithPrefix(c, length, '*'); } @@ -1054,6 +1055,13 @@ void addReplyMapLen(client *c, long length) { addReplyAggregateLen(c, length, prefix); } +void addWritePreparedReplyMapLen(writePreparedClient *wpc, long length) { + client *c = (client *)wpc; + int prefix = c->resp == 2 ? '*' : '%'; + if (c->resp == 2) length *= 2; + _addReplyLongLongWithPrefix(c, length, prefix); +} + void addReplySetLen(client *c, long length) { int prefix = c->resp == 2 ? '*' : '~'; addReplyAggregateLen(c, length, prefix); @@ -1120,7 +1128,8 @@ void addReplyBulkCBuffer(client *c, const void *p, size_t len) { _addReplyToBufferOrList(c, "\r\n", 2); } -void addWritePreparedReplyBulkCBuffer(writePreparedClient *c, const void *p, size_t len) { +void addWritePreparedReplyBulkCBuffer(writePreparedClient *wpc, const void *p, size_t len) { + client *c = (client *)wpc; _addReplyLongLongWithPrefix(c, len, '$'); _addReplyToBufferOrList(c, p, len); _addReplyToBufferOrList(c, "\r\n", 2); @@ -1138,6 +1147,14 @@ void addReplyBulkSds(client *c, sds s) { _addReplyToBufferOrList(c, "\r\n", 2); } +void addWritePreparedReplyBulkSds(writePreparedClient *wpc, sds s) { + client *c = (client *)wpc; + _addReplyLongLongWithPrefix(c, sdslen(s), '$'); + _addReplyToBufferOrList(c, s, sdslen(s)); + sdsfree(s); + _addReplyToBufferOrList(c, "\r\n", 2); +} + /* Set sds to a deferred reply (for symmetry with addReplyBulkSds it also frees the sds) */ void setDeferredReplyBulkSds(client *c, void *node, sds s) { sds reply = sdscatprintf(sdsempty(), "$%d\r\n%s\r\n", (unsigned)sdslen(s), s); @@ -1164,12 +1181,12 @@ void addReplyBulkLongLong(client *c, long long ll) { addReplyBulkCBuffer(c, buf, len); } -void addWritePreparedReplyBulkLongLong(writePreparedClient *c, long long ll) { +void addWritePreparedReplyBulkLongLong(writePreparedClient *wpc, long long ll) { char buf[64]; int len; len = ll2string(buf, 64, ll); - addWritePreparedReplyBulkCBuffer(c, buf, len); + addWritePreparedReplyBulkCBuffer(wpc, buf, len); } /* Reply with a verbatim type having the specified extension. diff --git a/src/server.h b/src/server.h index 29d675bb18..44ba429b16 100644 --- a/src/server.h +++ b/src/server.h @@ -1379,7 +1379,7 @@ typedef struct client { * skip certain types of initialization. This type is used to indicate a client that has been initialized * and can be used with addWritePreparedReply* functions. A client can be cast into this type with * prepareClientForFutureWrites(client *c). 
*/ -typedef client writePreparedClient; +typedef struct writePreparedClient writePreparedClient; /* ACL information */ typedef struct aclInfo { @@ -2789,6 +2789,7 @@ void addReply(client *c, robj *obj); void addReplyStatusLength(client *c, const char *s, size_t len); void addReplySds(client *c, sds s); void addReplyBulkSds(client *c, sds s); +void addWritePreparedReplyBulkSds(writePreparedClient *c, sds s); void setDeferredReplyBulkSds(client *c, void *node, sds s); void addReplyErrorObject(client *c, robj *err); void addReplyOrErrorObject(client *c, robj *reply); @@ -2808,6 +2809,7 @@ void addReplyLongLong(client *c, long long ll); void addReplyArrayLen(client *c, long length); void addWritePreparedReplyArrayLen(writePreparedClient *c, long length); void addReplyMapLen(client *c, long length); +void addWritePreparedReplyMapLen(writePreparedClient *c, long length); void addReplySetLen(client *c, long length); void addReplyAttributeLen(client *c, long length); void addReplyPushLen(client *c, long length); diff --git a/src/t_hash.c b/src/t_hash.c index 96252bc39b..dabe279808 100644 --- a/src/t_hash.c +++ b/src/t_hash.c @@ -788,7 +788,7 @@ void hstrlenCommand(client *c) { addReplyLongLong(c, hashTypeGetValueLength(o, c->argv[2]->ptr)); } -static void addHashIteratorCursorToReply(client *c, hashTypeIterator *hi, int what) { +static void addHashIteratorCursorToReply(writePreparedClient *wpc, hashTypeIterator *hi, int what) { if (hi->encoding == OBJ_ENCODING_LISTPACK) { unsigned char *vstr = NULL; unsigned int vlen = UINT_MAX; @@ -796,12 +796,12 @@ static void addHashIteratorCursorToReply(client *c, hashTypeIterator *hi, int wh hashTypeCurrentFromListpack(hi, what, &vstr, &vlen, &vll); if (vstr) - addReplyBulkCBuffer(c, vstr, vlen); + addWritePreparedReplyBulkCBuffer(wpc, vstr, vlen); else - addReplyBulkLongLong(c, vll); + addWritePreparedReplyBulkLongLong(wpc, vll); } else if (hi->encoding == OBJ_ENCODING_HT) { sds value = hashTypeCurrentFromHashTable(hi, what); - addReplyBulkCBuffer(c, value, sdslen(value)); + addWritePreparedReplyBulkCBuffer(wpc, value, sdslen(value)); } else { serverPanic("Unknown hash encoding"); } @@ -815,23 +815,25 @@ void genericHgetallCommand(client *c, int flags) { robj *emptyResp = (flags & OBJ_HASH_KEY && flags & OBJ_HASH_VALUE) ? shared.emptymap[c->resp] : shared.emptyarray; if ((o = lookupKeyReadOrReply(c, c->argv[1], emptyResp)) == NULL || checkType(c, o, OBJ_HASH)) return; + writePreparedClient *wpc = prepareClientForFutureWrites(c); + if (!wpc) return; /* We return a map if the user requested keys and values, like in the * HGETALL case. Otherwise to use a flat array makes more sense. 
*/ length = hashTypeLength(o); if (flags & OBJ_HASH_KEY && flags & OBJ_HASH_VALUE) { - addReplyMapLen(c, length); + addWritePreparedReplyMapLen(wpc, length); } else { - addReplyArrayLen(c, length); + addWritePreparedReplyArrayLen(wpc, length); } hashTypeInitIterator(o, &hi); while (hashTypeNext(&hi) != C_ERR) { if (flags & OBJ_HASH_KEY) { - addHashIteratorCursorToReply(c, &hi, OBJ_HASH_KEY); + addHashIteratorCursorToReply(wpc, &hi, OBJ_HASH_KEY); count++; } if (flags & OBJ_HASH_VALUE) { - addHashIteratorCursorToReply(c, &hi, OBJ_HASH_VALUE); + addHashIteratorCursorToReply(wpc, &hi, OBJ_HASH_VALUE); count++; } } @@ -871,18 +873,19 @@ void hscanCommand(client *c) { scanGenericCommand(c, o, cursor); } -static void hrandfieldReplyWithListpack(client *c, unsigned int count, listpackEntry *keys, listpackEntry *vals) { +static void hrandfieldReplyWithListpack(writePreparedClient *wpc, unsigned int count, listpackEntry *keys, listpackEntry *vals) { + client *c = (client *)wpc; for (unsigned long i = 0; i < count; i++) { - if (vals && c->resp > 2) addReplyArrayLen(c, 2); + if (vals && c->resp > 2) addWritePreparedReplyArrayLen(wpc, 2); if (keys[i].sval) - addReplyBulkCBuffer(c, keys[i].sval, keys[i].slen); + addWritePreparedReplyBulkCBuffer(wpc, keys[i].sval, keys[i].slen); else - addReplyBulkLongLong(c, keys[i].lval); + addWritePreparedReplyBulkLongLong(wpc, keys[i].lval); if (vals) { if (vals[i].sval) - addReplyBulkCBuffer(c, vals[i].sval, vals[i].slen); + addWritePreparedReplyBulkCBuffer(wpc, vals[i].sval, vals[i].slen); else - addReplyBulkLongLong(c, vals[i].lval); + addWritePreparedReplyBulkLongLong(wpc, vals[i].lval); } } } @@ -918,6 +921,8 @@ void hrandfieldWithCountCommand(client *c, long l, int withvalues) { return; } + writePreparedClient *wpc = prepareClientForFutureWrites(c); + if (!wpc) return; /* CASE 1: The count was negative, so the extraction method is just: * "return N random elements" sampling the whole set every time. * This case is trivial and can be served without auxiliary data @@ -925,18 +930,18 @@ void hrandfieldWithCountCommand(client *c, long l, int withvalues) { * elements in random order. */ if (!uniq || count == 1) { if (withvalues && c->resp == 2) - addReplyArrayLen(c, count * 2); + addWritePreparedReplyArrayLen(wpc, count * 2); else - addReplyArrayLen(c, count); + addWritePreparedReplyArrayLen(wpc, count); if (hash->encoding == OBJ_ENCODING_HT) { sds key, value; while (count--) { dictEntry *de = dictGetFairRandomKey(hash->ptr); key = dictGetKey(de); value = dictGetVal(de); - if (withvalues && c->resp > 2) addReplyArrayLen(c, 2); - addReplyBulkCBuffer(c, key, sdslen(key)); - if (withvalues) addReplyBulkCBuffer(c, value, sdslen(value)); + if (withvalues && c->resp > 2) addWritePreparedReplyArrayLen(wpc, 2); + addWritePreparedReplyBulkCBuffer(wpc, key, sdslen(key)); + if (withvalues) addWritePreparedReplyBulkCBuffer(wpc, value, sdslen(value)); if (c->flag.close_asap) break; } } else if (hash->encoding == OBJ_ENCODING_LISTPACK) { @@ -950,7 +955,7 @@ void hrandfieldWithCountCommand(client *c, long l, int withvalues) { sample_count = count > limit ? limit : count; count -= sample_count; lpRandomPairs(hash->ptr, sample_count, keys, vals); - hrandfieldReplyWithListpack(c, sample_count, keys, vals); + hrandfieldReplyWithListpack(wpc, sample_count, keys, vals); if (c->flag.close_asap) break; } zfree(keys); @@ -962,9 +967,9 @@ void hrandfieldWithCountCommand(client *c, long l, int withvalues) { /* Initiate reply count, RESP3 responds with nested array, RESP2 with flat one. 
*/ long reply_size = count < size ? count : size; if (withvalues && c->resp == 2) - addReplyArrayLen(c, reply_size * 2); + addWritePreparedReplyArrayLen(wpc, reply_size * 2); else - addReplyArrayLen(c, reply_size); + addWritePreparedReplyArrayLen(wpc, reply_size); /* CASE 2: * The number of requested elements is greater than the number of * elements inside the hash: simply return the whole hash. */ hashTypeIterator hi; hashTypeInitIterator(hash, &hi); while (hashTypeNext(&hi) != C_ERR) { - if (withvalues && c->resp > 2) addReplyArrayLen(c, 2); - addHashIteratorCursorToReply(c, &hi, OBJ_HASH_KEY); - if (withvalues) addHashIteratorCursorToReply(c, &hi, OBJ_HASH_VALUE); + if (withvalues && c->resp > 2) addWritePreparedReplyArrayLen(wpc, 2); + addHashIteratorCursorToReply(wpc, &hi, OBJ_HASH_KEY); + if (withvalues) addHashIteratorCursorToReply(wpc, &hi, OBJ_HASH_VALUE); } hashTypeResetIterator(&hi); return; @@ -994,7 +999,7 @@ void hrandfieldWithCountCommand(client *c, long l, int withvalues) { keys = zmalloc(sizeof(listpackEntry) * count); if (withvalues) vals = zmalloc(sizeof(listpackEntry) * count); serverAssert(lpRandomPairsUnique(hash->ptr, count, keys, vals) == count); - hrandfieldReplyWithListpack(c, count, keys, vals); + hrandfieldReplyWithListpack(wpc, count, keys, vals); zfree(keys); zfree(vals); return; @@ -1048,9 +1053,9 @@ void hrandfieldWithCountCommand(client *c, long l, int withvalues) { while ((de = dictNext(di)) != NULL) { sds key = dictGetKey(de); sds value = dictGetVal(de); - if (withvalues && c->resp > 2) addReplyArrayLen(c, 2); - addReplyBulkSds(c, key); - if (withvalues) addReplyBulkSds(c, value); + if (withvalues && c->resp > 2) addWritePreparedReplyArrayLen(wpc, 2); + addWritePreparedReplyBulkSds(wpc, key); + if (withvalues) addWritePreparedReplyBulkSds(wpc, value); } dictReleaseIterator(di); @@ -1081,7 +1086,7 @@ void hrandfieldWithCountCommand(client *c, long l, int withvalues) { added++; /* We can reply right away, so that we don't need to store the value in the dict. */ - if (withvalues && c->resp > 2) addReplyArrayLen(c, 2); + if (withvalues && c->resp > 2) addWritePreparedReplyArrayLen(wpc, 2); hashReplyFromListpackEntry(c, &key); if (withvalues) hashReplyFromListpackEntry(c, &value); } From f1beadb8321f3844ee2a78cd55afb8dd4b2f9ca6 Mon Sep 17 00:00:00 2001 From: Binbin Date: Sun, 13 Oct 2024 22:06:28 +0800 Subject: [PATCH 33/51] Fix aof race in shutdown nosave timedout script test (#1156) CI reports this failure: ``` *** [err]: SHUTDOWN NOSAVE can kill a timedout script anyway in tests/unit/scripting.tcl Expected 'BUSY Valkey is busy running a script. *' to match '*connection refused*' (context: type eval line 8 cmd {assert_match {*connection refused*} $e} proc ::test) ``` We can see in the logs that the shutdown got rejected because there is an AOFRW pending: ``` Writing initial AOF, can't exit. Errors trying to shut down the server. Check the logs for more information. ``` The reason is that the previous test enabled AOF. 
Signed-off-by: Binbin --- tests/unit/scripting.tcl | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/unit/scripting.tcl b/tests/unit/scripting.tcl index db37129103..d72139f178 100644 --- a/tests/unit/scripting.tcl +++ b/tests/unit/scripting.tcl @@ -1297,6 +1297,7 @@ start_server {tags {"scripting"}} { $rd close $rd2 close $r3 close + r config set appendonly no r DEBUG set-disable-deny-scripts 0 } {OK} {external:skip needs:debug} From 85461b8764dc3a74df4e2d3a5c85bc089c2defe4 Mon Sep 17 00:00:00 2001 From: Binbin Date: Mon, 14 Oct 2024 12:37:19 +0800 Subject: [PATCH 34/51] Minor comments cleanup around replication.c (#1154) Typo, comment cleanups. Signed-off-by: Binbin --- src/replication.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/src/replication.c b/src/replication.c index e2941c6d9c..948a2762bc 100644 --- a/src/replication.c +++ b/src/replication.c @@ -2470,7 +2470,7 @@ char *sendCommand(connection *conn, ...) { size_t argslen = 0; char *arg; - /* Create the command to send to the primarprimaryuse binary + /* Create the command to send to the primary, we use binary * protocol to make sure correct arguments are sent. This function * is not safe for all binary data. */ va_start(ap, conn); @@ -2592,7 +2592,7 @@ int sendCurrentOffsetToReplica(client *replica) { * This connection handler is used to initialize the RDB connection (dual-channel-replication). * Once a replica with dual-channel-replication enabled, denied from PSYNC with its primary, * fullSyncWithPrimary begins its role. The connection handler prepares server.repl_rdb_transfer_s - * for a rdb stream, and server.repl_transfer_s for increamental replication data stream. */ + * for a rdb stream, and server.repl_transfer_s for incremental replication data stream. */ static void fullSyncWithPrimary(connection *conn) { char *err = NULL; serverAssert(conn == server.repl_rdb_transfer_s); @@ -2629,7 +2629,7 @@ static void fullSyncWithPrimary(connection *conn) { return; } } - /* Send replica lisening port to primary for clarification */ + /* Send replica listening port to primary for clarification */ sds portstr = getReplicaPortString(); err = sendCommand(conn, "REPLCONF", "capa", "eof", "rdb-only", "1", "rdb-channel", "1", "listening-port", portstr, NULL); @@ -2893,7 +2893,7 @@ int streamReplDataBufToDb(client *c) { /* Replication: Replica side. * After done loading the snapshot using the rdb-channel prepare this replica for steady state by - * initializing the primary client, amd stream local increamental buffer into memory. */ + * initializing the primary client, amd stream local incremental buffer into memory. */ void dualChannelSyncSuccess(void) { server.primary_initial_offset = server.repl_provisional_primary.reploff; replicationResurrectProvisionalPrimary(); @@ -3237,10 +3237,10 @@ void setupMainConnForPsync(connection *conn) { if (psync_result == PSYNC_WAIT_REPLY) return; /* Try again later... */ if (psync_result == PSYNC_CONTINUE) { - serverLog(LL_NOTICE, "Primary <-> REPLICA sync: Primary accepted a Partial Resynchronization%s", + serverLog(LL_NOTICE, "PRIMARY <-> REPLICA sync: Primary accepted a Partial Resynchronization%s", server.repl_rdb_transfer_s != NULL ? ", RDB load in background." : "."); if (server.supervised_mode == SUPERVISED_SYSTEMD) { - serverCommunicateSystemd("STATUS=Primary <-> REPLICA sync: Partial Resynchronization accepted. Ready to " + serverCommunicateSystemd("STATUS=PRIMARY <-> REPLICA sync: Partial Resynchronization accepted. 
Ready to " "accept connections in read-write mode.\n"); } dualChannelSyncHandlePsync(); @@ -3262,7 +3262,7 @@ void setupMainConnForPsync(connection *conn) { * - Reduce primary memory load. We do that by moving the COB tracking to the replica side. This also decrease * the chance for COB overruns. Note that primary's input buffer limits at the replica side are less restricted * then primary's COB as the replica plays less critical part in the replication group. While increasing the - * primary’s COB may end up with primary reaching swap and clients suffering, at replica side we’re more at + * primary's COB may end up with primary reaching swap and clients suffering, at replica side we're more at * ease with it. Larger COB means better chance to sync successfully. * - Reduce primary main process CPU load. By opening a new, dedicated channel for the RDB transfer, child * processes can have direct access to the new channel. Due to TLS connection restrictions, this was not @@ -3299,21 +3299,21 @@ void setupMainConnForPsync(connection *conn) { * │RECEIVE_AUTH_REPLY │ │ │ └───────┬───────────────────────┘ │ └──┬────────────────┘ │ * └────────┬──────────┘ │ │ │+OK │ │+OK │ * │+OK │ │ ┌───────▼───────────────────────┐ │ ┌──▼────────────────┐ │ - * ┌────────▼──────────┐ │ │ │DUAL_CHANNEL_RECEIVE_REPLCONF_.│ │ │SEND_PSYNC │ │ + * ┌────────▼──────────┐ │ │ │DUAL_CHANNEL_RECEIVE_REPLCONF_REPLY│SEND_PSYNC │ │ * │RECEIVE_PORT_REPLY │ │ │ └───────┬───────────────────────┘ │ └──┬────────────────┘ │ * └────────┬──────────┘ │ │ │+OK │ │PSYNC use snapshot │ - * │+OK │ │ ┌───────▼────────────────┐ │ │end-offset provided │ - * ┌────────▼──────────┐ │ │ │DUAL_CHANNEL_RECEIVE_EN.│ │ │by the primary │ - * │RECEIVE_IP_REPLY │ │ │ └───────┬────────────────┘ │ ┌──▼────────────────┐ │ + * │+OK │ │ ┌───────▼───────────────────┐ │ │end-offset provided │ + * ┌────────▼──────────┐ │ │ │DUAL_CHANNEL_RECEIVE_ENDOFF│ │ │by the primary │ + * │RECEIVE_IP_REPLY │ │ │ └───────┬───────────────────┘ │ ┌──▼────────────────┐ │ * └────────┬──────────┘ │ │ │$ENDOFF │ │RECEIVE_PSYNC_REPLY│ │ - * │+OK │ │ ├─────────────────────────┘ └──┬────────────────┘ │ - * ┌────────▼──────────┐ │ │ │ │+CONTINUE │ - * │RECEIVE_IP_REPLY │ │ │ ┌───────▼───────────────┐ ┌──▼────────────────┐ │ - * └────────┬──────────┘ │ │ │DUAL_CHANNEL_RDB_LOAD │ │TRANSFER │ │ + * │ │ │ ├─────────────────────────┘ └──┬────────────────┘ │ + * │ │ │ │ │+CONTINUE │ + * │ │ │ ┌───────▼───────────────┐ ┌──▼────────────────┐ │ + * │ │ │ │DUAL_CHANNEL_RDB_LOAD │ │TRANSFER │ │ * │+OK │ │ └───────┬───────────────┘ └─────┬─────────────┘ │ * ┌────────▼──────────┐ │ │ │Done loading │ │ * │RECEIVE_CAPA_REPLY │ │ │ ┌───────▼───────────────┐ │ │ - * └────────┬──────────┘ │ │ │DUAL_CHANNEL_RDB_LOADE.│ │ │ + * └────────┬──────────┘ │ │ │DUAL_CHANNEL_RDB_LOADED│ │ │ * │ │ │ └───────┬───────────────┘ │ │ * ┌────────▼───┐ │ │ │ │ │ * │SEND_PSYNC │ │ │ │Replica loads local replication │ │ From 892fc8425c23e16dd7c1522aeaed783508a26b46 Mon Sep 17 00:00:00 2001 From: ranshid <88133677+ranshid@users.noreply.github.com> Date: Mon, 14 Oct 2024 10:30:29 +0300 Subject: [PATCH 35/51] Deflake test Primary COB growth with inactive replica (#1165) in case of valgrind run, the replica might get disconnected from the primary due to repl-timeout reached. Fix is to configure larger timeout in case of valgrind test. 
**Partially** fixes: #1152 Signed-off-by: Ran Shidlansik --- tests/integration/dual-channel-replication.tcl | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/tests/integration/dual-channel-replication.tcl b/tests/integration/dual-channel-replication.tcl index f8c9eae44d..b1aab1c23a 100644 --- a/tests/integration/dual-channel-replication.tcl +++ b/tests/integration/dual-channel-replication.tcl @@ -500,7 +500,11 @@ start_server {tags {"dual-channel-replication external:skip"}} { $primary config set dual-channel-replication-enabled yes $primary config set repl-backlog-size $backlog_size $primary config set loglevel debug - $primary config set repl-timeout 10 + if {$::valgrind} { + $primary config set repl-timeout 100 + } else { + $primary config set repl-timeout 10 + } $primary config set rdb-key-save-delay 200 populate 10000 primary 10000 @@ -510,7 +514,11 @@ start_server {tags {"dual-channel-replication external:skip"}} { $replica config set dual-channel-replication-enabled yes $replica config set loglevel debug - $replica config set repl-timeout 10 + if {$::valgrind} { + $primary config set repl-timeout 100 + } else { + $primary config set repl-timeout 10 + } # Pause replica after primary fork $replica debug pause-after-fork 1 From 41c4a320c221909af34db59711f401e1bca28fa3 Mon Sep 17 00:00:00 2001 From: ranshid <88133677+ranshid@users.noreply.github.com> Date: Mon, 14 Oct 2024 10:31:59 +0300 Subject: [PATCH 36/51] Deflake test sync should continue if not all slaves dropped dual-channel-replication (#1164) Sometimes when dual-channel is turned off the tested replica might disconnect on COB overrun. Disable the replica COB limit in order to prevent such cases. Fixes: #1153 Signed-off-by: Ran Shidlansik Signed-off-by: Binbin Co-authored-by: Binbin --- tests/integration/dual-channel-replication.tcl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/integration/dual-channel-replication.tcl b/tests/integration/dual-channel-replication.tcl index b1aab1c23a..5302030db9 100644 --- a/tests/integration/dual-channel-replication.tcl +++ b/tests/integration/dual-channel-replication.tcl @@ -842,11 +842,11 @@ start_server {tags {"dual-channel-replication external:skip"}} { $primary config set dual-channel-replication-enabled yes $primary config set loglevel debug $primary config set repl-diskless-sync-delay 5 - + $primary config set client-output-buffer-limit "replica 0 0 0" + # Generating RDB will cost 5s(10000 * 0.0005s) $primary debug populate 10000 primary 1 $primary config set rdb-key-save-delay 500 - $primary config set dual-channel-replication-enabled $dualchannel start_server {} { From 2010f69d893b01f2d9146066ed743c7280d84aaa Mon Sep 17 00:00:00 2001 From: Binbin Date: Tue, 15 Oct 2024 10:29:34 +0800 Subject: [PATCH 37/51] Set fail-fast to false in daily CI (#1162) Currently in our daily CI, if a job fails, it will cancel the other jobs in the same matrix. We want to avoid this so that all jobs in a matrix can eventually run to completion. Docs: jobs.<job_id>.strategy.fail-fast applies to the entire matrix. If jobs.<job_id>.strategy.fail-fast is set to true or its expression evaluates to true, GitHub will cancel all in-progress and queued jobs in the matrix if any job in the matrix fails. This property defaults to true. 
Signed-off-by: Binbin --- .github/workflows/daily.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/daily.yml b/.github/workflows/daily.yml index 1edf76d386..bcfa35c939 100644 --- a/.github/workflows/daily.yml +++ b/.github/workflows/daily.yml @@ -607,6 +607,7 @@ jobs: !contains(github.event.inputs.skipjobs, 'sanitizer') timeout-minutes: 14400 strategy: + fail-fast: false matrix: compiler: [gcc, clang] env: @@ -659,6 +660,7 @@ jobs: !contains(github.event.inputs.skipjobs, 'sanitizer') timeout-minutes: 14400 strategy: + fail-fast: false matrix: compiler: [gcc, clang] env: @@ -1001,6 +1003,7 @@ jobs: build-macos: strategy: + fail-fast: false matrix: os: [macos-12, macos-14] runs-on: ${{ matrix.os }} From 6a05d59d57a989abf635a09c92c18a7b144ba91e Mon Sep 17 00:00:00 2001 From: Binbin Date: Tue, 15 Oct 2024 10:29:52 +0800 Subject: [PATCH 38/51] Use listLast to replace listIndex -1 (#1163) Minor cleanup, listLast does the same thing and is widely used and easier to understand (less code). Signed-off-by: Binbin --- src/aof.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/aof.c b/src/aof.c index bc29bb0d9e..e0ca6fbb61 100644 --- a/src/aof.c +++ b/src/aof.c @@ -476,7 +476,7 @@ sds getLastIncrAofName(aofManifest *am) { } /* Or return the last one. */ - listNode *lastnode = listIndex(am->incr_aof_list, -1); + listNode *lastnode = listLast(am->incr_aof_list); aofInfo *ai = listNodeValue(lastnode); return ai->file_name; } From 3967affb88404bc1db296def7a7ceeb6ee9f87d4 Mon Sep 17 00:00:00 2001 From: Binbin Date: Tue, 15 Oct 2024 10:30:03 +0800 Subject: [PATCH 39/51] Minor cleanups in acl-v2 tests (#1166) 1. Make sure to assert the ERR prefix. 2. Match "Syntax error*" in case the message changes. Signed-off-by: Binbin --- tests/unit/acl-v2.tcl | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/tests/unit/acl-v2.tcl b/tests/unit/acl-v2.tcl index a509bc0ff6..e8229d1b36 100644 --- a/tests/unit/acl-v2.tcl +++ b/tests/unit/acl-v2.tcl @@ -42,19 +42,19 @@ start_server {tags {"acl external:skip"}} { test {Test selector syntax error reports the error in the selector context} { catch {r ACL SETUSER selector-syntax on (this-is-invalid)} e - assert_match "*ERR Error in ACL SETUSER modifier '(*)*Syntax*" $e + assert_match "ERR Error in ACL SETUSER modifier '(*)*Syntax*" $e catch {r ACL SETUSER selector-syntax on (&* &fail)} e - assert_match "*ERR Error in ACL SETUSER modifier '(*)*Adding a pattern after the*" $e + assert_match "ERR Error in ACL SETUSER modifier '(*)*Adding a pattern after the*" $e catch {r ACL SETUSER selector-syntax on (+PING (+SELECT (+DEL} e - assert_match "*ERR Unmatched parenthesis in acl selector*" $e + assert_match "ERR Unmatched parenthesis in acl selector*" $e catch {r ACL SETUSER selector-syntax on (+PING (+SELECT (+DEL ) ) ) } e - assert_match "*ERR Error in ACL SETUSER modifier*" $e + assert_match "ERR Error in ACL SETUSER modifier*" $e catch {r ACL SETUSER selector-syntax on (+PING (+SELECT (+DEL ) } e - assert_match "*ERR Error in ACL SETUSER modifier*" $e + assert_match "ERR Error in ACL SETUSER modifier*" $e assert_equal "" [r ACL GETUSER selector-syntax] } @@ -75,11 +75,11 @@ start_server {tags {"acl external:skip"}} { # Test invalid selector syntax catch {r ACL SETUSER invalid-selector " () "} err - assert_match "*ERR*Syntax error*" $err + assert_match "ERR*Syntax error*" $err catch {r ACL SETUSER invalid-selector (} err - assert_match "*Unmatched parenthesis*" $err + assert_match "ERR*Unmatched 
parenthesis*" $err catch {r ACL SETUSER invalid-selector )} err - assert_match "*ERR*Syntax error" $err + assert_match "ERR*Syntax error*" $err } test {Test separate read permission} { From a05fc0e5ec891c750c8b1e0bad63a6cfd8de7d8e Mon Sep 17 00:00:00 2001 From: "Romain Geissler @ Amadeus" Date: Tue, 15 Oct 2024 13:05:22 +0200 Subject: [PATCH 40/51] Rename z{malloc,calloc,realloc,free} into valkey_{malloc,calloc,realloc,free} (#1169) The zcalloc symbol is a symbol name already used by zlib, which is defining other names using the "z" prefix specific to zlib. In practice, linking valkey with a static openssl, which itself might depend on a static libz will result in link time error rejecting multiple symbol definitions. Fixes: #1157 Signed-off-by: Romain Geissler --- deps/hdr_histogram/hdr_redis_malloc.h | 12 ++++++------ src/zmalloc.h | 9 +++++++++ 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/deps/hdr_histogram/hdr_redis_malloc.h b/deps/hdr_histogram/hdr_redis_malloc.h index d9401ca70b..a40f710d9d 100644 --- a/deps/hdr_histogram/hdr_redis_malloc.h +++ b/deps/hdr_histogram/hdr_redis_malloc.h @@ -1,13 +1,13 @@ #ifndef HDR_MALLOC_H__ #define HDR_MALLOC_H__ -void *zmalloc(size_t size); +void *valkey_malloc(size_t size); void *zcalloc_num(size_t num, size_t size); -void *zrealloc(void *ptr, size_t size); -void zfree(void *ptr); +void *valkey_realloc(void *ptr, size_t size); +void valkey_free(void *ptr); -#define hdr_malloc zmalloc +#define hdr_malloc valkey_malloc #define hdr_calloc zcalloc_num -#define hdr_realloc zrealloc -#define hdr_free zfree +#define hdr_realloc valkey_realloc +#define hdr_free valkey_free #endif diff --git a/src/zmalloc.h b/src/zmalloc.h index d5301318aa..9b51f4c866 100644 --- a/src/zmalloc.h +++ b/src/zmalloc.h @@ -107,6 +107,15 @@ #define HAVE_DEFRAG #endif +/* The zcalloc symbol is a symbol name already used by zlib, which is defining + * other names using the "z" prefix specific to zlib. In practice, linking + * valkey with a static openssl, which itself might depend on a static libz + * will result in link time error rejecting multiple symbol definitions. */ +#define zmalloc valkey_malloc +#define zcalloc valkey_calloc +#define zrealloc valkey_realloc +#define zfree valkey_free + /* 'noinline' attribute is intended to prevent the `-Wstringop-overread` warning * when using gcc-12 later with LTO enabled. It may be removed once the * bug[https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96503] is fixed. */ From b92675dc6f1478855685a043fb1cfabcb7b2c26a Mon Sep 17 00:00:00 2001 From: Binbin Date: Tue, 15 Oct 2024 23:32:22 +0800 Subject: [PATCH 41/51] Take hz into account in activerehashing to avoid CPU spikes (#977) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently in conf we describe activerehashing as: Active rehashing uses 1 millisecond every 100 milliseconds of CPU time. This is the case for hz = 10. If we change hz, the description in conf will be inaccurate. Users may notice that the server spends some CPU (used in activerehashing) at high hz but don't know why, since our cron calls are fixed to 1ms. This PR takes hz into account and fixed the CPU usage at 1% (this may not be accurate in some cases because we do 100 step rehashing in dictRehashMicroseconds but it can avoid CPU spikes in this case). This PR also improves the description of the activerehashing configuration item to explain this change. 
Signed-off-by: Binbin Co-authored-by: Viktor Söderqvist --- src/server.c | 9 +++++---- src/server.h | 5 ++--- valkey.conf | 34 ++++++++++++++++------------------ 3 files changed, 23 insertions(+), 25 deletions(-) diff --git a/src/server.c b/src/server.c index 653cfa7794..ab95f84346 100644 --- a/src/server.c +++ b/src/server.c @@ -1090,12 +1090,13 @@ void databasesCron(void) { /* Rehash */ if (server.activerehashing) { uint64_t elapsed_us = 0; + uint64_t threshold_us = 1 * 1000000 / server.hz / 100; for (j = 0; j < dbs_per_call; j++) { serverDb *db = &server.db[rehash_db % server.dbnum]; - elapsed_us += kvstoreIncrementallyRehash(db->keys, INCREMENTAL_REHASHING_THRESHOLD_US - elapsed_us); - if (elapsed_us >= INCREMENTAL_REHASHING_THRESHOLD_US) break; - elapsed_us += kvstoreIncrementallyRehash(db->expires, INCREMENTAL_REHASHING_THRESHOLD_US - elapsed_us); - if (elapsed_us >= INCREMENTAL_REHASHING_THRESHOLD_US) break; + elapsed_us += kvstoreIncrementallyRehash(db->keys, threshold_us - elapsed_us); + if (elapsed_us >= threshold_us) break; + elapsed_us += kvstoreIncrementallyRehash(db->expires, threshold_us - elapsed_us); + if (elapsed_us >= threshold_us) break; rehash_db++; } } diff --git a/src/server.h b/src/server.h index 44ba429b16..4cc93ed8aa 100644 --- a/src/server.h +++ b/src/server.h @@ -140,9 +140,8 @@ struct hdr_histogram; #define CONFIG_BINDADDR_MAX 16 #define CONFIG_MIN_RESERVED_FDS 32 #define CONFIG_DEFAULT_PROC_TITLE_TEMPLATE "{title} {listen-addr} {server-mode}" -#define DEFAULT_WAIT_BEFORE_RDB_CLIENT_FREE 60 /* Grace period in seconds for replica main \ - channel to establish psync. */ -#define INCREMENTAL_REHASHING_THRESHOLD_US 1000 +#define DEFAULT_WAIT_BEFORE_RDB_CLIENT_FREE 60 /* Grace period in seconds for replica main \ + * channel to establish psync. */ #define LOADING_PROCESS_EVENTS_INTERVAL_DEFAULT 100 /* Default: 0.1 seconds */ /* Bucket sizes for client eviction pools. Each bucket stores clients with diff --git a/valkey.conf b/valkey.conf index eabb5aa5d1..f485b42b1a 100644 --- a/valkey.conf +++ b/valkey.conf @@ -2082,24 +2082,22 @@ hll-sparse-max-bytes 3000 stream-node-max-bytes 4096 stream-node-max-entries 100 -# Active rehashing uses 1 millisecond every 100 milliseconds of CPU time in -# order to help rehashing the main server hash table (the one mapping top-level -# keys to values). The hash table implementation the server uses (see dict.c) -# performs a lazy rehashing: the more operation you run into a hash table -# that is rehashing, the more rehashing "steps" are performed, so if the -# server is idle the rehashing is never complete and some more memory is used -# by the hash table. -# -# The default is to use this millisecond 10 times every second in order to -# actively rehash the main dictionaries, freeing memory when possible. -# -# If unsure: -# use "activerehashing no" if you have hard latency requirements and it is -# not a good thing in your environment that the server can reply from time to time -# to queries with 2 milliseconds delay. -# -# use "activerehashing yes" if you don't have such hard requirements but -# want to free memory asap when possible. +# Active rehashing uses 1% of the CPU time to help perform incremental rehashing +# of the main server hash tables, the ones mapping top-level keys to values. 
+# +# If active rehashing is disabled and rehashing is needed, a hash table is +# rehashed one "step" on every operation performed on the hash table (add, find, +# etc.), so if the server is idle, the rehashing may never complete and some +# more memory is used by the hash tables. Active rehashing helps prevent this. +# +# Active rehashing runs as a background task. Depending on the value of 'hz', +# the frequency at which the server performs background tasks, active rehashing +# can cause the server to freeze for a short time. For example, if 'hz' is set +# to 10, active rehashing runs for up to one millisecond every 100 milliseconds. +# If a freeze of one millisecond is not acceptable, you can increase 'hz' to let +# active rehashing run more often. If instead 'hz' is set to 100, active +# rehashing runs up to only 100 microseconds every 10 milliseconds. The total is +# still 1% of the time. activerehashing yes # The client output buffer limits can be used to force disconnection of clients From 881019c9f482be21c9012005c473b540f0bcd06b Mon Sep 17 00:00:00 2001 From: Binbin Date: Tue, 15 Oct 2024 23:32:42 +0800 Subject: [PATCH 42/51] Fix FUNCTION KILL error message being displayed as SCRIPT KILL (#1171) The client that was killed by FUNCTION KILL received a reply of SCRIPT KILL and the server log also showed SCRIPT KILL. Signed-off-by: Binbin --- src/script_lua.c | 10 ++++++++-- tests/unit/functions.tcl | 6 ++++++ 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/src/script_lua.c b/src/script_lua.c index e378157d8c..5093fa944f 100644 --- a/src/script_lua.c +++ b/src/script_lua.c @@ -1609,7 +1609,13 @@ static void luaMaskCountHook(lua_State *lua, lua_Debug *ar) { scriptRunCtx *rctx = luaGetFromRegistry(lua, REGISTRY_RUN_CTX_NAME); serverAssert(rctx); /* Only supported inside script invocation */ if (scriptInterrupt(rctx) == SCRIPT_KILL) { - serverLog(LL_NOTICE, "Lua script killed by user with SCRIPT KILL."); + char *err = NULL; + if (rctx->flags & SCRIPT_EVAL_MODE) { + err = "Script killed by user with SCRIPT KILL."; + } else { + err = "Script killed by user with FUNCTION KILL."; + } + serverLog(LL_NOTICE, "%s", err); /* * Set the hook to invoke all the time so the user @@ -1618,7 +1624,7 @@ static void luaMaskCountHook(lua_State *lua, lua_Debug *ar) { */ lua_sethook(lua, luaMaskCountHook, LUA_MASKLINE, 0); - luaPushError(lua, "Script killed by user with SCRIPT KILL..."); + luaPushError(lua, err); luaError(lua); } } diff --git a/tests/unit/functions.tcl b/tests/unit/functions.tcl index 6a720419d3..7ddd36dd7d 100644 --- a/tests/unit/functions.tcl +++ b/tests/unit/functions.tcl @@ -246,6 +246,8 @@ start_server {tags {"scripting"}} { r function kill after 200 ; # Give some time to Lua to call the hook again... assert_equal [r ping] "PONG" + assert_error {ERR Script killed by user with FUNCTION KILL*} {$rd read} + $rd close } test {FUNCTION - test script kill not working on function} { @@ -261,6 +263,8 @@ start_server {tags {"scripting"}} { r function kill after 200 ; # Give some time to Lua to call the hook again... assert_equal [r ping] "PONG" + assert_error {ERR Script killed by user with FUNCTION KILL*} {$rd read} + $rd close } test {FUNCTION - test function kill not working on eval} { @@ -275,6 +279,8 @@ start_server {tags {"scripting"}} { r script kill after 200 ; # Give some time to Lua to call the hook again... 
assert_equal [r ping] "PONG"
+        assert_error {ERR Script killed by user with SCRIPT KILL*} {$rd read}
+        $rd close
    }

    test {FUNCTION - test function flush} {

From 3357ea3db6530b0750d655c84258c67bb9e69d23 Mon Sep 17 00:00:00 2001
From: Amit Nagler <58042354+naglera@users.noreply.github.com>
Date: Tue, 15 Oct 2024 19:26:42 +0300
Subject: [PATCH 43/51] Refactor return and goto statements (#945)

Consolidate the cleanup of local variables to a single point within the
method, ensuring proper resource management and preventing memory leaks
or double-free issues.

Previously discussed here:
- https://github.com/valkey-io/valkey/pull/60#discussion_r1667872633
- https://github.com/valkey-io/valkey/pull/60#discussion_r1668045666

---------

Signed-off-by: naglera
Signed-off-by: Amit Nagler <58042354+naglera@users.noreply.github.com>
Co-authored-by: Ping Xie
---
 src/replication.c | 422 +++++++++++++++++++++++++++-------------------
 src/server.h      |   1 +
 2 files changed, 246 insertions(+), 177 deletions(-)

diff --git a/src/replication.c b/src/replication.c
index 948a2762bc..63433de865 100644
--- a/src/replication.c
+++ b/src/replication.c
@@ -53,8 +53,9 @@ int replicaPutOnline(client *replica);
 void replicaStartCommandStream(client *replica);
 int cancelReplicationHandshake(int reconnect);
 void replicationSteadyStateInit(void);
-void setupMainConnForPsync(connection *conn);
+void dualChannelSetupMainConnForPsync(connection *conn);
 void dualChannelSyncHandleRdbLoadCompletion(void);
+static void dualChannelFullSyncWithPrimary(connection *conn);

 /* We take a global flag to remember if this instance generated an RDB
 * because of replication, so that we can remove the RDB file in case
@@ -2588,13 +2589,135 @@ int sendCurrentOffsetToReplica(client *replica) {
     return C_OK;
 }

+static int dualChannelReplHandleHandshake(connection *conn, sds *err) {
+    serverLog(LL_DEBUG, "Received first reply from primary using rdb connection.");
+    /* AUTH with the primary if required.
*/ + if (server.primary_auth) { + char *args[] = {"AUTH", NULL, NULL}; + size_t lens[] = {4, 0, 0}; + int argc = 1; + if (server.primary_user) { + args[argc] = server.primary_user; + lens[argc] = strlen(server.primary_user); + argc++; + } + args[argc] = server.primary_auth; + lens[argc] = sdslen(server.primary_auth); + argc++; + *err = sendCommandArgv(conn, argc, args, lens); + if (*err) { + serverLog(LL_WARNING, "Sending command to primary in dual channel replication handshake: %s", *err); + return C_ERR; + } + } + /* Send replica listening port to primary for clarification */ + sds portstr = getReplicaPortString(); + *err = sendCommand(conn, "REPLCONF", "capa", "eof", "rdb-only", "1", "rdb-channel", "1", "listening-port", portstr, + NULL); + sdsfree(portstr); + if (*err) { + serverLog(LL_WARNING, "Sending command to primary in dual channel replication handshake: %s", *err); + return C_ERR; + } + + if (connSetReadHandler(conn, dualChannelFullSyncWithPrimary) == C_ERR) { + char conninfo[CONN_INFO_LEN]; + serverLog(LL_WARNING, "Can't create readable event for SYNC: %s (%s)", strerror(errno), + connGetInfo(conn, conninfo, sizeof(conninfo))); + return C_ERR; + } + return C_OK; +} + +static int dualChannelReplHandleAuthReply(connection *conn, sds *err) { + *err = receiveSynchronousResponse(conn); + if (*err == NULL) { + serverLog(LL_WARNING, "Primary did not respond to auth command during SYNC handshake"); + return C_ERR; + } + if ((*err)[0] == '-') { + serverLog(LL_WARNING, "Unable to AUTH to Primary: %s", *err); + return C_ERR; + } + server.repl_rdb_channel_state = REPL_DUAL_CHANNEL_RECEIVE_REPLCONF_REPLY; + return C_OK; +} + +static int dualChannelReplHandleReplconfReply(connection *conn, sds *err) { + *err = receiveSynchronousResponse(conn); + if (*err == NULL) { + serverLog(LL_WARNING, "Primary did not respond to replconf command during SYNC handshake"); + return C_ERR; + } + + if (*err[0] == '-') { + serverLog(LL_NOTICE, "Server does not support sync with offset, dual channel sync approach cannot be used: %s", + *err); + return C_ERR; + } + if (connSyncWrite(conn, "SYNC\r\n", 6, server.repl_syncio_timeout * 1000) == -1) { + serverLog(LL_WARNING, "I/O error writing to Primary: %s", connGetLastError(conn)); + return C_ERR; + } + return C_OK; +} + +static int dualChannelReplHandleEndOffsetResponse(connection *conn, sds *err) { + uint64_t rdb_client_id; + *err = receiveSynchronousResponse(conn); + if (*err == NULL) { + return C_ERR; + } + if (*err[0] == '\0') { + /* Retry again later */ + serverLog(LL_DEBUG, "Received empty $ENDOFF response"); + return C_RETRY; + } + long long reploffset; + char primary_replid[CONFIG_RUN_ID_SIZE + 1]; + int dbid; + /* Parse end offset response */ + char *endoff_format = "$ENDOFF:%lld %40s %d %llu"; + if (sscanf(*err, endoff_format, &reploffset, primary_replid, &dbid, &rdb_client_id) != 4) { + serverLog(LL_WARNING, "Received unexpected $ENDOFF response: %s", *err); + return C_ERR; + } + server.rdb_client_id = rdb_client_id; + server.primary_initial_offset = reploffset; + + /* Initiate repl_provisional_primary to act as this replica temp primary until RDB is loaded */ + server.repl_provisional_primary.conn = server.repl_transfer_s; + memcpy(server.repl_provisional_primary.replid, primary_replid, CONFIG_RUN_ID_SIZE); + server.repl_provisional_primary.reploff = reploffset; + server.repl_provisional_primary.read_reploff = reploffset; + server.repl_provisional_primary.dbid = dbid; + + /* Now that we have the snapshot end-offset, we can ask for psync from that 
offset. Prepare the + * main connection accordingly.*/ + server.repl_transfer_s->state = CONN_STATE_CONNECTED; + server.repl_state = REPL_STATE_SEND_HANDSHAKE; + serverAssert(connSetReadHandler(server.repl_transfer_s, dualChannelSetupMainConnForPsync) != C_ERR); + dualChannelSetupMainConnForPsync(server.repl_transfer_s); + + /* As the next block we will receive using this connection is the rdb, we need to prepare + * the connection accordingly */ + serverAssert(connSetReadHandler(server.repl_rdb_transfer_s, readSyncBulkPayload) != C_ERR); + server.repl_transfer_size = -1; + server.repl_transfer_read = 0; + server.repl_transfer_last_fsync_off = 0; + server.repl_transfer_lastio = server.unixtime; + + return C_OK; +} + /* Replication: Replica side. * This connection handler is used to initialize the RDB connection (dual-channel-replication). * Once a replica with dual-channel-replication enabled, denied from PSYNC with its primary, - * fullSyncWithPrimary begins its role. The connection handler prepares server.repl_rdb_transfer_s + * dualChannelFullSyncWithPrimary begins its role. The connection handler prepares server.repl_rdb_transfer_s * for a rdb stream, and server.repl_transfer_s for incremental replication data stream. */ -static void fullSyncWithPrimary(connection *conn) { +static void dualChannelFullSyncWithPrimary(connection *conn) { char *err = NULL; + int ret = 0; serverAssert(conn == server.repl_rdb_transfer_s); /* If this event fired after the user turned the instance into a primary * with REPLICAOF NO ONE we must just return ASAP. */ @@ -2607,138 +2730,40 @@ static void fullSyncWithPrimary(connection *conn) { serverLog(LL_WARNING, "Error condition on socket for dual channel replication: %s", connGetLastError(conn)); goto error; } - /* Send replica capabilities */ - if (server.repl_rdb_channel_state == REPL_DUAL_CHANNEL_SEND_HANDSHAKE) { - serverLog(LL_DEBUG, "Received first reply from primary using rdb connection."); - /* AUTH with the primary if required. 
*/ + switch (server.repl_rdb_channel_state) { + case REPL_DUAL_CHANNEL_SEND_HANDSHAKE: + ret = dualChannelReplHandleHandshake(conn, &err); + if (ret == C_OK) server.repl_rdb_channel_state = REPL_DUAL_CHANNEL_RECEIVE_AUTH_REPLY; + break; + case REPL_DUAL_CHANNEL_RECEIVE_AUTH_REPLY: if (server.primary_auth) { - char *args[] = {"AUTH", NULL, NULL}; - size_t lens[] = {4, 0, 0}; - int argc = 1; - if (server.primary_user) { - args[argc] = server.primary_user; - lens[argc] = strlen(server.primary_user); - argc++; - } - args[argc] = server.primary_auth; - lens[argc] = sdslen(server.primary_auth); - argc++; - err = sendCommandArgv(conn, argc, args, lens); - if (err) { - serverLog(LL_WARNING, "Sending command to primary in dual channel replication handshake: %s", err); - return; - } - } - /* Send replica listening port to primary for clarification */ - sds portstr = getReplicaPortString(); - err = sendCommand(conn, "REPLCONF", "capa", "eof", "rdb-only", "1", "rdb-channel", "1", "listening-port", - portstr, NULL); - sdsfree(portstr); - if (err) { - serverLog(LL_WARNING, "Sending command to primary in dual channel replication handshake: %s", err); - return; - } - server.repl_rdb_channel_state = REPL_DUAL_CHANNEL_RECEIVE_AUTH_REPLY; - - if (connSetReadHandler(conn, fullSyncWithPrimary) == C_ERR) { - char conninfo[CONN_INFO_LEN]; - serverLog(LL_WARNING, "Can't create readable event for SYNC: %s (%s)", strerror(errno), - connGetInfo(conn, conninfo, sizeof(conninfo))); - goto error; - } - return; - } - if (server.repl_rdb_channel_state == REPL_DUAL_CHANNEL_RECEIVE_AUTH_REPLY && !server.primary_auth) { - server.repl_rdb_channel_state = REPL_DUAL_CHANNEL_RECEIVE_REPLCONF_REPLY; - } - /* Receive AUTH reply. */ - if (server.repl_rdb_channel_state == REPL_DUAL_CHANNEL_RECEIVE_AUTH_REPLY) { - err = receiveSynchronousResponse(conn); - if (err == NULL) { - serverLog(LL_WARNING, "Primary did not respond to auth command during SYNC handshake"); - goto error; - } - if (err[0] == '-') { - serverLog(LL_WARNING, "Unable to AUTH to Primary: %s", err); - goto error; + ret = dualChannelReplHandleAuthReply(conn, &err); + if (ret == C_OK) server.repl_rdb_channel_state = REPL_DUAL_CHANNEL_RECEIVE_REPLCONF_REPLY; + /* Wait for next bulk before trying to read replconf reply. 
*/ + break; } - sdsfree(err); server.repl_rdb_channel_state = REPL_DUAL_CHANNEL_RECEIVE_REPLCONF_REPLY; - return; - } - /* Receive replconf response */ - if (server.repl_rdb_channel_state == REPL_DUAL_CHANNEL_RECEIVE_REPLCONF_REPLY) { - err = receiveSynchronousResponse(conn); - if (err == NULL) { - serverLog(LL_WARNING, "Primary did not respond to replconf command during SYNC handshake"); - goto error; - } + /* fall through */ + case REPL_DUAL_CHANNEL_RECEIVE_REPLCONF_REPLY: + ret = dualChannelReplHandleReplconfReply(conn, &err); + if (ret == C_OK) server.repl_rdb_channel_state = REPL_DUAL_CHANNEL_RECEIVE_ENDOFF; + break; + case REPL_DUAL_CHANNEL_RECEIVE_ENDOFF: + ret = dualChannelReplHandleEndOffsetResponse(conn, &err); + if (ret == C_OK) server.repl_rdb_channel_state = REPL_DUAL_CHANNEL_RDB_LOAD; + break; + default: + serverPanic("Unexpected dual replication state: %d", server.repl_rdb_channel_state); + } + if (ret == C_ERR) goto error; + sdsfree(err); + return; - if (err[0] == '-') { - serverLog(LL_NOTICE, - "Server does not support sync with offset, dual channel sync approach cannot be used: %s", err); - goto error; - } - if (connSyncWrite(conn, "SYNC\r\n", 6, server.repl_syncio_timeout * 1000) == -1) { - serverLog(LL_WARNING, "I/O error writing to Primary: %s", connGetLastError(conn)); - goto error; - } - sdsfree(err); - server.repl_rdb_channel_state = REPL_DUAL_CHANNEL_RECEIVE_ENDOFF; - return; - } - /* Receive end offset response */ - if (server.repl_rdb_channel_state == REPL_DUAL_CHANNEL_RECEIVE_ENDOFF) { - uint64_t rdb_client_id; - err = receiveSynchronousResponse(conn); - if (err == NULL) goto error; - if (err[0] == '\0') { - /* Retry again later */ - serverLog(LL_DEBUG, "Received empty $ENDOFF response"); - sdsfree(err); - return; - } - long long reploffset; - char primary_replid[CONFIG_RUN_ID_SIZE + 1]; - int dbid; - /* Parse end offset response */ - char *endoff_format = "$ENDOFF:%lld %40s %d %llu"; - if (sscanf(err, endoff_format, &reploffset, primary_replid, &dbid, &rdb_client_id) != 4) { - serverLog(LL_WARNING, "Received unexpected $ENDOFF response: %s", err); - goto error; - } +error: + if (err) { + serverLog(LL_WARNING, "Dual channel sync failed with error %s", err); sdsfree(err); - server.rdb_client_id = rdb_client_id; - server.primary_initial_offset = reploffset; - - /* Initiate repl_provisional_primary to act as this replica temp primary until RDB is loaded */ - server.repl_provisional_primary.conn = server.repl_transfer_s; - memcpy(server.repl_provisional_primary.replid, primary_replid, CONFIG_RUN_ID_SIZE); - server.repl_provisional_primary.reploff = reploffset; - server.repl_provisional_primary.read_reploff = reploffset; - server.repl_provisional_primary.dbid = dbid; - - /* Now that we have the snapshot end-offset, we can ask for psync from that offset. 
Prepare the - * main connection accordingly.*/ - server.repl_transfer_s->state = CONN_STATE_CONNECTED; - server.repl_state = REPL_STATE_SEND_HANDSHAKE; - serverAssert(connSetReadHandler(server.repl_transfer_s, setupMainConnForPsync) != C_ERR); - setupMainConnForPsync(server.repl_transfer_s); - - /* As the next block we will receive using this connection is the rdb, we need to prepare - * the connection accordingly */ - serverAssert(connSetReadHandler(server.repl_rdb_transfer_s, readSyncBulkPayload) != C_ERR); - server.repl_transfer_size = -1; - server.repl_transfer_read = 0; - server.repl_transfer_last_fsync_off = 0; - server.repl_transfer_lastio = server.unixtime; - - server.repl_rdb_channel_state = REPL_DUAL_CHANNEL_RDB_LOAD; - return; } - -error: - sdsfree(err); if (server.repl_transfer_s) { connClose(server.repl_transfer_s); server.repl_transfer_s = NULL; @@ -2751,7 +2776,6 @@ static void fullSyncWithPrimary(connection *conn) { server.repl_transfer_fd = -1; server.repl_state = REPL_STATE_CONNECT; replicationAbortDualChannelSyncTransfer(); - return; } /* Replication: Replica side. @@ -2920,24 +2944,23 @@ void dualChannelSyncSuccess(void) { /* Replication: Replica side. * Main channel successfully established psync with primary. Check whether the rdb channel * has completed its part and act accordingly. */ -void dualChannelSyncHandlePsync(void) { +int dualChannelSyncHandlePsync(void) { serverAssert(server.repl_state == REPL_STATE_RECEIVE_PSYNC_REPLY); if (server.repl_rdb_channel_state < REPL_DUAL_CHANNEL_RDB_LOADED) { /* RDB is still loading */ if (connSetReadHandler(server.repl_provisional_primary.conn, bufferReplData) == C_ERR) { serverLog(LL_WARNING, "Error while setting readable handler: %s", strerror(errno)); cancelReplicationHandshake(1); - return; + return C_ERR; } replDataBufInit(); - server.repl_state = REPL_STATE_TRANSFER; - return; + return C_OK; } serverAssert(server.repl_rdb_channel_state == REPL_DUAL_CHANNEL_RDB_LOADED); /* RDB is loaded */ serverLog(LL_DEBUG, "Dual channel sync - psync established after rdb load"); dualChannelSyncSuccess(); - return; + return C_OK; } /* Replication: Replica side. @@ -3195,46 +3218,54 @@ int replicaTryPartialResynchronization(connection *conn, int read_reply) { return PSYNC_NOT_SUPPORTED; } -/* Replication: Replica side. - * This connection handler fires after rdb-connection was initialized. We use it - * to adjust the replica main for loading incremental changes into the local buffer. 
*/ -void setupMainConnForPsync(connection *conn) { - int psync_result = -1; - char llstr[LONG_STR_SIZE]; - char *err = NULL; - if (server.repl_state == REPL_STATE_SEND_HANDSHAKE) { - /* We already have an initialized connection at primary side, we only need to associate it with RDB connection */ - ull2string(llstr, sizeof(llstr), server.rdb_client_id); - err = sendCommand(conn, "REPLCONF", "set-rdb-client-id", llstr, NULL); - if (err) goto error; - server.repl_state = REPL_STATE_RECEIVE_CAPA_REPLY; - sdsfree(err); - return; + +sds getTryPsyncString(int result) { + switch (result) { + case PSYNC_WRITE_ERROR: return sdsnew("PSYNC_WRITE_ERROR"); + case PSYNC_WAIT_REPLY: return sdsnew("PSYNC_WAIT_REPLY"); + case PSYNC_CONTINUE: return sdsnew("PSYNC_CONTINUE"); + case PSYNC_FULLRESYNC: return sdsnew("PSYNC_FULLRESYNC"); + case PSYNC_NOT_SUPPORTED: return sdsnew("PSYNC_NOT_SUPPORTED"); + case PSYNC_TRY_LATER: return sdsnew("PSYNC_TRY_LATER"); + case PSYNC_FULLRESYNC_DUAL_CHANNEL: return sdsnew("PSYNC_FULLRESYNC_DUAL_CHANNEL"); + default: return sdsnew("Unknown result"); } +} - if (server.repl_state == REPL_STATE_RECEIVE_CAPA_REPLY) { - err = receiveSynchronousResponse(conn); - if (err == NULL) goto error; - if (err[0] == '-') { - serverLog(LL_NOTICE, "Primary does not understand REPLCONF identify: %s", err); - goto error; - } - sdsfree(err); - err = NULL; - server.repl_state = REPL_STATE_SEND_PSYNC; +int dualChannelReplMainConnSendHandshake(connection *conn, sds *err) { + char llstr[LONG_STR_SIZE]; + ull2string(llstr, sizeof(llstr), server.rdb_client_id); + *err = sendCommand(conn, "REPLCONF", "set-rdb-client-id", llstr, NULL); + if (*err) return C_ERR; + server.repl_state = REPL_STATE_RECEIVE_CAPA_REPLY; + return C_OK; +} + +int dualChannelReplMainConnRecvCapaReply(connection *conn, sds *err) { + *err = receiveSynchronousResponse(conn); + if (*err == NULL) return C_ERR; + if ((*err)[0] == '-') { + serverLog(LL_NOTICE, "Primary does not understand REPLCONF identify: %s", *err); + return C_ERR; } + server.repl_state = REPL_STATE_SEND_PSYNC; + return C_OK; +} - if (server.repl_state == REPL_STATE_SEND_PSYNC) { - if (server.debug_pause_after_fork) debugPauseProcess(); - if (replicaTryPartialResynchronization(conn, 0) == PSYNC_WRITE_ERROR) { - serverLog(LL_WARNING, "Aborting dual channel sync. Write error."); - cancelReplicationHandshake(1); - } - server.repl_state = REPL_STATE_RECEIVE_PSYNC_REPLY; - return; +int dualChannelReplMainConnSendPsync(connection *conn, sds *err) { + if (server.debug_pause_after_fork) debugPauseProcess(); + if (replicaTryPartialResynchronization(conn, 0) == PSYNC_WRITE_ERROR) { + serverLog(LL_WARNING, "Aborting dual channel sync. Write error."); + *err = sdsnew(connGetLastError(conn)); + return C_ERR; } - psync_result = replicaTryPartialResynchronization(conn, 1); - if (psync_result == PSYNC_WAIT_REPLY) return; /* Try again later... */ + server.repl_state = REPL_STATE_RECEIVE_PSYNC_REPLY; + return C_OK; +} + +int dualChannelReplMainConnRecvPsyncReply(connection *conn, sds *err) { + int psync_result = replicaTryPartialResynchronization(conn, 1); + if (psync_result == PSYNC_WAIT_REPLY) return C_OK; /* Try again later... 
*/ if (psync_result == PSYNC_CONTINUE) { serverLog(LL_NOTICE, "PRIMARY <-> REPLICA sync: Primary accepted a Partial Resynchronization%s", @@ -3244,15 +3275,52 @@ void setupMainConnForPsync(connection *conn) { "accept connections in read-write mode.\n"); } dualChannelSyncHandlePsync(); - return; + return C_OK; } + *err = getTryPsyncString(psync_result); + return C_ERR; +} -error: +/* Replication: Replica side. + * This connection handler fires after rdb-connection was initialized. We use it + * to adjust the replica main for loading incremental changes into the local buffer. */ +void dualChannelSetupMainConnForPsync(connection *conn) { + char *err = NULL; + int ret; + + switch (server.repl_state) { + case REPL_STATE_SEND_HANDSHAKE: + ret = dualChannelReplMainConnSendHandshake(conn, &err); + if (ret == C_OK) server.repl_state = REPL_STATE_RECEIVE_CAPA_REPLY; + break; + case REPL_STATE_RECEIVE_CAPA_REPLY: + ret = dualChannelReplMainConnRecvCapaReply(conn, &err); + if (ret == C_ERR) { + break; + } + if (ret == C_OK) server.repl_state = REPL_STATE_SEND_PSYNC; + sdsfree(err); + err = NULL; + /* fall through */ + case REPL_STATE_SEND_PSYNC: + ret = dualChannelReplMainConnSendPsync(conn, &err); + if (ret == C_OK) server.repl_state = REPL_STATE_RECEIVE_PSYNC_REPLY; + break; + case REPL_STATE_RECEIVE_PSYNC_REPLY: + ret = dualChannelReplMainConnRecvPsyncReply(conn, &err); + if (ret == C_OK && server.repl_rdb_channel_state != REPL_DUAL_CHANNEL_STATE_NONE) + server.repl_state = REPL_STATE_TRANSFER; + /* In case the RDB is already loaded, the repl_state will be set during establishPrimaryConnection. */ + break; + default: + serverPanic("Unexpected replication state: %d", server.repl_state); + } + + if (ret == C_ERR) { + serverLog(LL_WARNING, "Aborting dual channel sync. Main channel psync result %d %s", ret, err ? err : ""); + cancelReplicationHandshake(1); + } sdsfree(err); - /* The dual-channel sync session must be aborted for any psync_result other than PSYNC_CONTINUE or PSYNC_WAIT_REPLY. - */ - serverLog(LL_WARNING, "Aborting dual channel sync. Main channel psync result %d", psync_result); - cancelReplicationHandshake(1); } /* @@ -3625,7 +3693,7 @@ void syncWithPrimary(connection *conn) { /* Create RDB connection */ server.repl_rdb_transfer_s = connCreate(connTypeOfReplication()); if (connConnect(server.repl_rdb_transfer_s, server.primary_host, server.primary_port, server.bind_source_addr, - fullSyncWithPrimary) == C_ERR) { + dualChannelFullSyncWithPrimary) == C_ERR) { serverLog(LL_WARNING, "Unable to connect to Primary: %s", connGetLastError(server.repl_transfer_s)); connClose(server.repl_rdb_transfer_s); server.repl_rdb_transfer_s = NULL; diff --git a/src/server.h b/src/server.h index 4cc93ed8aa..4fad8d2508 100644 --- a/src/server.h +++ b/src/server.h @@ -110,6 +110,7 @@ struct hdr_histogram; /* Error codes */ #define C_OK 0 #define C_ERR -1 +#define C_RETRY -2 /* Static server configuration */ #define CONFIG_DEFAULT_HZ 10 /* Time interrupt calls/sec. */ From 1f06a1cb3ae6f941d3694935148060c7c6f3520b Mon Sep 17 00:00:00 2001 From: Shivshankar Date: Tue, 15 Oct 2024 17:03:27 -0400 Subject: [PATCH 44/51] Remove 'posting in the mailing list' in CONTRIBUTING.md (#1174) Remove reference to "the mailing list". We don't have a mailing list. 
--- CONTRIBUTING.md | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index cc807a79b6..5e2a712133 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -79,10 +79,9 @@ you need to ensure that the contribution is in accordance with the DCO. 1. If it is a major feature or a semantical change, please don't start coding straight away: if your feature is not a conceptual fit you'll lose a lot of -time writing the code without any reason. Start by posting in the mailing list -and creating an issue at Github with the description of, exactly, what you want -to accomplish and why. Use cases are important for features to be accepted. -Here you can see if there is consensus about your idea. +time writing the code without any reason. Start by creating an issue at Github with the +description of, exactly, what you want to accomplish and why. Use cases are important for +features to be accepted. Here you can see if there is consensus about your idea. 2. If in step 1 you get an acknowledgment from the project leaders, use the following procedure to submit a patch: From 2fe71005dedfd6420b0a8b7b571b5cb73841f626 Mon Sep 17 00:00:00 2001 From: zarkash-aws Date: Wed, 16 Oct 2024 00:18:58 +0200 Subject: [PATCH 45/51] Improved hashing algorithm in luaS_newlstr (#1168) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit **Overview** This PR introduces the use of [MurmurHash3](https://en.wikipedia.org/wiki/MurmurHash) as the hashing function for Lua's luaS_newlstr function, replacing the previous simple hash function. The change aims to improve performance, particularly for large strings. **Changes** Implemented MurmurHash3 algorithm in lstring.c Updated luaS_newlstr to use MurmurHash3 for string hashing **Performance Testing:** Test Setup: 1. Ran a valkey server 2. Loaded 1000 keys with large values (100KB each) to the server using a Lua script ``` local numKeys = 1000 for i = 1, numKeys do local key = "large_key_" .. i local largeValue = string.rep("x", 1024*100) redis.call("SET", key, largeValue) end ``` 3. Used a Lua script to randomly select and retrieve keys ``` local randomKey = redis.call("RANDOMKEY") local result = redis.call("GET", randomKey) ``` 4. Benchmarked using valkey-benchmark: `./valkey-benchmark -n 100000 evalsha c157a37967e69569339a39a953c046fc2ecb4258 0` Results: A | Unstable | This PR | Change -- | -- | -- | -- Throughput | 6,835.74 requests per second | 17,061.94 requests per second | **+150% increase** Avg Latency | 7.218 ms | 2.838 ms | **-61% decrease** Min Latency | 3.144 ms | 1.320 ms | **-58% decrease** P50 Latency | 8.463 ms | 3.167 ms | **-63% decrease** P95 Latency | 8.863 ms | 3.527 ms | **-60% decrease** P99 Latency | 9.063 ms | 3.663 ms | **-60% decrease** Max Latency | 63.871 ms | 55.327 ms | **-13% decrease** Summary: * Throughput: Improved by 150%. * Latency: Significant reductions in average, minimum, and percentile latencies (P50, P95, P99), leading to much faster response times. * Max Latency: Slightly decreased by 13%, indicating fewer outlier delays after the fix. 
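For context on where the speedup for large strings comes from: the removed
loop (visible in the diff below) mixed in one byte of the string per
iteration, while murmur32 consumes the input in 32-bit blocks. Both lines
here are taken from the diff; the new call site seeds the hash with the
string length:

```
/* Old (removed): one byte of str per loop iteration. */
h = h ^ ((h<<5)+(h>>2)+cast(unsigned char, str[l1-1]));

/* New: whole-string MurmurHash3, seeded with the length. */
unsigned int h = murmur32((uint8_t *)str, l, (uint32_t)l);
```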
--------- Signed-off-by: Shai Zarka Signed-off-by: zarkash-aws Signed-off-by: Madelyn Olson Co-authored-by: Madelyn Olson Co-authored-by: Viktor Söderqvist --- deps/README.md | 1 + deps/lua/src/lstring.c | 52 ++++++++++++++++++++++++++++++++++++++---- 2 files changed, 48 insertions(+), 5 deletions(-) diff --git a/deps/README.md b/deps/README.md index 8a04f04b00..b918b47456 100644 --- a/deps/README.md +++ b/deps/README.md @@ -94,6 +94,7 @@ and our version: 1. Makefile is modified to allow a different compiler than GCC. 2. We have the implementation source code, and directly link to the following external libraries: `lua_cjson.o`, `lua_struct.o`, `lua_cmsgpack.o` and `lua_bit.o`. 3. There is a security fix in `ldo.c`, line 498: The check for `LUA_SIGNATURE[0]` is removed in order to avoid direct bytecode execution. +4. In `lstring.c`, the luaS_newlstr function's hash calculation has been upgraded from a simple hash function to MurmurHash3, implemented within the same file, to enhance performance, particularly for operations involving large strings. Hdr_Histogram --- diff --git a/deps/lua/src/lstring.c b/deps/lua/src/lstring.c index 6a825f7865..043a7867c0 100644 --- a/deps/lua/src/lstring.c +++ b/deps/lua/src/lstring.c @@ -6,6 +6,7 @@ #include +#include #define lstring_c #define LUA_CORE @@ -71,14 +72,55 @@ static TString *newlstr (lua_State *L, const char *str, size_t l, return ts; } +uint32_t murmur32(const uint8_t* key, size_t len, uint32_t seed) { + static const uint32_t c1 = 0xcc9e2d51; + static const uint32_t c2 = 0x1b873593; + static const uint32_t r1 = 15; + static const uint32_t r2 = 13; + static const uint32_t m = 5; + static const uint32_t n = 0xe6546b64; + uint32_t hash = seed; + + const int nblocks = len / 4; + const uint32_t* blocks = (const uint32_t*) key; + for (int i = 0; i < nblocks; i++) { + uint32_t k = blocks[i]; + k *= c1; + k = (k << r1) | (k >> (32 - r1)); + k *= c2; + + hash ^= k; + hash = ((hash << r2) | (hash >> (32 - r2))) * m + n; + } + + const uint8_t* tail = (const uint8_t*) (key + nblocks * 4); + uint32_t k1 = 0; + switch (len & 3) { + case 3: + k1 ^= tail[2] << 16; + case 2: + k1 ^= tail[1] << 8; + case 1: + k1 ^= tail[0]; + k1 *= c1; + k1 = (k1 << r1) | (k1 >> (32 - r1)); + k1 *= c2; + hash ^= k1; + } + + hash ^= len; + hash ^= (hash >> 16); + hash *= 0x85ebca6b; + hash ^= (hash >> 13); + hash *= 0xc2b2ae35; + hash ^= (hash >> 16); + + return hash; + } TString *luaS_newlstr (lua_State *L, const char *str, size_t l) { GCObject *o; - unsigned int h = cast(unsigned int, l); /* seed */ - size_t step = 1; - size_t l1; - for (l1=l; l1>=step; l1-=step) /* compute hash */ - h = h ^ ((h<<5)+(h>>2)+cast(unsigned char, str[l1-1])); + unsigned int h = murmur32((uint8_t *)str, l, (uint32_t)l); for (o = G(L)->strt.hash[lmod(h, G(L)->strt.size)]; o != NULL; o = o->gch.next) { From 5d8921189345c239288ebaf3952362e2d16959e6 Mon Sep 17 00:00:00 2001 From: Nadav Levanoni <38641521+nadav-levanoni@users.noreply.github.com> Date: Wed, 16 Oct 2024 17:40:11 -0700 Subject: [PATCH 46/51] Add 'WithDictIndex' expiry API and update RANDOMKEY command (#1155) https://github.com/valkey-io/valkey/issues/1145 First part of a two-step effort to add `WithSlot` API for expiry. This PR is to fix a crash that occurs when a RANDOMKEY uses a different slot than the cached slot of a client during a multi-exec. The next part will be to utilize the new API as an optimization to prevent duplicate work when calculating the slot for a key. 
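A condensed sketch of the fixed flow in dbRandomKey (taken from the diff
below): the random dict index is chosen once and then reused for both the
expires lookup and the expiry check, instead of being re-derived from the
key (which could land on a different slot during a multi-exec):

```
int randomDictIndex = kvstoreGetFairRandomDictIndex(db->keys);
de = kvstoreDictGetFairRandomKey(db->keys, randomDictIndex);
/* ... key/keyobj are derived from de as before ... */
if (dbFindExpiresWithDictIndex(db, key, randomDictIndex)) {
    if (expireIfNeededWithDictIndex(db, keyobj, 0, randomDictIndex) != KEY_VALID) {
        decrRefCount(keyobj);
        continue; /* search for another key. This expired. */
    }
}
```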
--------- Signed-off-by: Nadav Levanoni Signed-off-by: Madelyn Olson Co-authored-by: Nadav Levanoni Co-authored-by: Madelyn Olson --- src/db.c | 141 +++++++++++++++++++++++++++---------------- tests/unit/multi.tcl | 12 ++++ 2 files changed, 100 insertions(+), 53 deletions(-) diff --git a/src/db.c b/src/db.c index 3493e2d863..00e6e7b2d6 100644 --- a/src/db.c +++ b/src/db.c @@ -52,10 +52,13 @@ typedef enum { KEY_DELETED /* The key was deleted now. */ } keyStatus; +keyStatus expireIfNeededWithDictIndex(serverDb *db, robj *key, int flags, int dict_index); keyStatus expireIfNeeded(serverDb *db, robj *key, int flags); +int keyIsExpiredWithDictIndex(serverDb *db, robj *key, int dict_index); int keyIsExpired(serverDb *db, robj *key); static void dbSetValue(serverDb *db, robj *key, robj *val, int overwrite, dictEntry *de); static int getKVStoreIndexForKey(sds key); +dictEntry *dbFindExpiresWithDictIndex(serverDb *db, void *key, int dict_index); /* Update LFU when an object is accessed. * Firstly, decrement the counter if the decrement time is reached. @@ -269,7 +272,7 @@ int getKeySlot(sds key) { * In this case a copy of `key` is copied in kvstore, the caller must ensure the `key` is properly freed. */ int dbAddRDBLoad(serverDb *db, sds key, robj *val) { - int dict_index = server.cluster_enabled ? getKeySlot(key) : 0; + int dict_index = getKVStoreIndexForKey(key); dictEntry *de = kvstoreDictAddRaw(db->keys, dict_index, key, NULL); if (de == NULL) return 0; initObjectLRUOrLFU(val); @@ -375,13 +378,13 @@ robj *dbRandomKey(serverDb *db) { while (1) { sds key; robj *keyobj; - int randomSlot = kvstoreGetFairRandomDictIndex(db->keys); - de = kvstoreDictGetFairRandomKey(db->keys, randomSlot); + int randomDictIndex = kvstoreGetFairRandomDictIndex(db->keys); + de = kvstoreDictGetFairRandomKey(db->keys, randomDictIndex); if (de == NULL) return NULL; key = dictGetKey(de); keyobj = createStringObject(key, sdslen(key)); - if (dbFindExpires(db, key)) { + if (dbFindExpiresWithDictIndex(db, key, randomDictIndex)) { if (allvolatile && server.primary_host && --maxtries == 0) { /* If the DB is composed only of keys with an expire set, * it could happen that all the keys are already logically @@ -393,7 +396,7 @@ robj *dbRandomKey(serverDb *db) { * return a key name that may be already expired. */ return keyobj; } - if (expireIfNeeded(db, keyobj, 0) != KEY_VALID) { + if (expireIfNeededWithDictIndex(db, keyobj, 0, randomDictIndex) != KEY_VALID) { decrRefCount(keyobj); continue; /* search for another key. This expired. */ } @@ -402,11 +405,9 @@ robj *dbRandomKey(serverDb *db) { } } -/* Helper for sync and async delete. */ -int dbGenericDelete(serverDb *db, robj *key, int async, int flags) { +int dbGenericDeleteWithDictIndex(serverDb *db, robj *key, int async, int flags, int dict_index) { dictEntry **plink; int table; - int dict_index = getKVStoreIndexForKey(key->ptr); dictEntry *de = kvstoreDictTwoPhaseUnlinkFind(db->keys, dict_index, key->ptr, &plink, &table); if (de) { robj *val = dictGetVal(de); @@ -436,6 +437,12 @@ int dbGenericDelete(serverDb *db, robj *key, int async, int flags) { } } +/* Helper for sync and async delete. 
*/ +int dbGenericDelete(serverDb *db, robj *key, int async, int flags) { + int dict_index = getKVStoreIndexForKey(key->ptr); + return dbGenericDeleteWithDictIndex(db, key, async, flags, dict_index); +} + /* Delete a key, value, and associated expiration entry if any, from the DB */ int dbSyncDelete(serverDb *db, robj *key) { return dbGenericDelete(db, key, 0, DB_FLAG_KEY_DELETED); @@ -1693,19 +1700,25 @@ void setExpire(client *c, serverDb *db, robj *key, long long when) { /* Return the expire time of the specified key, or -1 if no expire * is associated with this key (i.e. the key is non volatile) */ -long long getExpire(serverDb *db, robj *key) { +long long getExpireWithDictIndex(serverDb *db, robj *key, int dict_index) { dictEntry *de; - if ((de = dbFindExpires(db, key->ptr)) == NULL) return -1; + if ((de = dbFindExpiresWithDictIndex(db, key->ptr, dict_index)) == NULL) return -1; return dictGetSignedIntegerVal(de); } -/* Delete the specified expired key and propagate expire. */ -void deleteExpiredKeyAndPropagate(serverDb *db, robj *keyobj) { +/* Return the expire time of the specified key, or -1 if no expire + * is associated with this key (i.e. the key is non volatile) */ +long long getExpire(serverDb *db, robj *key) { + int dict_index = getKVStoreIndexForKey(key->ptr); + return getExpireWithDictIndex(db, key, dict_index); +} + +void deleteExpiredKeyAndPropagateWithDictIndex(serverDb *db, robj *keyobj, int dict_index) { mstime_t expire_latency; latencyStartMonitor(expire_latency); - dbGenericDelete(db, keyobj, server.lazyfree_lazy_expire, DB_FLAG_KEY_EXPIRED); + dbGenericDeleteWithDictIndex(db, keyobj, server.lazyfree_lazy_expire, DB_FLAG_KEY_EXPIRED, dict_index); latencyEndMonitor(expire_latency); latencyAddSampleIfNeeded("expire-del", expire_latency); notifyKeyspaceEvent(NOTIFY_EXPIRED, "expired", keyobj, db->id); @@ -1714,6 +1727,12 @@ void deleteExpiredKeyAndPropagate(serverDb *db, robj *keyobj) { server.stat_expiredkeys++; } +/* Delete the specified expired key and propagate expire. */ +void deleteExpiredKeyAndPropagate(serverDb *db, robj *keyobj) { + int dict_index = getKVStoreIndexForKey(keyobj->ptr); + deleteExpiredKeyAndPropagateWithDictIndex(db, keyobj, dict_index); +} + /* Delete the specified expired key from overwriting and propagate the DEL or UNLINK. */ void deleteExpiredKeyFromOverwriteAndPropagate(client *c, robj *keyobj) { int deleted = dbGenericDelete(c->db, keyobj, server.lazyfree_lazy_expire, DB_FLAG_KEY_EXPIRED); @@ -1765,12 +1784,11 @@ void propagateDeletion(serverDb *db, robj *key, int lazy) { decrRefCount(argv[1]); } -/* Check if the key is expired. */ -int keyIsExpired(serverDb *db, robj *key) { +int keyIsExpiredWithDictIndex(serverDb *db, robj *key, int dict_index) { /* Don't expire anything while loading. It will be done later. */ if (server.loading) return 0; - mstime_t when = getExpire(db, key); + mstime_t when = getExpireWithDictIndex(db, key, dict_index); mstime_t now; if (when < 0) return 0; /* No expire for this key */ @@ -1782,39 +1800,15 @@ int keyIsExpired(serverDb *db, robj *key) { return now > when; } -/* This function is called when we are going to perform some operation - * in a given key, but such key may be already logically expired even if - * it still exists in the database. The main way this function is called - * is via lookupKey*() family of functions. - * - * The behavior of the function depends on the replication role of the - * instance, because by default replicas do not delete expired keys. 
They - * wait for DELs from the primary for consistency matters. However even - * replicas will try to have a coherent return value for the function, - * so that read commands executed in the replica side will be able to - * behave like if the key is expired even if still present (because the - * primary has yet to propagate the DEL). - * - * In primary as a side effect of finding a key which is expired, such - * key will be evicted from the database. Also this may trigger the - * propagation of a DEL/UNLINK command in AOF / replication stream. - * - * On replicas, this function does not delete expired keys by default, but - * it still returns KEY_EXPIRED if the key is logically expired. To force deletion - * of logically expired keys even on replicas, use the EXPIRE_FORCE_DELETE_EXPIRED - * flag. Note though that if the current client is executing - * replicated commands from the primary, keys are never considered expired. - * - * On the other hand, if you just want expiration check, but need to avoid - * the actual key deletion and propagation of the deletion, use the - * EXPIRE_AVOID_DELETE_EXPIRED flag. - * - * The return value of the function is KEY_VALID if the key is still valid. - * The function returns KEY_EXPIRED if the key is expired BUT not deleted, - * or returns KEY_DELETED if the key is expired and deleted. */ -keyStatus expireIfNeeded(serverDb *db, robj *key, int flags) { +/* Check if the key is expired. */ +int keyIsExpired(serverDb *db, robj *key) { + int dict_index = getKVStoreIndexForKey(key->ptr); + return keyIsExpiredWithDictIndex(db, key, dict_index); +} + +keyStatus expireIfNeededWithDictIndex(serverDb *db, robj *key, int flags, int dict_index) { if (server.lazy_expire_disabled) return KEY_VALID; - if (!keyIsExpired(db, key)) return KEY_VALID; + if (!keyIsExpiredWithDictIndex(db, key, dict_index)) return KEY_VALID; /* If we are running in the context of a replica, instead of * evicting the expired key from the database, we return ASAP: @@ -1849,13 +1843,48 @@ keyStatus expireIfNeeded(serverDb *db, robj *key, int flags) { key = createStringObject(key->ptr, sdslen(key->ptr)); } /* Delete the key */ - deleteExpiredKeyAndPropagate(db, key); + deleteExpiredKeyAndPropagateWithDictIndex(db, key, dict_index); if (static_key) { decrRefCount(key); } return KEY_DELETED; } +/* This function is called when we are going to perform some operation + * in a given key, but such key may be already logically expired even if + * it still exists in the database. The main way this function is called + * is via lookupKey*() family of functions. + * + * The behavior of the function depends on the replication role of the + * instance, because by default replicas do not delete expired keys. They + * wait for DELs from the primary for consistency matters. However even + * replicas will try to have a coherent return value for the function, + * so that read commands executed in the replica side will be able to + * behave like if the key is expired even if still present (because the + * primary has yet to propagate the DEL). + * + * In primary as a side effect of finding a key which is expired, such + * key will be evicted from the database. Also this may trigger the + * propagation of a DEL/UNLINK command in AOF / replication stream. + * + * On replicas, this function does not delete expired keys by default, but + * it still returns KEY_EXPIRED if the key is logically expired. To force deletion + * of logically expired keys even on replicas, use the EXPIRE_FORCE_DELETE_EXPIRED + * flag. 
Note though that if the current client is executing
+ * replicated commands from the primary, keys are never considered expired.
+ *
+ * On the other hand, if you just want expiration check, but need to avoid
+ * the actual key deletion and propagation of the deletion, use the
+ * EXPIRE_AVOID_DELETE_EXPIRED flag.
+ *
+ * The return value of the function is KEY_VALID if the key is still valid.
+ * The function returns KEY_EXPIRED if the key is expired BUT not deleted,
+ * or returns KEY_DELETED if the key is expired and deleted. */
+keyStatus expireIfNeeded(serverDb *db, robj *key, int flags) {
+    int dict_index = getKVStoreIndexForKey(key->ptr);
+    return expireIfNeededWithDictIndex(db, key, flags, dict_index);
+}
+
 /* CB passed to kvstoreExpand.
 * The purpose is to skip expansion of unused dicts in cluster mode (all
 * dicts not mapped to *my* slots) */
@@ -1897,16 +1926,22 @@ int dbExpandExpires(serverDb *db, uint64_t db_size, int try_expand) {
     return dbExpandGeneric(db->expires, db_size, try_expand);
 }

-static dictEntry *dbFindGeneric(kvstore *kvs, void *key) {
-    return kvstoreDictFind(kvs, server.cluster_enabled ? getKeySlot(key) : 0, key);
+dictEntry *dbFindWithDictIndex(serverDb *db, void *key, int dict_index) {
+    return kvstoreDictFind(db->keys, dict_index, key);
 }

 dictEntry *dbFind(serverDb *db, void *key) {
-    return dbFindGeneric(db->keys, key);
+    int dict_index = getKVStoreIndexForKey(key);
+    return dbFindWithDictIndex(db, key, dict_index);
+}
+
+dictEntry *dbFindExpiresWithDictIndex(serverDb *db, void *key, int dict_index) {
+    return kvstoreDictFind(db->expires, dict_index, key);
 }

 dictEntry *dbFindExpires(serverDb *db, void *key) {
-    return dbFindGeneric(db->expires, key);
+    int dict_index = getKVStoreIndexForKey(key);
+    return dbFindExpiresWithDictIndex(db, key, dict_index);
 }

 unsigned long long dbSize(serverDb *db) {
diff --git a/tests/unit/multi.tcl b/tests/unit/multi.tcl
index dafbc66c10..ffdaa3edd4 100644
--- a/tests/unit/multi.tcl
+++ b/tests/unit/multi.tcl
@@ -919,3 +919,15 @@ start_server {overrides {appendonly {yes} appendfilename {appendonly.aof} append
         r get foo
     } {}
 }
+
+start_cluster 1 0 {tags {"external:skip cluster"}} {
+    test "Regression test for multi-exec with RANDOMKEY accessing the wrong per-slot dictionary" {
+        R 0 SETEX FOO 10000 BAR
+        R 0 SETEX FIZZ 10000 BUZZ
+
+        R 0 MULTI
+        R 0 DEL FOO
+        R 0 RANDOMKEY
+        assert_equal [R 0 EXEC] {1 FIZZ}
+    }
+}

From ebe09434864c2f14ec1638918ff9c6c1a9158d09 Mon Sep 17 00:00:00 2001
From: Binbin
Date: Thu, 17 Oct 2024 13:33:44 +0800
Subject: [PATCH 47/51] Remove the restriction that cli --cluster create
 requires at least 3 primary nodes (#1075)

There is no limitation in Valkey to create a cluster with 1 or 2
primaries, only that it cannot do automatic failover. Remove this
restriction and add an `are you sure` prompt to warn the user.

This allows us to use it to create a test cluster via the CLI or via
create-cluster.
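For illustration, creating a single-node test cluster now looks roughly
like this (output reconstructed from the strings in the diff below; the
exact wording of the confirmWithYes prompt is an assumption):

```
$ valkey-cli --cluster create 127.0.0.1:6379
Requested to create a cluster with 1 primaries and 0 replicas per primary.
Valkey cluster requires at least 3 primary nodes for automatic failover. Are you sure? (type 'yes' to accept): yes
>>> Performing hash slots allocation on 1 node(s)...
```

Passing `--cluster-yes` (as the new tests do) answers the prompt
automatically.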
Signed-off-by: Binbin
---
 src/valkey-cli.c           | 16 +++++++++-------
 tests/unit/cluster/cli.tcl | 27 +++++++++++++++++++++++++++
 2 files changed, 36 insertions(+), 7 deletions(-)

diff --git a/src/valkey-cli.c b/src/valkey-cli.c
index 2af3c4c352..b4a7fcaf91 100644
--- a/src/valkey-cli.c
+++ b/src/valkey-cli.c
@@ -6686,15 +6686,17 @@ static int clusterManagerCommandCreate(int argc, char **argv) {
     int replicas = config.cluster_manager_command.replicas;
     int primaries_count = CLUSTER_MANAGER_PRIMARIES_COUNT(node_len, replicas);
     if (primaries_count < 3) {
-        clusterManagerLogErr("*** ERROR: Invalid configuration for cluster creation.\n"
-                             "*** Valkey Cluster requires at least 3 primary nodes.\n"
-                             "*** This is not possible with %d nodes and %d replicas per node.",
-                             node_len, replicas);
-        clusterManagerLogErr("\n*** At least %d nodes are required.\n", 3 * (replicas + 1));
-        return 0;
+        int ignore_force = 0;
+        clusterManagerLogInfo("Requested to create a cluster with %d primaries and "
+                              "%d replicas per primary.\n",
+                              primaries_count, replicas);
+        if (!confirmWithYes("Valkey cluster requires at least 3 primary nodes for "
+                            "automatic failover. Are you sure?",
+                            ignore_force))
+            return 0;
     }
     clusterManagerLogInfo(">>> Performing hash slots allocation "
-                          "on %d nodes...\n",
+                          "on %d node(s)...\n",
                           node_len);
     int interleaved_len = 0, ip_count = 0;
     clusterManagerNode **interleaved = zcalloc(node_len * sizeof(**interleaved));
diff --git a/tests/unit/cluster/cli.tcl b/tests/unit/cluster/cli.tcl
index f5599f6c5a..3f1f0e9ffa 100644
--- a/tests/unit/cluster/cli.tcl
+++ b/tests/unit/cluster/cli.tcl
@@ -9,6 +9,33 @@ set ::singledb 1
 # cluster creation is complicated with TLS, and the current tests don't really need that coverage
 tags {tls:skip external:skip cluster} {

+set base_conf [list cluster-enabled yes cluster-node-timeout 1000]
+start_multiple_servers 3 [list overrides $base_conf] {
+    test {Create 1 node cluster} {
+        exec src/valkey-cli --cluster-yes --cluster create \
+            127.0.0.1:[srv 0 port]
+
+        wait_for_condition 1000 50 {
+            [CI 0 cluster_state] eq {ok}
+        } else {
+            fail "Cluster doesn't stabilize"
+        }
+    }
+
+    test {Create 2 node cluster} {
+        exec src/valkey-cli --cluster-yes --cluster create \
+            127.0.0.1:[srv -1 port] \
+            127.0.0.1:[srv -2 port]
+
+        wait_for_condition 1000 50 {
+            [CI 1 cluster_state] eq {ok} &&
+            [CI 2 cluster_state] eq {ok}
+        } else {
+            fail "Cluster doesn't stabilize"
+        }
+    }
+}
+
 # start three servers
 set base_conf [list cluster-enabled yes cluster-node-timeout 1000]
 start_multiple_servers 3 [list overrides $base_conf] {

From a4d48700ee0d4ff2eccf691f10f4328e9eb71240 Mon Sep 17 00:00:00 2001
From: Lipeng Zhu
Date: Thu, 17 Oct 2024 18:37:10 +0800
Subject: [PATCH 48/51] Fix false sharing issue between main thread and
 io-threads when accessing `used_memory_thread`. (#1179)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When profiling some workloads with `io-threads` enabled, we found that
the false sharing issue is heavy.

This patch tries to split the elements accessed by the main thread and
the io-threads into different cache lines by padding elements at the
head of the `used_memory_thread_padded` array. This design helps
mitigate the false sharing between the main thread and the io-threads,
because the main thread has been the bottleneck with io-threads
enabled. The reason we didn't put each element in an individual cache
line is that we don't want to bring the additional cache line fetch
operations (3 vs 16 cache lines) when calling functions like
`zmalloc_used_memory()`.
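As a sketch of the layout trick (a 64-byte cache line and an 8-byte
size_t are assumptions here, matching common 64-bit targets; the real
constants come from the tree):

```
/* 64 / sizeof(size_t) - 1 = 7 padding slots in front, so that
 * used_memory_thread[0] starts on its own cache line and the hot
 * io-thread counters do not share a line with other globals. */
#define CACHE_LINE_SIZE 64 /* assumption for this sketch */
#define PADDING_ELEMENT_NUM (CACHE_LINE_SIZE / sizeof(size_t) - 1)

static __attribute__((aligned(CACHE_LINE_SIZE)))
size_t used_memory_thread_padded[MAX_THREADS_NUM + PADDING_ELEMENT_NUM];
static size_t *used_memory_thread = &used_memory_thread_padded[PADDING_ELEMENT_NUM];
```

The counters themselves stay packed, so summing them in
`zmalloc_used_memory()` still touches only a few consecutive cache lines.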
---------

Signed-off-by: Lipeng Zhu
Signed-off-by: Lipeng Zhu
Signed-off-by: Viktor Söderqvist
Co-authored-by: Wangyang Guo
Co-authored-by: Viktor Söderqvist
---
 src/zmalloc.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/zmalloc.c b/src/zmalloc.c
index 1cb01ee88c..e18fa8bac2 100644
--- a/src/zmalloc.c
+++ b/src/zmalloc.c
@@ -90,6 +90,7 @@ void zlibc_free(void *ptr) {
 #define thread_local _Thread_local

+#define PADDING_ELEMENT_NUM (CACHE_LINE_SIZE / sizeof(size_t) - 1)
 #define MAX_THREADS_NUM (IO_THREADS_MAX_NUM + 3 + 1)
 /* A thread-local storage which keep the current thread's index in the used_memory_thread array. */
 static thread_local int thread_index = -1;
@@ -101,10 +102,11 @@
 * For the other architecture, lets fall back to the atomic operation to keep safe. */
 #if defined(__i386__) || defined(__x86_64__) || defined(__amd64__) || defined(__POWERPC__) || defined(__arm__) || \
     defined(__arm64__)
-static __attribute__((aligned(sizeof(size_t)))) size_t used_memory_thread[MAX_THREADS_NUM];
+static __attribute__((aligned(CACHE_LINE_SIZE))) size_t used_memory_thread_padded[MAX_THREADS_NUM + PADDING_ELEMENT_NUM];
 #else
-static _Atomic size_t used_memory_thread[MAX_THREADS_NUM];
+static __attribute__((aligned(CACHE_LINE_SIZE))) _Atomic size_t used_memory_thread_padded[MAX_THREADS_NUM + PADDING_ELEMENT_NUM];
 #endif
+static size_t *used_memory_thread = &used_memory_thread_padded[PADDING_ELEMENT_NUM];
 static atomic_int total_active_threads = 0;
 /* This is a simple protection. It's used only if some modules create a lot of threads. */
 static atomic_size_t used_memory_for_additional_threads = 0;

From e3d8f94ff45659dbe539228e98b90cc6c0db2a75 Mon Sep 17 00:00:00 2001
From: zhenwei pi
Date: Sat, 19 Oct 2024 08:48:18 +0800
Subject: [PATCH 49/51] Introduce connection context for Unix socket (#1160)

Hide 'unixsocketgroup' and 'unixsocketperm' in a Unix socket specific
data structure. A single opaque pointer 'void *priv' is enough for a
listener: when any new config is added, we don't need 'void *priv2',
'void *priv3' and so on.
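A minimal sketch of the resulting pattern (condensed from the diff below):
the listener carries one opaque pointer, and only the Unix connection type
knows its concrete shape, so future Unix-socket settings extend the struct
instead of connListener:

```
typedef struct serverUnixContextConfig {
    char *group;       /* UNIX socket group */
    unsigned int perm; /* UNIX socket permission (see mode_t) */
} serverUnixContextConfig;

/* At listener setup (server.c): */
listener->priv = &server.unix_ctx_config;

/* Inside the Unix connection type (unix.c): */
serverUnixContextConfig *ctx_cfg = listener->priv;
mode_t perm = ctx_cfg->perm;
char *group = ctx_cfg->group;
```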
Signed-off-by: zhenwei pi --- src/config.c | 4 ++-- src/connection.h | 3 +-- src/rdma.c | 8 ++++---- src/server.c | 3 +-- src/server.h | 12 ++++++++++-- src/unix.c | 7 ++++--- 6 files changed, 22 insertions(+), 15 deletions(-) diff --git a/src/config.c b/src/config.c index 663cf5da38..7bee87946d 100644 --- a/src/config.c +++ b/src/config.c @@ -3140,7 +3140,7 @@ standardConfig static_configs[] = { /* String Configs */ createStringConfig("aclfile", NULL, IMMUTABLE_CONFIG, ALLOW_EMPTY_STRING, server.acl_filename, "", NULL, NULL), createStringConfig("unixsocket", NULL, IMMUTABLE_CONFIG, EMPTY_STRING_IS_NULL, server.unixsocket, NULL, NULL, NULL), - createStringConfig("unixsocketgroup", NULL, IMMUTABLE_CONFIG, EMPTY_STRING_IS_NULL, server.unixsocketgroup, NULL, NULL, NULL), + createStringConfig("unixsocketgroup", NULL, IMMUTABLE_CONFIG, EMPTY_STRING_IS_NULL, server.unix_ctx_config.group, NULL, NULL, NULL), createStringConfig("pidfile", NULL, IMMUTABLE_CONFIG, EMPTY_STRING_IS_NULL, server.pidfile, NULL, NULL, NULL), createStringConfig("replica-announce-ip", "slave-announce-ip", MODIFIABLE_CONFIG, EMPTY_STRING_IS_NULL, server.replica_announce_ip, NULL, NULL, NULL), createStringConfig("primaryuser", "masteruser", MODIFIABLE_CONFIG | SENSITIVE_CONFIG, EMPTY_STRING_IS_NULL, server.primary_user, NULL, NULL, NULL), @@ -3234,7 +3234,7 @@ standardConfig static_configs[] = { /* Unsigned int configs */ createUIntConfig("maxclients", NULL, MODIFIABLE_CONFIG, 1, UINT_MAX, server.maxclients, 10000, INTEGER_CONFIG, NULL, updateMaxclients), - createUIntConfig("unixsocketperm", NULL, IMMUTABLE_CONFIG, 0, 0777, server.unixsocketperm, 0, OCTAL_CONFIG, NULL, NULL), + createUIntConfig("unixsocketperm", NULL, IMMUTABLE_CONFIG, 0, 0777, server.unix_ctx_config.perm, 0, OCTAL_CONFIG, NULL, NULL), createUIntConfig("socket-mark-id", NULL, IMMUTABLE_CONFIG, 0, UINT_MAX, server.socket_mark_id, 0, INTEGER_CONFIG, NULL, NULL), createUIntConfig("max-new-connections-per-cycle", NULL, MODIFIABLE_CONFIG, 1, 1000, server.max_new_conns_per_cycle, 10, INTEGER_CONFIG, NULL, NULL), createUIntConfig("max-new-tls-connections-per-cycle", NULL, MODIFIABLE_CONFIG, 1, 1000, server.max_new_tls_conns_per_cycle, 1, INTEGER_CONFIG, NULL, NULL), diff --git a/src/connection.h b/src/connection.h index 97d79e5655..0762441732 100644 --- a/src/connection.h +++ b/src/connection.h @@ -144,8 +144,7 @@ struct connListener { int bindaddr_count; int port; ConnectionType *ct; - void *priv1; /* used by connection type specified data */ - void *priv2; /* used by connection type specified data */ + void *priv; /* used by connection type specified data */ }; /* The connection module does not deal with listening and accepting sockets, diff --git a/src/rdma.c b/src/rdma.c index 6cbc5c70b5..dd6de395d0 100644 --- a/src/rdma.c +++ b/src/rdma.c @@ -748,7 +748,7 @@ static rdma_listener *rdmaFdToListener(connListener *listener, int fd) { for (int i = 0; i < listener->count; i++) { if (listener->fd[i] != fd) continue; - return (rdma_listener *)listener->priv1 + i; + return (rdma_listener *)listener->priv + i; } return NULL; @@ -1537,7 +1537,7 @@ int connRdmaListen(connListener *listener) { bindaddr = default_bindaddr; } - listener->priv1 = rdma_listener = zcalloc_num(bindaddr_count, sizeof(*rdma_listener)); + listener->priv = rdma_listener = zcalloc_num(bindaddr_count, sizeof(*rdma_listener)); for (j = 0; j < bindaddr_count; j++) { char *addr = bindaddr[j]; int optional = *addr == '-'; @@ -1757,13 +1757,13 @@ static int rdmaChangeListener(void) { 
aeDeleteFileEvent(server.el, listener->fd[i], AE_READABLE); listener->fd[i] = -1; - struct rdma_listener *rdma_listener = (struct rdma_listener *)listener->priv1 + i; + struct rdma_listener *rdma_listener = (struct rdma_listener *)listener->priv + i; rdma_destroy_id(rdma_listener->cm_id); rdma_destroy_event_channel(rdma_listener->cm_channel); } listener->count = 0; - zfree(listener->priv1); + zfree(listener->priv); closeListener(listener); diff --git a/src/server.c b/src/server.c index ab95f84346..531fb07b76 100644 --- a/src/server.c +++ b/src/server.c @@ -2818,8 +2818,7 @@ void initListeners(void) { listener->bindaddr = &server.unixsocket; listener->bindaddr_count = 1; listener->ct = connectionByType(CONN_TYPE_UNIX); - listener->priv1 = &server.unixsocketperm; /* Unix socket specified */ - listener->priv2 = server.unixsocketgroup; /* Unix socket group specified */ + listener->priv = &server.unix_ctx_config; /* Unix socket specified */ } /* create all the configured listener, and add handler to start to accept */ diff --git a/src/server.h b/src/server.h index 4fad8d2508..280c439323 100644 --- a/src/server.h +++ b/src/server.h @@ -1593,6 +1593,15 @@ typedef struct serverTLSContextConfig { int session_cache_timeout; } serverTLSContextConfig; +/*----------------------------------------------------------------------------- + * Unix Context Configuration + *----------------------------------------------------------------------------*/ + +typedef struct serverUnixContextConfig { + char *group; /* UNIX socket group */ + unsigned int perm; /* UNIX socket permission (see mode_t) */ +} serverUnixContextConfig; + /*----------------------------------------------------------------------------- * AOF manifest definition *----------------------------------------------------------------------------*/ @@ -1704,8 +1713,6 @@ struct valkeyServer { int bindaddr_count; /* Number of addresses in server.bindaddr[] */ char *bind_source_addr; /* Source address to bind on for outgoing connections */ char *unixsocket; /* UNIX socket path */ - char *unixsocketgroup; /* UNIX socket group */ - unsigned int unixsocketperm; /* UNIX socket permission (see mode_t) */ connListener listeners[CONN_TYPE_MAX]; /* TCP/Unix/TLS even more types */ uint32_t socket_mark_id; /* ID for listen socket marking */ connListener clistener; /* Cluster bus listener */ @@ -2202,6 +2209,7 @@ struct valkeyServer { int tls_replication; int tls_auth_clients; serverTLSContextConfig tls_ctx_config; + serverUnixContextConfig unix_ctx_config; /* cpu affinity */ char *server_cpulist; /* cpu affinity list of server main/io thread. */ char *bio_cpulist; /* cpu affinity list of bio thread. 
*/
diff --git a/src/unix.c b/src/unix.c
index ddfd73465a..35778779f9 100644
--- a/src/unix.c
+++ b/src/unix.c
@@ -51,8 +51,9 @@ static int connUnixIsLocal(connection *conn) {

 static int connUnixListen(connListener *listener) {
     int fd;
-    mode_t *perm = (mode_t *)listener->priv1;
-    char *group = (char *)listener->priv2;
+    serverUnixContextConfig *ctx_cfg = listener->priv;
+    mode_t perm = ctx_cfg->perm;
+    char *group = ctx_cfg->group;

     if (listener->bindaddr_count == 0) return C_OK;

@@ -62,7 +63,7 @@ static int connUnixListen(connListener *listener) {
         char *addr = listener->bindaddr[j];

         unlink(addr); /* don't care if this fails */
-        fd = anetUnixServer(server.neterr, addr, *perm, server.tcp_backlog, group);
+        fd = anetUnixServer(server.neterr, addr, perm, server.tcp_backlog, group);
         if (fd == ANET_ERR) {
             serverLog(LL_WARNING, "Failed opening Unix socket: %s", server.neterr);
             exit(1);

From 748b68d5821b6d3b4418c0f5bfb9afa120ab4dc0 Mon Sep 17 00:00:00 2001
From: Binbin
Date: Sat, 19 Oct 2024 14:56:10 +0800
Subject: [PATCH 50/51] Fix SORT GET to ignore special pattern # in cluster
 slot check (#1182)

This special pattern '#' is used to get the element itself; it should
not participate in the slot check. Previously, passing `GET #` caused
'#' to participate in the slot check, making the command fail with a
`pattern may be in different slots` error.

Signed-off-by: Binbin
---
 src/db.c            |  4 ++--
 src/sort.c          |  8 +++++++-
 tests/unit/sort.tcl | 20 ++++++++++++--------
 3 files changed, 21 insertions(+), 11 deletions(-)

diff --git a/src/db.c b/src/db.c
index 00e6e7b2d6..ceb3105f9b 100644
--- a/src/db.c
+++ b/src/db.c
@@ -2418,9 +2418,9 @@ int bzmpopGetKeys(struct serverCommand *cmd, robj **argv, int argc, getKeysResul

 /* Helper function to extract keys from the SORT RO command.
 *
- * SORT
+ * SORT_RO
 *
- * The second argument of SORT is always a key, however an arbitrary number of
+ * The second argument of SORT_RO is always a key, however an arbitrary number of
 * keys may be accessed while doing the sort (the BY and GET args), so the
 * key-spec declares incomplete keys which is why we have to provide a concrete
 * implementation to fetch the keys.
diff --git a/src/sort.c b/src/sort.c
index f027b0c321..92777b068c 100644
--- a/src/sort.c
+++ b/src/sort.c
@@ -43,6 +43,11 @@ serverSortOperation *createSortOperation(int type, robj *pattern) {
     return so;
 }

+/* Return 1 if pattern is the special pattern '#'. */
+static int isReturnSubstPattern(sds pattern) {
+    return pattern[0] == '#' && pattern[1] == '\0';
+}
+
 /* Return the value associated to the key with a name obtained using
 * the following rules:
 *
@@ -68,7 +73,7 @@ robj *lookupKeyByPattern(serverDb *db, robj *pattern, robj *subst) {
     /* If the pattern is "#" return the substitution object itself in order
     * to implement the "SORT ... GET #" feature. */
     spat = pattern->ptr;
-    if (spat[0] == '#' && spat[1] == '\0') {
+    if (isReturnSubstPattern(spat)) {
         incrRefCount(subst);
         return subst;
     }
@@ -258,6 +263,7 @@ void sortCommandGeneric(client *c, int readonly) {
     * unless we can make sure the keys formed by the pattern are in the same slot
     * as the key to sort.
*/ if (server.cluster_enabled && + !isReturnSubstPattern(c->argv[j + 1]->ptr) && patternHashSlot(c->argv[j + 1]->ptr, sdslen(c->argv[j + 1]->ptr)) != getKeySlot(c->argv[1]->ptr)) { addReplyError(c, "GET option of SORT denied in Cluster mode when " "keys formed by the pattern may be in different slots."); diff --git a/tests/unit/sort.tcl b/tests/unit/sort.tcl index 397e7e12ea..cd171ee51e 100644 --- a/tests/unit/sort.tcl +++ b/tests/unit/sort.tcl @@ -384,24 +384,28 @@ start_cluster 1 0 {tags {"external:skip cluster sort"}} { test "sort by in cluster mode" { catch {r sort "{a}mylist" by by*} e assert_match {ERR BY option of SORT denied in Cluster mode when *} $e - r sort "{a}mylist" by "{a}by*" - } {3 1 2} + assert_equal {3 1 2} [r sort "{a}mylist" by "{a}by*"] + assert_equal {3 1 2} [r sort "{a}mylist" by "{a}by*" get #] + } test "sort get in cluster mode" { catch {r sort "{a}mylist" by "{a}by*" get get*} e assert_match {ERR GET option of SORT denied in Cluster mode when *} $e - r sort "{a}mylist" by "{a}by*" get "{a}get*" - } {30 200 100} + assert_equal {30 200 100} [r sort "{a}mylist" by "{a}by*" get "{a}get*"] + assert_equal {30 3 200 1 100 2} [r sort "{a}mylist" by "{a}by*" get "{a}get*" get #] + } test "sort_ro by in cluster mode" { catch {r sort_ro "{a}mylist" by by*} e assert_match {ERR BY option of SORT denied in Cluster mode when *} $e - r sort_ro "{a}mylist" by "{a}by*" - } {3 1 2} + assert_equal {3 1 2} [r sort_ro "{a}mylist" by "{a}by*"] + assert_equal {3 1 2} [r sort_ro "{a}mylist" by "{a}by*" get #] + } test "sort_ro get in cluster mode" { catch {r sort_ro "{a}mylist" by "{a}by*" get get*} e assert_match {ERR GET option of SORT denied in Cluster mode when *} $e - r sort_ro "{a}mylist" by "{a}by*" get "{a}get*" - } {30 200 100} + assert_equal {30 200 100} [r sort_ro "{a}mylist" by "{a}by*" get "{a}get*"] + assert_equal {30 3 200 1 100 2} [r sort_ro "{a}mylist" by "{a}by*" get "{a}get*" get #] + } } From 110d5b92bce1a0f3d54fc6c813cfa4fa00f103e9 Mon Sep 17 00:00:00 2001 From: Eran Ifrah Date: Sun, 20 Oct 2024 19:10:17 +0300 Subject: [PATCH 51/51] Added CMake build system Signed-off-by: Eran Ifrah --- .cmake-format.yaml | 76 +++++++ CMakeLists.txt | 31 +++ cmake/Modules/Packaging.cmake | 44 ++++ cmake/Modules/SourceFiles.cmake | 149 +++++++++++++ cmake/Modules/ValkeySetup.cmake | 349 ++++++++++++++++++++++++++++++ deps/CMakeLists.txt | 26 +++ deps/fpconv/CMakeLists.txt | 4 + deps/hdr_histogram/CMakeLists.txt | 7 + deps/jemalloc/CMakeLists.txt | 22 ++ deps/linenoise/CMakeLists.txt | 4 + deps/lua/CMakeLists.txt | 19 ++ src/CMakeLists.txt | 61 ++++++ src/debug.c | 2 + src/server.c | 3 +- src/unit/CMakeLists.txt | 29 +++ 15 files changed, 825 insertions(+), 1 deletion(-) create mode 100644 .cmake-format.yaml create mode 100644 CMakeLists.txt create mode 100644 cmake/Modules/Packaging.cmake create mode 100644 cmake/Modules/SourceFiles.cmake create mode 100644 cmake/Modules/ValkeySetup.cmake create mode 100644 deps/CMakeLists.txt create mode 100644 deps/fpconv/CMakeLists.txt create mode 100644 deps/hdr_histogram/CMakeLists.txt create mode 100644 deps/jemalloc/CMakeLists.txt create mode 100644 deps/linenoise/CMakeLists.txt create mode 100644 deps/lua/CMakeLists.txt create mode 100644 src/CMakeLists.txt create mode 100644 src/unit/CMakeLists.txt diff --git a/.cmake-format.yaml b/.cmake-format.yaml new file mode 100644 index 0000000000..061d458542 --- /dev/null +++ b/.cmake-format.yaml @@ -0,0 +1,76 @@ +format: + _help_line_width: + - How wide to allow formatted cmake files + line_width: 
120
+  _help_tab_size:
+  - How many spaces to tab for indent
+  tab_size: 4
+  _help_use_tabchars:
+  - If true, lines are indented using tab characters (utf-8
+  - 0x09) instead of space characters (utf-8 0x20).
+  - In cases where the layout would require a fractional tab
+  - character, the behavior of the fractional indentation is
+  - governed by
+  use_tabchars: false
+  _help_separate_ctrl_name_with_space:
+  - If true, separate flow control names from their parentheses
+  - with a space
+  separate_ctrl_name_with_space: true
+  _help_min_prefix_chars:
+  - If the statement spelling length (including space and
+  - parenthesis) is smaller than this amount, then force reject
+  - nested layouts.
+  min_prefix_chars: 4
+  _help_max_prefix_chars:
+  - If the statement spelling length (including space and
+  - parenthesis) is larger than the tab width by more than this
+  - amount, then force reject un-nested layouts.
+  max_prefix_chars: 10
+  _help_max_lines_hwrap:
+  - If a candidate layout is wrapped horizontally but it exceeds
+  - this many lines, then reject the layout.
+  max_lines_hwrap: 2
+  _help_line_ending:
+  - What style line endings to use in the output.
+  line_ending: unix
+  _help_command_case:
+  - Format command names consistently as 'lower' or 'upper' case
+  command_case: canonical
+  _help_keyword_case:
+  - Format keywords consistently as 'lower' or 'upper' case
+  keyword_case: unchanged
+  _help_always_wrap:
+  - A list of command names which should always be wrapped
+  always_wrap: []
+  _help_enable_sort:
+  - If true, the argument lists which are known to be sortable
+  - will be sorted lexicographically
+  enable_sort: true
+  _help_autosort:
+  - If true, the parsers may infer whether or not an argument
+  - list is sortable (without annotation).
+  autosort: false
+  _help_require_valid_layout:
+  - By default, if cmake-format cannot successfully fit
+  - everything into the desired linewidth it will apply the
+  - last, most aggressive attempt that it made. If this flag is
+  - True, however, cmake-format will print error, exit with non-
+  - zero status code, and write-out nothing
+  require_valid_layout: false
+  _help_layout_passes:
+  - A dictionary mapping layout nodes to a list of wrap
+  - decisions. See the documentation for more information.
+  layout_passes: {}
+encode:
+  _help_emit_byteorder_mark:
+  - If true, emit the unicode byte-order mark (BOM) at the start
+  - of the file
+  emit_byteorder_mark: false
+  _help_input_encoding:
+  - Specify the encoding of the input file. Defaults to utf-8
+  input_encoding: utf-8
+  _help_output_encoding:
+  - Specify the encoding of the output file. Defaults to utf-8.
+ - Note that cmake only claims to support utf-8 so be careful + - when using anything else + output_encoding: utf-8 diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000000..796d121f56 --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,31 @@ +cmake_minimum_required(VERSION 3.20) + +# Must be done first +if (APPLE) + # Force clang compiler on macOS + set(CMAKE_CXX_COMPILER "/usr/bin/clang++") + set(CMAKE_C_COMPILER "/usr/bin/clang") +endif () + +# Options +option(WITH_TLS "Build valkey-server with TLS support" OFF) +option(BUILD_TESTS "Build valkey-unit-tests" OFF) +option(WITH_RDMA "Build valkey-rdma module" OFF) + +set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake/Modules/") +project("valkey") + +set(CMAKE_C_STANDARD 11) +set(CMAKE_C_STANDARD_REQUIRED ON) +set(CMAKE_C_EXTENSIONS OFF) + +include(ValkeySetup) +add_subdirectory(src) + +# Include the packaging module +include(Packaging) + +# Clear variables declared on this file from the cache +unset(WITH_TLS CACHE) +unset(BUILD_TESTS CACHE) +unset(WITH_RDMA CACHE) diff --git a/cmake/Modules/Packaging.cmake b/cmake/Modules/Packaging.cmake new file mode 100644 index 0000000000..c9b0abac0b --- /dev/null +++ b/cmake/Modules/Packaging.cmake @@ -0,0 +1,44 @@ +set(CPACK_PACKAGE_NAME "valkey") +set(CPACK_PACKAGE_VERSION_MAJOR 8) +set(CPACK_PACKAGE_VERSION_MINOR 1) +set(CPACK_PACKAGE_VERSION_PATCH 0) +set(CPACK_PACKAGE_CONTACT "maintainers@lists.valkey.io") +set(CPACK_PACKAGE_DESCRIPTION_SUMMARY "Valkey is an open source (BSD) high-performance key/value datastore") +set(CPACK_RESOURCE_FILE_LICENSE "${CMAKE_SOURCE_DIR}/COPYING") +set(CPACK_RESOURCE_FILE_README "${CMAKE_SOURCE_DIR}/README.md") +set(CPACK_STRIP_FILES TRUE) + +valkey_get_distro_name(DISTRO_NAME) +message(STATUS "Current host distro: ${DISTRO_NAME}") + +if (DISTRO_NAME MATCHES ubuntu + OR DISTRO_NAME MATCHES debian + OR DISTRO_NAME MATCHES mint) + message(STATUS "Adding target package for ${DISTRO_NAME}") + set(CPACK_PACKAGING_INSTALL_PREFIX "/opt/valkey") + # Debian related parameters + set(CPACK_DEBIAN_PACKAGE_MAINTAINER "Valkey contributors") + set(CPACK_DEBIAN_PACKAGE_SHLIBDEPS ON) + set(CPACK_DEBIAN_FILE_NAME DEB-DEFAULT) + set(CPACK_GENERATOR "DEB") +endif () + +include(CPack) +unset(DISTRO_NAME CACHE) + +# --------------------------------------------------- +# Create a helper script for creating symbolic links +# --------------------------------------------------- +write_file( + ${CMAKE_BINARY_DIR}/CreateSymlink.sh + "\ +#!/bin/bash \n\ +if [ -z \${DESTDIR} ]; then \n\ + # Script is called during 'make install' \n\ + PREFIX=${CMAKE_INSTALL_PREFIX}/bin \n\ +else \n\ + # Script is called during 'make package' \n\ + PREFIX=\${DESTDIR}${CPACK_PACKAGING_INSTALL_PREFIX}/bin \n\ +fi \n\ +cd \$PREFIX \n\ +ln -sf \$1 \$2") diff --git a/cmake/Modules/SourceFiles.cmake b/cmake/Modules/SourceFiles.cmake new file mode 100644 index 0000000000..29205c640e --- /dev/null +++ b/cmake/Modules/SourceFiles.cmake @@ -0,0 +1,149 @@ +# ------------------------------------------------- +# Define the sources to be built +# ------------------------------------------------- + +# libvalkeylib.a sources +set(VALKEY_LIB_SRCS + ${CMAKE_SOURCE_DIR}/src/threads_mngr.c + ${CMAKE_SOURCE_DIR}/src/adlist.c + ${CMAKE_SOURCE_DIR}/src/quicklist.c + ${CMAKE_SOURCE_DIR}/src/ae.c + ${CMAKE_SOURCE_DIR}/src/anet.c + ${CMAKE_SOURCE_DIR}/src/dict.c + ${CMAKE_SOURCE_DIR}/src/kvstore.c + ${CMAKE_SOURCE_DIR}/src/sds.c + ${CMAKE_SOURCE_DIR}/src/zmalloc.c + ${CMAKE_SOURCE_DIR}/src/lzf_c.c + 
${CMAKE_SOURCE_DIR}/src/lzf_d.c + ${CMAKE_SOURCE_DIR}/src/pqsort.c + ${CMAKE_SOURCE_DIR}/src/zipmap.c + ${CMAKE_SOURCE_DIR}/src/sha1.c + ${CMAKE_SOURCE_DIR}/src/ziplist.c + ${CMAKE_SOURCE_DIR}/src/release.c + ${CMAKE_SOURCE_DIR}/src/memory_prefetch.c + ${CMAKE_SOURCE_DIR}/src/io_threads.c + ${CMAKE_SOURCE_DIR}/src/networking.c + ${CMAKE_SOURCE_DIR}/src/util.c + ${CMAKE_SOURCE_DIR}/src/object.c + ${CMAKE_SOURCE_DIR}/src/db.c + ${CMAKE_SOURCE_DIR}/src/replication.c + ${CMAKE_SOURCE_DIR}/src/rdb.c + ${CMAKE_SOURCE_DIR}/src/t_string.c + ${CMAKE_SOURCE_DIR}/src/t_list.c + ${CMAKE_SOURCE_DIR}/src/t_set.c + ${CMAKE_SOURCE_DIR}/src/t_zset.c + ${CMAKE_SOURCE_DIR}/src/t_hash.c + ${CMAKE_SOURCE_DIR}/src/config.c + ${CMAKE_SOURCE_DIR}/src/aof.c + ${CMAKE_SOURCE_DIR}/src/pubsub.c + ${CMAKE_SOURCE_DIR}/src/multi.c + ${CMAKE_SOURCE_DIR}/src/debug.c + ${CMAKE_SOURCE_DIR}/src/sort.c + ${CMAKE_SOURCE_DIR}/src/intset.c + ${CMAKE_SOURCE_DIR}/src/syncio.c + ${CMAKE_SOURCE_DIR}/src/cluster.c + ${CMAKE_SOURCE_DIR}/src/cluster_legacy.c + ${CMAKE_SOURCE_DIR}/src/cluster_slot_stats.c + ${CMAKE_SOURCE_DIR}/src/crc16.c + ${CMAKE_SOURCE_DIR}/src/endianconv.c + ${CMAKE_SOURCE_DIR}/src/slowlog.c + ${CMAKE_SOURCE_DIR}/src/eval.c + ${CMAKE_SOURCE_DIR}/src/bio.c + ${CMAKE_SOURCE_DIR}/src/rio.c + ${CMAKE_SOURCE_DIR}/src/rand.c + ${CMAKE_SOURCE_DIR}/src/memtest.c + ${CMAKE_SOURCE_DIR}/src/syscheck.c + ${CMAKE_SOURCE_DIR}/src/crcspeed.c + ${CMAKE_SOURCE_DIR}/src/crccombine.c + ${CMAKE_SOURCE_DIR}/src/crc64.c + ${CMAKE_SOURCE_DIR}/src/bitops.c + ${CMAKE_SOURCE_DIR}/src/sentinel.c + ${CMAKE_SOURCE_DIR}/src/notify.c + ${CMAKE_SOURCE_DIR}/src/setproctitle.c + ${CMAKE_SOURCE_DIR}/src/blocked.c + ${CMAKE_SOURCE_DIR}/src/hyperloglog.c + ${CMAKE_SOURCE_DIR}/src/latency.c + ${CMAKE_SOURCE_DIR}/src/sparkline.c + ${CMAKE_SOURCE_DIR}/src/valkey-check-rdb.c + ${CMAKE_SOURCE_DIR}/src/valkey-check-aof.c + ${CMAKE_SOURCE_DIR}/src/geo.c + ${CMAKE_SOURCE_DIR}/src/lazyfree.c + ${CMAKE_SOURCE_DIR}/src/module.c + ${CMAKE_SOURCE_DIR}/src/evict.c + ${CMAKE_SOURCE_DIR}/src/expire.c + ${CMAKE_SOURCE_DIR}/src/geohash.c + ${CMAKE_SOURCE_DIR}/src/geohash_helper.c + ${CMAKE_SOURCE_DIR}/src/childinfo.c + ${CMAKE_SOURCE_DIR}/src/defrag.c + ${CMAKE_SOURCE_DIR}/src/siphash.c + ${CMAKE_SOURCE_DIR}/src/rax.c + ${CMAKE_SOURCE_DIR}/src/t_stream.c + ${CMAKE_SOURCE_DIR}/src/listpack.c + ${CMAKE_SOURCE_DIR}/src/localtime.c + ${CMAKE_SOURCE_DIR}/src/lolwut.c + ${CMAKE_SOURCE_DIR}/src/lolwut5.c + ${CMAKE_SOURCE_DIR}/src/lolwut6.c + ${CMAKE_SOURCE_DIR}/src/acl.c + ${CMAKE_SOURCE_DIR}/src/tracking.c + ${CMAKE_SOURCE_DIR}/src/socket.c + ${CMAKE_SOURCE_DIR}/src/tls.c + ${CMAKE_SOURCE_DIR}/src/sha256.c + ${CMAKE_SOURCE_DIR}/src/timeout.c + ${CMAKE_SOURCE_DIR}/src/setcpuaffinity.c + ${CMAKE_SOURCE_DIR}/src/monotonic.c + ${CMAKE_SOURCE_DIR}/src/mt19937-64.c + ${CMAKE_SOURCE_DIR}/src/resp_parser.c + ${CMAKE_SOURCE_DIR}/src/call_reply.c + ${CMAKE_SOURCE_DIR}/src/script_lua.c + ${CMAKE_SOURCE_DIR}/src/script.c + ${CMAKE_SOURCE_DIR}/src/functions.c + ${CMAKE_SOURCE_DIR}/src/function_lua.c + ${CMAKE_SOURCE_DIR}/src/commands.c + ${CMAKE_SOURCE_DIR}/src/strl.c + ${CMAKE_SOURCE_DIR}/src/connection.c + ${CMAKE_SOURCE_DIR}/src/unix.c + ${CMAKE_SOURCE_DIR}/src/logreqres.c) + +# valkey-cli +set(VALKEY_CLI_SRCS + ${CMAKE_SOURCE_DIR}/src/anet.c + ${CMAKE_SOURCE_DIR}/src/adlist.c + ${CMAKE_SOURCE_DIR}/src/dict.c + ${CMAKE_SOURCE_DIR}/src/valkey-cli.c + ${CMAKE_SOURCE_DIR}/src/zmalloc.c + ${CMAKE_SOURCE_DIR}/src/release.c + ${CMAKE_SOURCE_DIR}/src/ae.c + 
${CMAKE_SOURCE_DIR}/src/serverassert.c
+    ${CMAKE_SOURCE_DIR}/src/crcspeed.c
+    ${CMAKE_SOURCE_DIR}/src/crccombine.c
+    ${CMAKE_SOURCE_DIR}/src/crc64.c
+    ${CMAKE_SOURCE_DIR}/src/siphash.c
+    ${CMAKE_SOURCE_DIR}/src/crc16.c
+    ${CMAKE_SOURCE_DIR}/src/monotonic.c
+    ${CMAKE_SOURCE_DIR}/src/cli_common.c
+    ${CMAKE_SOURCE_DIR}/src/mt19937-64.c
+    ${CMAKE_SOURCE_DIR}/src/strl.c
+    ${CMAKE_SOURCE_DIR}/src/cli_commands.c)
+
+# valkey-benchmark
+set(VALKEY_BENCHMARK_SRCS
+    ${CMAKE_SOURCE_DIR}/src/ae.c
+    ${CMAKE_SOURCE_DIR}/src/anet.c
+    ${CMAKE_SOURCE_DIR}/src/valkey-benchmark.c
+    ${CMAKE_SOURCE_DIR}/src/adlist.c
+    ${CMAKE_SOURCE_DIR}/src/dict.c
+    ${CMAKE_SOURCE_DIR}/src/zmalloc.c
+    ${CMAKE_SOURCE_DIR}/src/serverassert.c
+    ${CMAKE_SOURCE_DIR}/src/release.c
+    ${CMAKE_SOURCE_DIR}/src/crcspeed.c
+    ${CMAKE_SOURCE_DIR}/src/crccombine.c
+    ${CMAKE_SOURCE_DIR}/src/crc64.c
+    ${CMAKE_SOURCE_DIR}/src/siphash.c
+    ${CMAKE_SOURCE_DIR}/src/crc16.c
+    ${CMAKE_SOURCE_DIR}/src/monotonic.c
+    ${CMAKE_SOURCE_DIR}/src/cli_common.c
+    ${CMAKE_SOURCE_DIR}/src/mt19937-64.c
+    ${CMAKE_SOURCE_DIR}/src/strl.c)
+
+# valkey-rdma module
+set(VALKEY_RDMA_MODULE_SRCS ${CMAKE_SOURCE_DIR}/src/rdma.c)
diff --git a/cmake/Modules/ValkeySetup.cmake b/cmake/Modules/ValkeySetup.cmake
new file mode 100644
index 0000000000..0ddc954802
--- /dev/null
+++ b/cmake/Modules/ValkeySetup.cmake
@@ -0,0 +1,349 @@
+include(CheckIncludeFiles)
+
+set(CMAKE_LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib")
+set(CMAKE_RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin")
+set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib")
+
+# Generate compile_commands.json for IDE code completion support
+set(CMAKE_EXPORT_COMPILE_COMMANDS 1)
+
+# Installed executables will have these permissions
+set(VALKEY_EXE_PERMISSIONS
+    OWNER_EXECUTE
+    OWNER_WRITE
+    OWNER_READ
+    GROUP_EXECUTE
+    GROUP_READ
+    WORLD_EXECUTE
+    WORLD_READ)
+
+set(VALKEY_SERVER_CFLAGS "")
+set(VALKEY_SERVER_LDFLAGS "")
+
+# ----------------------------------------------------
+# Helper functions & macros
+# ----------------------------------------------------
+macro (add_valkey_server_compiler_options value)
+    set(VALKEY_SERVER_CFLAGS "${VALKEY_SERVER_CFLAGS} ${value}")
+endmacro ()
+
+macro (add_valkey_server_linker_option value)
+    list(APPEND VALKEY_SERVER_LDFLAGS ${value})
+endmacro ()
+
+macro (get_valkey_server_linker_option return_value)
+    list(JOIN VALKEY_SERVER_LDFLAGS " " ${return_value})
+endmacro ()
+
+set(IS_FREEBSD 0)
+if (CMAKE_SYSTEM_NAME MATCHES "^.*BSD$|DragonFly")
+    message(STATUS "Building for FreeBSD compatible system")
+    set(IS_FREEBSD 1)
+    include_directories("/usr/local/include")
+    add_valkey_server_compiler_options("-DUSE_BACKTRACE")
+endif ()
+
+# Helper function for creating a symbolic link so that: link -> source
+macro (valkey_create_symlink source link)
+    install(
+        CODE "execute_process( \
+        COMMAND /bin/bash ${CMAKE_BINARY_DIR}/CreateSymlink.sh \
+        ${source} \
+        ${link} \
+        )"
+        COMPONENT "valkey")
+endmacro ()
+
+# Install a binary
+macro (valkey_install_bin target)
+    # Install the binary with the permissions defined above
+    install(
+        TARGETS ${target}
+        DESTINATION ${CMAKE_INSTALL_BINDIR}
+        PERMISSIONS ${VALKEY_EXE_PERMISSIONS}
+        COMPONENT "valkey")
+endmacro ()
+
+# Helper function that defines, builds and installs `target`. In addition, it creates a symbolic link between the
+# target and `link_name`
+macro (valkey_build_and_install_bin target sources ld_flags libs link_name)
+    add_executable(${target} ${sources})
+
+    if (USE_JEMALLOC)
+        # Using jemalloc
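+        # (note: "jemalloc" here is the static archive imported as a CMake
+        # target in deps/jemalloc/CMakeLists.txt, not a system library)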
+        target_link_libraries(${target} jemalloc)
+    endif ()
+
+    # Place this line last to ensure that ${ld_flags} is placed last on the linker line
+    target_link_libraries(${target} ${libs} ${ld_flags})
+    target_link_libraries(${target} hiredis)
+    if (USE_TLS)
+        # Add required libraries needed for TLS
+        target_link_libraries(${target} OpenSSL::SSL hiredis_ssl)
+    endif ()
+
+    if (IS_FREEBSD)
+        target_link_libraries(${target} execinfo)
+    endif ()
+
+    # Install the binary and create a redis-compatible symbolic link
+    valkey_install_bin(${target})
+    valkey_create_symlink(${target} ${link_name})
+endmacro ()
+
+# Return the current host distro name. For example: ubuntu, debian, amzn etc.
+function (valkey_get_distro_name DISTRO_NAME)
+    if (LINUX AND NOT APPLE)
+        execute_process(
+            COMMAND /bin/bash "-c" "cat /etc/os-release |grep ^ID=|cut -d = -f 2"
+            OUTPUT_VARIABLE _OUT_VAR
+            OUTPUT_STRIP_TRAILING_WHITESPACE)
+        # clean the output
+        string(REPLACE "\"" "" _OUT_VAR "${_OUT_VAR}")
+        string(REPLACE "." "" _OUT_VAR "${_OUT_VAR}")
+        set(${DISTRO_NAME}
+            "${_OUT_VAR}"
+            PARENT_SCOPE)
+    elseif (APPLE)
+        set(${DISTRO_NAME}
+            "darwin"
+            PARENT_SCOPE)
+    elseif (IS_FREEBSD)
+        set(${DISTRO_NAME}
+            "freebsd"
+            PARENT_SCOPE)
+    else ()
+        set(${DISTRO_NAME}
+            "unknown"
+            PARENT_SCOPE)
+    endif ()
+endfunction ()
+
+# Determine if we are building in Release or Debug mode
+if (CMAKE_BUILD_TYPE MATCHES Debug OR CMAKE_BUILD_TYPE MATCHES DebugFull)
+    set(VALKEY_DEBUG_BUILD 1)
+    message(STATUS "Building in debug mode")
+else ()
+    set(VALKEY_DEBUG_BUILD 0)
+    message(STATUS "Building in release mode")
+endif ()
+
+# ----------------------------------------------------
+# Helper functions - end
+# ----------------------------------------------------
+
+# ----------------------------------------------------
+# Build options (allocator, tls, rdma et al)
+# ----------------------------------------------------
+
+if (NOT WITH_MALLOC)
+    if (APPLE)
+        set(WITH_MALLOC "libc")
+    elseif (UNIX)
+        set(WITH_MALLOC "jemalloc")
+    endif ()
+endif ()
+
+# The user may pass a different allocator library using -DWITH_MALLOC=<value>.
+# Make sure the requested allocator is one we support:
+if (WITH_MALLOC)
+    if ("${WITH_MALLOC}" STREQUAL "jemalloc")
+        set(MALLOC_LIB "jemalloc")
+        add_valkey_server_compiler_options("-DUSE_JEMALLOC")
+        set(USE_JEMALLOC 1)
+    elseif ("${WITH_MALLOC}" STREQUAL "libc")
+        set(MALLOC_LIB "libc")
+    elseif ("${WITH_MALLOC}" STREQUAL "tcmalloc")
+        set(MALLOC_LIB "tcmalloc")
+        add_valkey_server_compiler_options("-DUSE_TCMALLOC")
+    elseif ("${WITH_MALLOC}" STREQUAL "tcmalloc_minimal")
+        set(MALLOC_LIB "tcmalloc_minimal")
+        add_valkey_server_compiler_options("-DUSE_TCMALLOC")
+    else ()
+        message(FATAL_ERROR "WITH_MALLOC can be one of: jemalloc, libc, tcmalloc or tcmalloc_minimal")
+    endif ()
+endif ()
+
+message(STATUS "Using ${MALLOC_LIB}")
+
+# TLS support
+if (WITH_TLS)
+    find_package(OpenSSL REQUIRED)
+    message(STATUS "OpenSSL include dir: ${OPENSSL_INCLUDE_DIR}")
+    message(STATUS "OpenSSL libraries: ${OPENSSL_LIBRARIES}")
+
+    include_directories(${OPENSSL_INCLUDE_DIR})
+    add_valkey_server_compiler_options("-DUSE_OPENSSL=1")
+    add_valkey_server_compiler_options("-DBUILD_TLS_MODULE=0")
+    set(USE_TLS 1)
+endif ()
+
+if (WITH_RDMA)
+    # RDMA support (Linux only)
+    if (LINUX AND NOT APPLE)
+        add_valkey_server_compiler_options("-DUSE_RDMA=2")
+        find_package(PkgConfig REQUIRED)
+        pkg_check_modules(VALKEY_RDMA REQUIRED librdmacm)
+        pkg_check_modules(VALKEY_IBVERBS REQUIRED libibverbs)
+        list(APPEND RDMA_LIBS "${VALKEY_RDMA_LIBRARIES};${VALKEY_IBVERBS_LIBRARIES}")
+    else ()
+        message(WARNING "RDMA is only supported on Linux platforms")
+    endif ()
+endif ()
+
+set(BUILDING_ARM64 0)
+set(BUILDING_ARM32 0)
+
+if ("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "arm64")
+    set(BUILDING_ARM64 1)
+endif ()
+
+if ("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "arm")
+    set(BUILDING_ARM32 1)
+endif ()
+
+message(STATUS "Building on ${CMAKE_HOST_SYSTEM_NAME}")
+if (BUILDING_ARM64)
+    message(STATUS "Compiling valkey for ARM64")
+    add_valkey_server_linker_option("-funwind-tables")
+endif ()
+
+if (APPLE)
+    add_valkey_server_linker_option("-rdynamic")
+    add_valkey_server_linker_option("-ldl")
+elseif (UNIX)
+    add_valkey_server_linker_option("-rdynamic")
+    add_valkey_server_linker_option("-pthread")
+    add_valkey_server_linker_option("-ldl")
+    add_valkey_server_linker_option("-lm")
+endif ()
+
+if (VALKEY_DEBUG_BUILD)
+    # Debug build: enable "-fno-omit-frame-pointer"
+    add_valkey_server_compiler_options("-fno-omit-frame-pointer")
+else ()
+    # Release build: enable LTO
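+    # (note: "-flto=auto" is GCC-specific syntax; Apple Clang only accepts
+    # plain "-flto", hence the platform split below)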
+    if (APPLE)
+        add_valkey_server_compiler_options("-flto")
+    else ()
+        add_valkey_server_compiler_options("-flto=auto")
+    endif ()
+endif ()
+
+# Check for Atomic
+check_include_files(stdatomic.h HAVE_C11_ATOMIC)
+if (HAVE_C11_ATOMIC)
+    add_valkey_server_compiler_options("-std=gnu11")
+else ()
+    add_valkey_server_compiler_options("-std=c99")
+endif ()
+
+# Sanitizer
+if (WITH_SANITIZER)
+    # For best results, force libc
+    set(MALLOC_LIB "libc")
+    if ("${WITH_SANITIZER}" STREQUAL "address")
+        add_valkey_server_compiler_options("-fsanitize=address -fno-sanitize-recover=all -fno-omit-frame-pointer")
+        add_valkey_server_linker_option("-fsanitize=address")
+    elseif ("${WITH_SANITIZER}" STREQUAL "thread")
+        add_valkey_server_compiler_options("-fsanitize=thread -fno-sanitize-recover=all -fno-omit-frame-pointer")
+        add_valkey_server_linker_option("-fsanitize=thread")
+    elseif ("${WITH_SANITIZER}" STREQUAL "undefined")
+        add_valkey_server_compiler_options("-fsanitize=undefined -fno-sanitize-recover=all -fno-omit-frame-pointer")
+        add_valkey_server_linker_option("-fsanitize=undefined")
+    else ()
+        message(FATAL_ERROR "Unknown sanitizer: ${WITH_SANITIZER}")
+    endif ()
+endif ()
+
+include_directories("${CMAKE_SOURCE_DIR}/deps/hiredis")
+include_directories("${CMAKE_SOURCE_DIR}/deps/linenoise")
+include_directories("${CMAKE_SOURCE_DIR}/deps/lua/src")
+include_directories("${CMAKE_SOURCE_DIR}/deps/hdr_histogram")
+include_directories("${CMAKE_SOURCE_DIR}/deps/fpconv")
+
+add_subdirectory("${CMAKE_SOURCE_DIR}/deps")
+
+# Update include directories for the allocator
+if (USE_JEMALLOC)
+    include_directories("${CMAKE_SOURCE_DIR}/deps/jemalloc/include")
+endif ()
+
+# Common compiler flags
+add_valkey_server_compiler_options("-pedantic")
+
+# ----------------------------------------------------
+# Build options (allocator, tls, rdma et al) - end
+# ----------------------------------------------------
+
+# -------------------------------------------------
+# Code Generation section
+# -------------------------------------------------
+find_program(PYTHON_EXE python3)
+if (PYTHON_EXE)
+    # Python based code generation
+    message(STATUS "Found python3: ${PYTHON_EXE}")
+    # Rule for generating commands.def file from json files
+    message(STATUS "Adding target generate_commands_def")
+    file(GLOB COMMAND_FILES_JSON "${CMAKE_SOURCE_DIR}/src/commands/*.json")
+    add_custom_command(
+        OUTPUT ${CMAKE_BINARY_DIR}/commands_def_generated
+        DEPENDS ${COMMAND_FILES_JSON}
+        COMMAND ${PYTHON_EXE} ${CMAKE_SOURCE_DIR}/utils/generate-command-code.py
+        COMMAND touch ${CMAKE_BINARY_DIR}/commands_def_generated
+        WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}/src")
+    add_custom_target(generate_commands_def DEPENDS ${CMAKE_BINARY_DIR}/commands_def_generated)
+
+    # Rule for generating fmtargs.h
+    message(STATUS "Adding target generate_fmtargs_h")
+    add_custom_command(
+        OUTPUT ${CMAKE_BINARY_DIR}/fmtargs_generated
+        DEPENDS ${CMAKE_SOURCE_DIR}/utils/generate-fmtargs.py
+        COMMAND sed '/Everything/,$$d' fmtargs.h > fmtargs.h.tmp
+        COMMAND ${PYTHON_EXE} ${CMAKE_SOURCE_DIR}/utils/generate-fmtargs.py >> fmtargs.h.tmp
+        COMMAND mv fmtargs.h.tmp fmtargs.h
+        COMMAND touch ${CMAKE_BINARY_DIR}/fmtargs_generated
+        WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}/src")
+    add_custom_target(generate_fmtargs_h DEPENDS ${CMAKE_BINARY_DIR}/fmtargs_generated)
+
+    # Rule for generating test_files.h
+    message(STATUS "Adding target generate_test_files_h")
+    file(GLOB UNIT_TEST_SRCS "${CMAKE_SOURCE_DIR}/src/unit/*.c")
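+    # (note: each generator rule touches a stamp file under ${CMAKE_BINARY_DIR}
+    # so it only reruns when its DEPENDS change; the stamp names are arbitrary)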
+    add_custom_command(
+        OUTPUT ${CMAKE_BINARY_DIR}/test_files_generated
+        DEPENDS "${UNIT_TEST_SRCS};${CMAKE_SOURCE_DIR}/utils/generate-unit-test-header.py"
+        COMMAND ${PYTHON_EXE} ${CMAKE_SOURCE_DIR}/utils/generate-unit-test-header.py
+        COMMAND touch ${CMAKE_BINARY_DIR}/test_files_generated
+        WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}/src")
+    add_custom_target(generate_test_files_h DEPENDS ${CMAKE_BINARY_DIR}/test_files_generated)
+else ()
+    # Fake targets
+    add_custom_target(generate_commands_def)
+    add_custom_target(generate_fmtargs_h)
+    add_custom_target(generate_test_files_h)
+endif ()
+
+# Generate release.h file (always)
+add_custom_target(
+    release_header
+    COMMAND sh -c '${CMAKE_SOURCE_DIR}/src/mkreleasehdr.sh'
+    WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}/src")
+
+# -------------------------------------------------
+# Code Generation section - end
+# -------------------------------------------------
+
+# ----------------------------------------------------------
+# All our source files are defined in SourceFiles.cmake file
+# ----------------------------------------------------------
+include(SourceFiles)
+
+# Clear the below variables from the cache
+unset(CMAKE_C_FLAGS CACHE)
+unset(WITH_SANITIZER CACHE)
+unset(VALKEY_SERVER_LDFLAGS CACHE)
+unset(VALKEY_SERVER_CFLAGS CACHE)
+unset(PYTHON_EXE CACHE)
+unset(HAVE_C11_ATOMIC CACHE)
+unset(USE_TLS CACHE)
+unset(WITH_MALLOC CACHE)
+unset(USE_JEMALLOC CACHE)
diff --git a/deps/CMakeLists.txt b/deps/CMakeLists.txt
new file mode 100644
index 0000000000..d86bdee12d
--- /dev/null
+++ b/deps/CMakeLists.txt
@@ -0,0 +1,26 @@
+add_subdirectory(jemalloc)
+add_subdirectory(lua)
+
+# Set hiredis options. We need to disable the defaults set in the OPTION(...) calls; we do this by setting them in the CACHE
+set(BUILD_SHARED_LIBS
+    OFF
+    CACHE BOOL "Build shared libraries")
+set(DISABLE_TESTS
+    ON
+    CACHE BOOL "If tests should be compiled or not")
+if (USE_TLS)
+    message(STATUS "Building hiredis_ssl")
+    set(ENABLE_SSL
+        ON
+        CACHE BOOL "Should we test SSL connections")
+endif ()
+
+add_subdirectory(hiredis)
+add_subdirectory(linenoise)
+add_subdirectory(fpconv)
+add_subdirectory(hdr_histogram)
+
+# Clear any cached variables passed to hiredis from the cache
+unset(BUILD_SHARED_LIBS CACHE)
+unset(DISABLE_TESTS CACHE)
+unset(ENABLE_SSL CACHE)
diff --git a/deps/fpconv/CMakeLists.txt b/deps/fpconv/CMakeLists.txt
new file mode 100644
index 0000000000..c586aa650a
--- /dev/null
+++ b/deps/fpconv/CMakeLists.txt
@@ -0,0 +1,4 @@
+project(fpconv)
+
+set(SRCS "${CMAKE_CURRENT_LIST_DIR}/fpconv_dtoa.c" "${CMAKE_CURRENT_LIST_DIR}/fpconv_dtoa.h")
+add_library(fpconv STATIC ${SRCS})
diff --git a/deps/hdr_histogram/CMakeLists.txt b/deps/hdr_histogram/CMakeLists.txt
new file mode 100644
index 0000000000..7b45bd76ba
--- /dev/null
+++ b/deps/hdr_histogram/CMakeLists.txt
@@ -0,0 +1,7 @@
+project(hdr_histogram)
+
+set(SRCS "${CMAKE_CURRENT_LIST_DIR}/hdr_histogram.c" "${CMAKE_CURRENT_LIST_DIR}/hdr_histogram.h"
+         "${CMAKE_CURRENT_LIST_DIR}/hdr_atomic.h" "${CMAKE_CURRENT_LIST_DIR}/hdr_redis_malloc.h")
+
+add_library(hdr_histogram STATIC ${SRCS})
+target_compile_definitions(hdr_histogram PRIVATE HDR_MALLOC_INCLUDE=\"hdr_redis_malloc.h\")
diff --git a/deps/jemalloc/CMakeLists.txt b/deps/jemalloc/CMakeLists.txt
new file mode 100644
index 0000000000..fc0be3a21f
--- /dev/null
+++ b/deps/jemalloc/CMakeLists.txt
@@ -0,0 +1,22 @@
+project(jemalloc)
+
+# Build jemalloc using configure && make install
+set(JEMALLOC_INSTALL_DIR ${CMAKE_BINARY_DIR}/jemalloc-build)
+set(JEMALLOC_SRC_DIR ${CMAKE_CURRENT_LIST_DIR})
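+# (note: jemalloc is configured and compiled once, at CMake configure time, and
+# imported below as a static IMPORTED target; removing ${CMAKE_BINARY_DIR}/jemalloc-build
+# forces a rebuild on the next configure run)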
+if (NOT EXISTS ${JEMALLOC_INSTALL_DIR}/lib/libjemalloc.a)
+    message(STATUS "Building jemalloc (custom build)")
+    message(STATUS "JEMALLOC_SRC_DIR = ${JEMALLOC_SRC_DIR}")
+    message(STATUS "JEMALLOC_INSTALL_DIR = ${JEMALLOC_INSTALL_DIR}")
+
+    execute_process(
+        COMMAND sh -c "${JEMALLOC_SRC_DIR}/configure --disable-cxx \
+        --with-version=5.3.0-0-g0 --with-lg-quantum=3 --disable-cache-oblivious --with-jemalloc-prefix=je_ \
+        --enable-static --disable-shared --prefix=${JEMALLOC_INSTALL_DIR}"
+        WORKING_DIRECTORY ${JEMALLOC_SRC_DIR} COMMAND_ERROR_IS_FATAL ANY)
+    execute_process(COMMAND make -j4 lib/libjemalloc.a install WORKING_DIRECTORY "${JEMALLOC_SRC_DIR}")
+endif ()
+
+# Import the compiled library as a CMake target
+add_library(jemalloc STATIC IMPORTED GLOBAL)
+set_target_properties(jemalloc PROPERTIES IMPORTED_LOCATION "${JEMALLOC_INSTALL_DIR}/lib/libjemalloc.a"
+                                          INCLUDE_DIRECTORIES "${JEMALLOC_INSTALL_DIR}/include")
diff --git a/deps/linenoise/CMakeLists.txt b/deps/linenoise/CMakeLists.txt
new file mode 100644
index 0000000000..f801e4abf1
--- /dev/null
+++ b/deps/linenoise/CMakeLists.txt
@@ -0,0 +1,4 @@
+project(linenoise)
+
+set(SRCS "${CMAKE_CURRENT_LIST_DIR}/linenoise.c" "${CMAKE_CURRENT_LIST_DIR}/linenoise.h")
+add_library(linenoise STATIC ${SRCS})
diff --git a/deps/lua/CMakeLists.txt b/deps/lua/CMakeLists.txt
new file mode 100644
index 0000000000..223c4d333c
--- /dev/null
+++ b/deps/lua/CMakeLists.txt
@@ -0,0 +1,19 @@
+project(lualib)
+
+# Build lua library
+set(LUA_INSTALL_DIR ${CMAKE_BINARY_DIR}/lua-build)
+set(LUA_SRC_DIR ${CMAKE_CURRENT_LIST_DIR})
+if (NOT EXISTS ${LUA_INSTALL_DIR}/lib/liblua.a)
+    message(STATUS "Building lua (custom build)")
+    execute_process(COMMAND mkdir -p "${LUA_INSTALL_DIR}" COMMAND_ERROR_IS_FATAL ANY)
+    execute_process(COMMAND make -j4 all WORKING_DIRECTORY "${LUA_SRC_DIR}/src" COMMAND_ERROR_IS_FATAL ANY)
+    execute_process(COMMAND make -j4 install INSTALL_TOP=${LUA_INSTALL_DIR}
+                    WORKING_DIRECTORY "${LUA_SRC_DIR}" COMMAND_ERROR_IS_FATAL ANY)
+endif ()
+
+# Import the compiled library as the CMake target "lualib"
+add_library(lualib STATIC IMPORTED GLOBAL)
+set_target_properties(lualib PROPERTIES IMPORTED_LOCATION "${LUA_INSTALL_DIR}/lib/liblua.a"
+                                        INCLUDE_DIRECTORIES "${LUA_INSTALL_DIR}/include")
+
+message(STATUS "lualib import from ${LUA_INSTALL_DIR}/lib/liblua.a")
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
new file mode 100644
index 0000000000..7c3f2bde76
--- /dev/null
+++ b/src/CMakeLists.txt
@@ -0,0 +1,61 @@
+project(valkey-server)
+
+set(INSTALL_BIN_PATH ${CMAKE_INSTALL_PREFIX}/bin)
+set_directory_properties(PROPERTIES CLEAN_NO_CUSTOM 1)
+
+# Create a library from Valkey sources
+add_library(valkeylib STATIC "${VALKEY_LIB_SRCS}")
+add_dependencies(valkeylib generate_fmtargs_h)
+add_dependencies(valkeylib release_header)
+add_dependencies(valkeylib generate_commands_def)
+
+if (BUILD_TESTS)
+    # Build unit tests only
+    message(STATUS "Building unit tests")
+    add_definitions(-DSERVER_TEST=1)
+    add_definitions(-DVALKEY_USE_TEST_MAIN)
+    add_definitions(-DVALKEY_USE_TEST_SERVER_ASSERT)
+    add_subdirectory(unit)
+else ()
+    # Target: valkey-server
+    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${VALKEY_SERVER_CFLAGS}")
+    message(STATUS "CFLAGS: ${CMAKE_C_FLAGS}")
+
+    get_valkey_server_linker_option(VALKEY_SERVER_LDFLAGS)
+    list(APPEND SERVER_LIBS "valkeylib")
+    list(APPEND SERVER_LIBS "fpconv")
+    list(APPEND SERVER_LIBS "lualib")
+    list(APPEND SERVER_LIBS "hdr_histogram")
+    set(VALKEY_SERVER_SRCS "${CMAKE_SOURCE_DIR}/src/server.c")
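+    # (note: the trailing "redis-server" argument below makes valkey_build_and_install_bin
+    # install a redis-server compatibility symlink next to the binary)
+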
valkey_build_and_install_bin(valkey-server "${VALKEY_SERVER_SRCS}" "${VALKEY_SERVER_LDFLAGS}" "${SERVER_LIBS}" + "redis-server") + + # Target: valkey-cli + list(APPEND CLI_LIBS "linenoise") + valkey_build_and_install_bin(valkey-cli "${VALKEY_CLI_SRCS}" "${VALKEY_SERVER_LDFLAGS}" "${CLI_LIBS}" "redis-cli") + add_dependencies(valkey-cli generate_commands_def) + add_dependencies(valkey-cli generate_fmtargs_h) + + # Target: valkey-benchmark + list(APPEND BENCH_LIBS "hdr_histogram") + valkey_build_and_install_bin(valkey-benchmark "${VALKEY_BENCHMARK_SRCS}" "${VALKEY_SERVER_LDFLAGS}" "${BENCH_LIBS}" + "redis-benchmark") + add_dependencies(valkey-benchmark generate_commands_def) + add_dependencies(valkey-benchmark generate_fmtargs_h) + + # Targets: valkey-sentinel, valkey-check-aof and valkey-check-rdb are just symbolic links + valkey_create_symlink("valkey-server" "valkey-sentinel") + valkey_create_symlink("valkey-server" "valkey-check-rdb") + valkey_create_symlink("valkey-server" "valkey-check-aof") + + # Target valkey-rdma + if (WITH_RDMA) + set(MODULE_NAME "valkey-rdma") + add_library(${MODULE_NAME} SHARED "${VALKEY_RDMA_MODULE_SRCS}") + target_compile_options(${MODULE_NAME} PUBLIC -DBUILD_RDMA_MODULE -DUSE_RDMA=1) + target_link_libraries(${MODULE_NAME} "${RDMA_LIBS}") + # remove the "lib" prefix from the module + set_target_properties(${MODULE_NAME} PROPERTIES PREFIX "") + valkey_install_bin(${MODULE_NAME}) + endif () +endif () diff --git a/src/debug.c b/src/debug.c index 98512fd436..8c4badf758 100644 --- a/src/debug.c +++ b/src/debug.c @@ -1023,6 +1023,7 @@ void debugCommand(client *c) { /* =========================== Crash handling ============================== */ +#ifndef VALKEY_USE_TEST_SERVER_ASSERT __attribute__((noinline)) void _serverAssert(const char *estr, const char *file, int line) { int new_report = bugReportStart(); serverLog(LL_WARNING, "=== %sASSERTION FAILED ===", new_report ? "" : "RECURSIVE "); @@ -1041,6 +1042,7 @@ __attribute__((noinline)) void _serverAssert(const char *estr, const char *file, removeSigSegvHandlers(); bugReportEnd(0, 0); } +#endif /* Checks if the argument at the given index should be redacted from logs. 
 */
int shouldRedactArg(const client *c, int idx) {
diff --git a/src/server.c b/src/server.c
index 531fb07b76..e501cced56 100644
--- a/src/server.c
+++ b/src/server.c
@@ -6720,6 +6720,7 @@ serverTestProc *getTestProcByName(const char *name) {
 }
 #endif
 
+#ifndef VALKEY_USE_TEST_MAIN
 int main(int argc, char **argv) {
     struct timeval tv;
     int j;
@@ -7063,5 +7064,5 @@ int main(int argc, char **argv) {
     aeDeleteEventLoop(server.el);
     return 0;
 }
-
+#endif // VALKEY_USE_TEST_MAIN
 /* The End */
diff --git a/src/unit/CMakeLists.txt b/src/unit/CMakeLists.txt
new file mode 100644
index 0000000000..e62d0389f2
--- /dev/null
+++ b/src/unit/CMakeLists.txt
@@ -0,0 +1,29 @@
+project(valkey-unit-tests)
+
+file(GLOB UNIT_TEST_SRCS "${CMAKE_CURRENT_LIST_DIR}/*.c" "${CMAKE_SOURCE_DIR}/src/server.c")
+set(UNIT_TEST_SRCS "${UNIT_TEST_SRCS}")
+
+get_valkey_server_linker_option(VALKEY_SERVER_LDFLAGS)
+
+add_executable(valkey-unit-tests ${UNIT_TEST_SRCS})
+add_dependencies(valkey-unit-tests generate_test_files_h)
+
+if (USE_TLS)
+    # Add required libraries needed for TLS
+    target_link_libraries(valkey-unit-tests OpenSSL::SSL hiredis_ssl)
+endif ()
+
+if (USE_JEMALLOC)
+    # Using jemalloc
+    target_link_libraries(valkey-unit-tests jemalloc)
+endif ()
+
+# Link these last to ensure that ${VALKEY_SERVER_LDFLAGS} is placed last on the linker line
+target_link_libraries(
+    valkey-unit-tests
+    valkeylib
+    fpconv
+    lualib
+    hdr_histogram
+    hiredis
+    ${VALKEY_SERVER_LDFLAGS})
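
A minimal out-of-source build sketch using the options introduced by this patch
(illustrative only; assumes CMake >= 3.20 and a Makefile-based toolchain):

    mkdir build && cd build
    cmake .. -DCMAKE_BUILD_TYPE=Release -DWITH_TLS=ON
    make -j"$(nproc)"   # binaries land in build/bin (CMAKE_RUNTIME_OUTPUT_DIRECTORY)
    make install        # also creates the redis-* compatibility symlinks

Configuring with -DBUILD_TESTS=ON instead builds the valkey-unit-tests binary
rather than the server, per the BUILD_TESTS branch in src/CMakeLists.txt.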