Skip tracking clients OOM test when I/O threads are enabled (#764)

Fix feedback loop in key eviction with tracking clients when using I/O threads. Current issue: Evicting keys while tracking clients or key space-notification exist creates a feedback loop when using I/O threads: While evicting keys we send tracking async writes to I/O threads, preventing immediate release of tracking clients' COB memory consumption. Before the I/O thread finishes its write, we recheck used_memory, which now includes the tracking clients' COB and thus continue to evict more keys. **Fix:** We will skip the test for now while IO threads are active. We may consider avoiding sending writes in `processPendingWrites` to I/O threads for tracking clients when we are out of memory. --------- Signed-off-by: Uri Yagelnik <uriy@amazon.com> Signed-off-by: Madelyn Olson <madelyneolson@gmail.com> Co-authored-by: Madelyn Olson <madelyneolson@gmail.com>
valkey-io · Sep 3, 2024 · 2e93777 · 2e93777
1 parent 0f551a2
commit 2e93777
Show file tree

Hide file tree

Showing 6 changed files with 64 additions and 52 deletions.
diff --git a/.github/workflows/daily.yml b/.github/workflows/daily.yml
@@ -379,10 +379,10 @@ jobs:
         run: sudo apt-get install tcl8.6 tclx
       - name: test
         if: true && !contains(github.event.inputs.skiptests, 'valkey')
-        run: ./runtest --config io-threads 2 --config events-per-io-thread 0 --accurate --verbose --tags network --dump-logs ${{github.event.inputs.test_args}}
+        run: ./runtest --io-threads --accurate --verbose --tags network --dump-logs ${{github.event.inputs.test_args}}
       - name: cluster tests
         if: true && !contains(github.event.inputs.skiptests, 'cluster')
-        run: ./runtest-cluster --config io-threads 2 --config events-per-io-thread 0 ${{github.event.inputs.cluster_test_args}}
+        run: ./runtest-cluster --io-threads ${{github.event.inputs.cluster_test_args}}
 
   test-ubuntu-reclaim-cache:
     runs-on: ubuntu-latest

diff --git a/tests/support/server.tcl b/tests/support/server.tcl
@@ -241,6 +241,11 @@ proc tags_acceptable {tags err_return} {
         return 0
     }
 
+    if {$::io_threads && [lsearch $tags "io-threads:skip"] >= 0} {
+        set err "Not supported in io-threads mode"
+        return 0
+    }
+
     if {$::tcl_version < 8.6 && [lsearch $tags "ipv6"] >= 0} {
         set err "TCL version is too low and does not support this"
         return 0
@@ -502,6 +507,12 @@ proc start_server {options {code undefined}} {
         dict set config "tls-ca-cert-file" [format "%s/tests/tls/ca.crt" [pwd]]
         dict set config "loglevel" "debug"
     }
+
+    if {$::io_threads} {
+        dict set config "io-threads" 2
+        dict set config "events-per-io-thread" 0
+    }
+
     foreach line $data {
         if {[string length $line] > 0 && [string index $line 0] ne "#"} {
             set elements [split $line " "]

diff --git a/tests/test_helper.tcl b/tests/test_helper.tcl
@@ -41,6 +41,7 @@ set ::traceleaks 0
 set ::valgrind 0
 set ::durable 0
 set ::tls 0
+set ::io_threads 0
 set ::tls_module 0
 set ::stack_logging 0
 set ::verbose 0
@@ -576,6 +577,7 @@ proc print_help_screen {} {
         "--loops <count>    Execute the specified set of tests several times."
         "--wait-server      Wait after server is started (so that you can attach a debugger)."
         "--dump-logs        Dump server log on test failure."
+        "--io-threads       Run tests with IO threads."
         "--tls              Run tests in TLS mode."
         "--tls-module       Run tests in TLS mode with Valkey module."
         "--host <addr>      Run tests against an external host."
@@ -630,6 +632,8 @@ for {set j 0} {$j < [llength $argv]} {incr j} {
         }
     } elseif {$opt eq {--quiet}} {
         set ::quiet 1
+   } elseif {$opt eq {--io-threads}} {
+        set ::io_threads 1
     } elseif {$opt eq {--tls} || $opt eq {--tls-module}} {
         package require tls 1.6
         set ::tls 1

diff --git a/tests/unit/info.tcl b/tests/unit/info.tcl
@@ -308,51 +308,49 @@ start_server {tags {"info" "external:skip"}} {
             assert_equal "count=2" [errorstat ERR]
         }
 
-        # skip the following 2 tests if we are running with io-threads as the eventloop metrics are different in that case.
-        if {[r config get io-threads] eq 0} {
-            test {stats: eventloop metrics} {
-                set info1 [r info stats]
-                set cycle1 [getInfoProperty $info1 eventloop_cycles]
-                set el_sum1 [getInfoProperty $info1 eventloop_duration_sum]
-                set cmd_sum1 [getInfoProperty $info1 eventloop_duration_cmd_sum]
-                assert_morethan $cycle1 0
-                assert_morethan $el_sum1 0
-                assert_morethan $cmd_sum1 0
-                after 110 ;# default hz is 10, wait for a cron tick. 
-                set info2 [r info stats]
-                set cycle2 [getInfoProperty $info2 eventloop_cycles]
-                set el_sum2 [getInfoProperty $info2 eventloop_duration_sum]
-                set cmd_sum2 [getInfoProperty $info2 eventloop_duration_cmd_sum]
-                if {$::verbose} { puts "eventloop metrics cycle1: $cycle1, cycle2: $cycle2" }
-                assert_morethan $cycle2 $cycle1
-                assert_lessthan $cycle2 [expr $cycle1+10] ;# we expect 2 or 3 cycles here, but allow some tolerance
-                if {$::verbose} { puts "eventloop metrics el_sum1: $el_sum1, el_sum2: $el_sum2" }
-                assert_morethan $el_sum2 $el_sum1
-                assert_lessthan $el_sum2 [expr $el_sum1+30000] ;# we expect roughly 100ms here, but allow some tolerance
-                if {$::verbose} { puts "eventloop metrics cmd_sum1: $cmd_sum1, cmd_sum2: $cmd_sum2" }
-                assert_morethan $cmd_sum2 $cmd_sum1
-                assert_lessthan $cmd_sum2 [expr $cmd_sum1+15000] ;# we expect about tens of ms here, but allow some tolerance
-            }
-
-            test {stats: instantaneous metrics} {
-                r config resetstat
-                set retries 0
-                for {set retries 1} {$retries < 4} {incr retries} {
-                    after 1600 ;# hz is 10, wait for 16 cron tick so that sample array is fulfilled
-                    set value [s instantaneous_eventloop_cycles_per_sec]
-                    if {$value > 0} break
-                }
-
-                assert_lessthan $retries 4
-                if {$::verbose} { puts "instantaneous metrics instantaneous_eventloop_cycles_per_sec: $value" }
-                assert_morethan $value 0
-                assert_lessthan $value [expr $retries*15] ;# default hz is 10
-                set value [s instantaneous_eventloop_duration_usec]
-                if {$::verbose} { puts "instantaneous metrics instantaneous_eventloop_duration_usec: $value" }
-                assert_morethan $value 0
-                assert_lessthan $value [expr $retries*22000] ;# default hz is 10, so duration < 1000 / 10, allow some tolerance
+        test {stats: eventloop metrics} {
+            set info1 [r info stats]
+            set cycle1 [getInfoProperty $info1 eventloop_cycles]
+            set el_sum1 [getInfoProperty $info1 eventloop_duration_sum]
+            set cmd_sum1 [getInfoProperty $info1 eventloop_duration_cmd_sum]
+            assert_morethan $cycle1 0
+            assert_morethan $el_sum1 0
+            assert_morethan $cmd_sum1 0
+            after 110 ;# default hz is 10, wait for a cron tick. 
+            set info2 [r info stats]
+            set cycle2 [getInfoProperty $info2 eventloop_cycles]
+            set el_sum2 [getInfoProperty $info2 eventloop_duration_sum]
+            set cmd_sum2 [getInfoProperty $info2 eventloop_duration_cmd_sum]
+            if {$::verbose} { puts "eventloop metrics cycle1: $cycle1, cycle2: $cycle2" }
+            assert_morethan $cycle2 $cycle1
+            assert_lessthan $cycle2 [expr $cycle1+10] ;# we expect 2 or 3 cycles here, but allow some tolerance
+            if {$::verbose} { puts "eventloop metrics el_sum1: $el_sum1, el_sum2: $el_sum2" }
+            assert_morethan $el_sum2 $el_sum1
+            assert_lessthan $el_sum2 [expr $el_sum1+30000] ;# we expect roughly 100ms here, but allow some tolerance
+            if {$::verbose} { puts "eventloop metrics cmd_sum1: $cmd_sum1, cmd_sum2: $cmd_sum2" }
+            assert_morethan $cmd_sum2 $cmd_sum1
+            assert_lessthan $cmd_sum2 [expr $cmd_sum1+15000] ;# we expect about tens of ms here, but allow some tolerance
+        } {} {io-threads:skip} ; # skip with io-threads as the eventloop metrics are different in that case.
+
+        test {stats: instantaneous metrics} {
+            r config resetstat
+            set retries 0
+            for {set retries 1} {$retries < 4} {incr retries} {
+                after 1600 ;# hz is 10, wait for 16 cron tick so that sample array is fulfilled
+                set value [s instantaneous_eventloop_cycles_per_sec]
+                if {$value > 0} break
             }
-        }
+
+            assert_lessthan $retries 4
+            if {$::verbose} { puts "instantaneous metrics instantaneous_eventloop_cycles_per_sec: $value" }
+            assert_morethan $value 0
+            assert_lessthan $value [expr $retries*15] ;# default hz is 10
+            set value [s instantaneous_eventloop_duration_usec]
+            if {$::verbose} { puts "instantaneous metrics instantaneous_eventloop_duration_usec: $value" }
+            assert_morethan $value 0
+            assert_lessthan $value [expr $retries*22000] ;# default hz is 10, so duration < 1000 / 10, allow some tolerance
+        } {} {io-threads:skip} ; # skip with io-threads as the eventloop metrics are different in that case.
+
 
         test {stats: debug metrics} {
             # make sure debug info is hidden

diff --git a/tests/unit/maxmemory.tcl b/tests/unit/maxmemory.tcl
@@ -456,13 +456,15 @@ start_server {tags {"maxmemory external:skip"}} {
     } {4098}
 }
 
-start_server {tags {"maxmemory external:skip"}} {
+# Skip the following test when running with IO threads
+# With IO threads, we asynchronously write to tracking clients.
+# This invalidates the assumption that their output buffers will be free within the same event loop.
+start_server {tags {"maxmemory external:skip io-threads:skip"}} {
     test {client tracking don't cause eviction feedback loop} {
         r config set latency-tracking no
         r config set maxmemory 0
         r config set maxmemory-policy allkeys-lru
         r config set maxmemory-eviction-tenacity 100
-
         # 10 clients listening on tracking messages
         set clients {}
         for {set j 0} {$j < 10} {incr j} {

diff --git a/tests/unit/memefficiency.tcl b/tests/unit/memefficiency.tcl
@@ -404,8 +404,6 @@ run_solo {defrag} {
             r save ;# saving an rdb iterates over all the data / pointers
         } {OK}
 
-        # Skip the following two tests if we are running with IO threads, as the IO threads allocate the command arguments in a different arena. As a result, fragmentation is not as expected.
-        if {[r config get io-threads] eq 0} {
         test "Active defrag pubsub: $type" {
             r flushdb
             r config resetstat
@@ -503,8 +501,7 @@ run_solo {defrag} {
                 $rd_pubsub read
             }
             $rd_pubsub close
-        }
-        } ;# io-threads
+        } {0} {io-threads:skip} ; # skip with io-threads as the threads may allocate the command arguments in a different arena. As a result, fragmentation is not as expected.
 
         if {$type eq "standalone"} { ;# skip in cluster mode
         test "Active defrag big list: $type" {