Skip to content

Commit

Permalink
Skip tracking clients OOM test when I/O threads are enabled (#764)
Browse files Browse the repository at this point in the history
Fix feedback loop in key eviction with tracking clients when using I/O
threads.

Current issue:
Evicting keys while tracking clients or key space-notification exist
creates a feedback loop when using I/O threads:

While evicting keys we send tracking async writes to I/O threads,
preventing immediate release of tracking clients' COB memory
consumption.

Before the I/O thread finishes its write, we recheck used_memory, which
now includes the tracking clients' COB and thus continue to evict more
keys.

**Fix:**
We will skip the test for now while IO threads are active. We may
consider avoiding sending writes in `processPendingWrites` to I/O
threads for tracking clients when we are out of memory.

---------

Signed-off-by: Uri Yagelnik <uriy@amazon.com>
Signed-off-by: Madelyn Olson <madelyneolson@gmail.com>
Co-authored-by: Madelyn Olson <madelyneolson@gmail.com>
  • Loading branch information
uriyage and madolson committed Sep 3, 2024
1 parent 0f551a2 commit 2e93777
Show file tree
Hide file tree
Showing 6 changed files with 64 additions and 52 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/daily.yml
Original file line number Diff line number Diff line change
Expand Up @@ -379,10 +379,10 @@ jobs:
run: sudo apt-get install tcl8.6 tclx
- name: test
if: true && !contains(github.event.inputs.skiptests, 'valkey')
run: ./runtest --config io-threads 2 --config events-per-io-thread 0 --accurate --verbose --tags network --dump-logs ${{github.event.inputs.test_args}}
run: ./runtest --io-threads --accurate --verbose --tags network --dump-logs ${{github.event.inputs.test_args}}
- name: cluster tests
if: true && !contains(github.event.inputs.skiptests, 'cluster')
run: ./runtest-cluster --config io-threads 2 --config events-per-io-thread 0 ${{github.event.inputs.cluster_test_args}}
run: ./runtest-cluster --io-threads ${{github.event.inputs.cluster_test_args}}

test-ubuntu-reclaim-cache:
runs-on: ubuntu-latest
Expand Down
11 changes: 11 additions & 0 deletions tests/support/server.tcl
Original file line number Diff line number Diff line change
Expand Up @@ -241,6 +241,11 @@ proc tags_acceptable {tags err_return} {
return 0
}

if {$::io_threads && [lsearch $tags "io-threads:skip"] >= 0} {
set err "Not supported in io-threads mode"
return 0
}

if {$::tcl_version < 8.6 && [lsearch $tags "ipv6"] >= 0} {
set err "TCL version is too low and does not support this"
return 0
Expand Down Expand Up @@ -502,6 +507,12 @@ proc start_server {options {code undefined}} {
dict set config "tls-ca-cert-file" [format "%s/tests/tls/ca.crt" [pwd]]
dict set config "loglevel" "debug"
}

if {$::io_threads} {
dict set config "io-threads" 2
dict set config "events-per-io-thread" 0
}

foreach line $data {
if {[string length $line] > 0 && [string index $line 0] ne "#"} {
set elements [split $line " "]
Expand Down
4 changes: 4 additions & 0 deletions tests/test_helper.tcl
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ set ::traceleaks 0
set ::valgrind 0
set ::durable 0
set ::tls 0
set ::io_threads 0
set ::tls_module 0
set ::stack_logging 0
set ::verbose 0
Expand Down Expand Up @@ -576,6 +577,7 @@ proc print_help_screen {} {
"--loops <count> Execute the specified set of tests several times."
"--wait-server Wait after server is started (so that you can attach a debugger)."
"--dump-logs Dump server log on test failure."
"--io-threads Run tests with IO threads."
"--tls Run tests in TLS mode."
"--tls-module Run tests in TLS mode with Valkey module."
"--host <addr> Run tests against an external host."
Expand Down Expand Up @@ -630,6 +632,8 @@ for {set j 0} {$j < [llength $argv]} {incr j} {
}
} elseif {$opt eq {--quiet}} {
set ::quiet 1
} elseif {$opt eq {--io-threads}} {
set ::io_threads 1
} elseif {$opt eq {--tls} || $opt eq {--tls-module}} {
package require tls 1.6
set ::tls 1
Expand Down
86 changes: 42 additions & 44 deletions tests/unit/info.tcl
Original file line number Diff line number Diff line change
Expand Up @@ -308,51 +308,49 @@ start_server {tags {"info" "external:skip"}} {
assert_equal "count=2" [errorstat ERR]
}

# skip the following 2 tests if we are running with io-threads as the eventloop metrics are different in that case.
if {[r config get io-threads] eq 0} {
test {stats: eventloop metrics} {
set info1 [r info stats]
set cycle1 [getInfoProperty $info1 eventloop_cycles]
set el_sum1 [getInfoProperty $info1 eventloop_duration_sum]
set cmd_sum1 [getInfoProperty $info1 eventloop_duration_cmd_sum]
assert_morethan $cycle1 0
assert_morethan $el_sum1 0
assert_morethan $cmd_sum1 0
after 110 ;# default hz is 10, wait for a cron tick.
set info2 [r info stats]
set cycle2 [getInfoProperty $info2 eventloop_cycles]
set el_sum2 [getInfoProperty $info2 eventloop_duration_sum]
set cmd_sum2 [getInfoProperty $info2 eventloop_duration_cmd_sum]
if {$::verbose} { puts "eventloop metrics cycle1: $cycle1, cycle2: $cycle2" }
assert_morethan $cycle2 $cycle1
assert_lessthan $cycle2 [expr $cycle1+10] ;# we expect 2 or 3 cycles here, but allow some tolerance
if {$::verbose} { puts "eventloop metrics el_sum1: $el_sum1, el_sum2: $el_sum2" }
assert_morethan $el_sum2 $el_sum1
assert_lessthan $el_sum2 [expr $el_sum1+30000] ;# we expect roughly 100ms here, but allow some tolerance
if {$::verbose} { puts "eventloop metrics cmd_sum1: $cmd_sum1, cmd_sum2: $cmd_sum2" }
assert_morethan $cmd_sum2 $cmd_sum1
assert_lessthan $cmd_sum2 [expr $cmd_sum1+15000] ;# we expect about tens of ms here, but allow some tolerance
}

test {stats: instantaneous metrics} {
r config resetstat
set retries 0
for {set retries 1} {$retries < 4} {incr retries} {
after 1600 ;# hz is 10, wait for 16 cron tick so that sample array is fulfilled
set value [s instantaneous_eventloop_cycles_per_sec]
if {$value > 0} break
}

assert_lessthan $retries 4
if {$::verbose} { puts "instantaneous metrics instantaneous_eventloop_cycles_per_sec: $value" }
assert_morethan $value 0
assert_lessthan $value [expr $retries*15] ;# default hz is 10
set value [s instantaneous_eventloop_duration_usec]
if {$::verbose} { puts "instantaneous metrics instantaneous_eventloop_duration_usec: $value" }
assert_morethan $value 0
assert_lessthan $value [expr $retries*22000] ;# default hz is 10, so duration < 1000 / 10, allow some tolerance
test {stats: eventloop metrics} {
set info1 [r info stats]
set cycle1 [getInfoProperty $info1 eventloop_cycles]
set el_sum1 [getInfoProperty $info1 eventloop_duration_sum]
set cmd_sum1 [getInfoProperty $info1 eventloop_duration_cmd_sum]
assert_morethan $cycle1 0
assert_morethan $el_sum1 0
assert_morethan $cmd_sum1 0
after 110 ;# default hz is 10, wait for a cron tick.
set info2 [r info stats]
set cycle2 [getInfoProperty $info2 eventloop_cycles]
set el_sum2 [getInfoProperty $info2 eventloop_duration_sum]
set cmd_sum2 [getInfoProperty $info2 eventloop_duration_cmd_sum]
if {$::verbose} { puts "eventloop metrics cycle1: $cycle1, cycle2: $cycle2" }
assert_morethan $cycle2 $cycle1
assert_lessthan $cycle2 [expr $cycle1+10] ;# we expect 2 or 3 cycles here, but allow some tolerance
if {$::verbose} { puts "eventloop metrics el_sum1: $el_sum1, el_sum2: $el_sum2" }
assert_morethan $el_sum2 $el_sum1
assert_lessthan $el_sum2 [expr $el_sum1+30000] ;# we expect roughly 100ms here, but allow some tolerance
if {$::verbose} { puts "eventloop metrics cmd_sum1: $cmd_sum1, cmd_sum2: $cmd_sum2" }
assert_morethan $cmd_sum2 $cmd_sum1
assert_lessthan $cmd_sum2 [expr $cmd_sum1+15000] ;# we expect about tens of ms here, but allow some tolerance
} {} {io-threads:skip} ; # skip with io-threads as the eventloop metrics are different in that case.

test {stats: instantaneous metrics} {
r config resetstat
set retries 0
for {set retries 1} {$retries < 4} {incr retries} {
after 1600 ;# hz is 10, wait for 16 cron tick so that sample array is fulfilled
set value [s instantaneous_eventloop_cycles_per_sec]
if {$value > 0} break
}
}

assert_lessthan $retries 4
if {$::verbose} { puts "instantaneous metrics instantaneous_eventloop_cycles_per_sec: $value" }
assert_morethan $value 0
assert_lessthan $value [expr $retries*15] ;# default hz is 10
set value [s instantaneous_eventloop_duration_usec]
if {$::verbose} { puts "instantaneous metrics instantaneous_eventloop_duration_usec: $value" }
assert_morethan $value 0
assert_lessthan $value [expr $retries*22000] ;# default hz is 10, so duration < 1000 / 10, allow some tolerance
} {} {io-threads:skip} ; # skip with io-threads as the eventloop metrics are different in that case.


test {stats: debug metrics} {
# make sure debug info is hidden
Expand Down
6 changes: 4 additions & 2 deletions tests/unit/maxmemory.tcl
Original file line number Diff line number Diff line change
Expand Up @@ -456,13 +456,15 @@ start_server {tags {"maxmemory external:skip"}} {
} {4098}
}

start_server {tags {"maxmemory external:skip"}} {
# Skip the following test when running with IO threads
# With IO threads, we asynchronously write to tracking clients.
# This invalidates the assumption that their output buffers will be free within the same event loop.
start_server {tags {"maxmemory external:skip io-threads:skip"}} {
test {client tracking don't cause eviction feedback loop} {
r config set latency-tracking no
r config set maxmemory 0
r config set maxmemory-policy allkeys-lru
r config set maxmemory-eviction-tenacity 100

# 10 clients listening on tracking messages
set clients {}
for {set j 0} {$j < 10} {incr j} {
Expand Down
5 changes: 1 addition & 4 deletions tests/unit/memefficiency.tcl
Original file line number Diff line number Diff line change
Expand Up @@ -404,8 +404,6 @@ run_solo {defrag} {
r save ;# saving an rdb iterates over all the data / pointers
} {OK}

# Skip the following two tests if we are running with IO threads, as the IO threads allocate the command arguments in a different arena. As a result, fragmentation is not as expected.
if {[r config get io-threads] eq 0} {
test "Active defrag pubsub: $type" {
r flushdb
r config resetstat
Expand Down Expand Up @@ -503,8 +501,7 @@ run_solo {defrag} {
$rd_pubsub read
}
$rd_pubsub close
}
} ;# io-threads
} {0} {io-threads:skip} ; # skip with io-threads as the threads may allocate the command arguments in a different arena. As a result, fragmentation is not as expected.

if {$type eq "standalone"} { ;# skip in cluster mode
test "Active defrag big list: $type" {
Expand Down

0 comments on commit 2e93777

Please sign in to comment.