Cluster: Failed to refresh slots cache. when options.redisOptions.enableOfflineQueue = false #581

Closed
jcstanaway opened this issue Feb 2, 2018 · 20 comments

@jcstanaway

If I pass options to new Cluster with options.redisOptions.enableOfflineQueue = false, ioredis (3.2.2) fails to connect to the cluster. If I remove that specific option, everything works (I went through the options one by one to isolate it).

Below is sample code with the output when options.redisOptions.enableOfflineQueue = false.

Note that I have also set options.enableOfflineQueue = false; it is not entirely clear whether both need to be set. In my use case, I do NOT want commands to be queued when the cluster or the intended node is unavailable.

'use strict';

let Redis = require('ioredis');

function _retryStrategy(times) {
    let retry;

    if (times === 1) {
        retry = 1;  // immediate retry
    } else {
        retry = Math.min((times - 1) * 2000, 20000);
    }
    return retry;
}

const options = {
    // Cluster-level options
    // Override ioredis defaults.
    clusterRetryStrategy   : _retryStrategy,
    enableOfflineQueue     : false,                     // ioredis default is true
    scaleReads             : 'all',                     // ioredis default is 'master'

    // ioredis defaults - explicitly set for clarity.
    enableReadyCheck       : true,
    maxRedirections        : 16,
    retryDelayOnFailover   : 100,
    retryDelayOnClusterDown: 100,
    retryDelayOnTryAgain   : 100,

    // Per Server options
    redisOptions: {
        // Override ioredis defaults.
        enableOfflineQueue           : false,           // ioredis default is true   !!! THIS IS THE PROBLEM !!!
        connectTimeout               : 2000,            // ioredis default is 10000
        autoResendUnfulfilledCommands: false,           // ioredis default is true
        retryStrategy                : _retryStrategy,
        readOnly                     : true,            // ioredis default is false

        // ioredis defaults - explicitly set for clarity.
        family           : 4,
        path             : null,
        keepAlive        : 0,
        noDelay          : true,
        db               : 0,
        dropBufferSupport: false,
        enableReadyCheck : true,
        autoResubscribe  : true,
        lazyConnect      : false,
        keyPrefix        : '',
        stringNumbers    : false
    }
};

let startupNodes = [];
for (let i = 0; i < 3; i++) {
    startupNodes[i] = {
        host: '127.0.0.1',
        port: 6400 + (10*i)
    };
    console.log('Using startupNode %s:%d.', startupNodes[i].host, startupNodes[i].port);
}

let client = new Redis.Cluster(startupNodes, options);

client.on('ready', function() {
    console.log('Redis Cluster is Ready.');
});

client.cluster('info', function (err, clusterInfo) {
    if (err) {
        console.log('Redis Cluster is not yet ready. err=%j', err);
    } else {
        console.log('Redis Cluster Info=%j', clusterInfo);
    }
});
Output:

/usr/src/app# DEBUG=ioredis:* node clustertest.js
Using startupNode 127.0.0.1:6400.
Using startupNode 127.0.0.1:6410.
Using startupNode 127.0.0.1:6420.
  ioredis:cluster status: [empty] -> connecting +0ms
  ioredis:redis status[127.0.0.1:6400]: [empty] -> connecting +3ms
  ioredis:redis status[127.0.0.1:6410]: [empty] -> connecting +2ms
  ioredis:redis status[127.0.0.1:6420]: [empty] -> connecting +0ms
  ioredis:cluster getting slot cache from 127.0.0.1:6410 +1ms
  ioredis:redis status[127.0.0.1:6400]: connecting -> connect +15ms
  ioredis:redis write command[0] -> info() +0ms
  ioredis:redis status[127.0.0.1:6410]: connecting -> connect +2ms
  ioredis:redis write command[0] -> info() +0ms
  ioredis:redis status[127.0.0.1:6420]: connecting -> connect +1ms
  ioredis:redis write command[0] -> info() +0ms
  ioredis:cluster getting slot cache from 127.0.0.1:6420 +3ms
Redis Cluster is not yet ready. err={}
  ioredis:cluster getting slot cache from 127.0.0.1:6400 +2ms
[ioredis] Unhandled error event: Error: Failed to refresh slots cache.
    at tryNode (/usr/src/app/node_modules/ioredis/lib/cluster/index.js:375:19)
    at /usr/src/app/node_modules/ioredis/lib/cluster/index.js:387:9
    at /usr/src/app/node_modules/ioredis/lib/cluster/index.js:591:14
    at run (/usr/src/app/node_modules/ioredis/lib/utils/index.js:153:16)
    at tryCatcher (/usr/src/app/node_modules/bluebird/js/release/util.js:16:23)
    at Promise.errorAdapter [as _rejectionHandler0] (/usr/src/app/node_modules/bluebird/js/release/nodeify.js:35:34)
    at Promise._settlePromise (/usr/src/app/node_modules/bluebird/js/release/promise.js:566:21)
    at Promise._settlePromise0 (/usr/src/app/node_modules/bluebird/js/release/promise.js:614:10)
    at Promise._settlePromises (/usr/src/app/node_modules/bluebird/js/release/promise.js:689:18)
    at Async._drainQueue (/usr/src/app/node_modules/bluebird/js/release/async.js:133:16)
    at Async._drainQueues (/usr/src/app/node_modules/bluebird/js/release/async.js:143:10)
    at Immediate.Async.drainQueues (/usr/src/app/node_modules/bluebird/js/release/async.js:17:14)
    at runCallback (timers.js:789:20)
    at tryOnImmediate (timers.js:751:5)
    at processImmediate [as _immediateCallback] (timers.js:722:5)
  ioredis:redis status[127.0.0.1:6410]: connect -> ready +7ms
  ioredis:redis status[127.0.0.1:6420]: connect -> ready +2ms
  ioredis:redis status[127.0.0.1:6420]: ready -> close +1ms
  ioredis:connection skip reconnecting since the connection is manually closed. +0ms
  ioredis:redis status[127.0.0.1:6420]: close -> end +0ms
  ioredis:redis status[127.0.0.1:6410]: ready -> close +1ms
  ioredis:connection skip reconnecting since the connection is manually closed. +0ms
  ioredis:redis status[127.0.0.1:6410]: close -> end +0ms
  ioredis:redis status[127.0.0.1:6400]: connect -> ready +1ms
  ioredis:redis status[127.0.0.1:6400]: ready -> close +0ms
  ioredis:connection skip reconnecting since the connection is manually closed. +0ms
  ioredis:redis status[127.0.0.1:6400]: close -> end +1ms
  ioredis:cluster status: connecting -> close +0ms
  ioredis:cluster status: close -> reconnecting +1ms
  ioredis:cluster Cluster is disconnected. Retrying after 1ms +2ms
  ioredis:cluster status: reconnecting -> connecting +0ms
  ioredis:redis status[127.0.0.1:6400]: [empty] -> connecting +1ms
  ioredis:redis status[127.0.0.1:6410]: [empty] -> connecting +1ms
  ioredis:redis status[127.0.0.1:6420]: [empty] -> connecting +1ms
  ioredis:cluster getting slot cache from 127.0.0.1:6410 +0ms
  ioredis:redis status[127.0.0.1:6400]: connecting -> connect +3ms
  ioredis:redis write command[0] -> info() +1ms
  ioredis:redis status[127.0.0.1:6410]: connecting -> connect +0ms
  ioredis:redis write command[0] -> info() +0ms
  ioredis:redis status[127.0.0.1:6420]: connecting -> connect +1ms
  ioredis:redis write command[0] -> info() +0ms
  ioredis:cluster getting slot cache from 127.0.0.1:6420 +0ms
  ioredis:cluster getting slot cache from 127.0.0.1:6400 +1ms
[ioredis] Unhandled error event: Error: Failed to refresh slots cache.
    at tryNode (/usr/src/app/node_modules/ioredis/lib/cluster/index.js:375:19)
    at /usr/src/app/node_modules/ioredis/lib/cluster/index.js:387:9
    at /usr/src/app/node_modules/ioredis/lib/cluster/index.js:591:14
    at run (/usr/src/app/node_modules/ioredis/lib/utils/index.js:153:16)
    at tryCatcher (/usr/src/app/node_modules/bluebird/js/release/util.js:16:23)
    at Promise.errorAdapter [as _rejectionHandler0] (/usr/src/app/node_modules/bluebird/js/release/nodeify.js:35:34)
    at Promise._settlePromise (/usr/src/app/node_modules/bluebird/js/release/promise.js:566:21)
    at Promise._settlePromise0 (/usr/src/app/node_modules/bluebird/js/release/promise.js:614:10)
    at Promise._settlePromises (/usr/src/app/node_modules/bluebird/js/release/promise.js:689:18)
    at Async._drainQueue (/usr/src/app/node_modules/bluebird/js/release/async.js:133:16)
    at Async._drainQueues (/usr/src/app/node_modules/bluebird/js/release/async.js:143:10)
    at Immediate.Async.drainQueues (/usr/src/app/node_modules/bluebird/js/release/async.js:17:14)
    at runCallback (timers.js:789:20)
    at tryOnImmediate (timers.js:751:5)
    at processImmediate [as _immediateCallback] (timers.js:722:5)
  ioredis:redis status[127.0.0.1:6410]: connect -> ready +7ms
  ioredis:redis status[127.0.0.1:6410]: ready -> close +1ms
  ioredis:connection skip reconnecting since the connection is manually closed. +1ms
  ioredis:redis status[127.0.0.1:6410]: close -> end +0ms
  ioredis:redis status[127.0.0.1:6400]: connect -> ready +4ms
  ioredis:redis status[127.0.0.1:6420]: connect -> ready +1ms
  ioredis:redis status[127.0.0.1:6420]: ready -> close +0ms
  ioredis:connection skip reconnecting since the connection is manually closed. +0ms
  ioredis:redis status[127.0.0.1:6420]: close -> end +1ms
  ioredis:redis status[127.0.0.1:6400]: ready -> close +0ms
  ioredis:connection skip reconnecting since the connection is manually closed. +0ms
  ioredis:redis status[127.0.0.1:6400]: close -> end +0ms
  ioredis:cluster status: connecting -> close +0ms
  ioredis:cluster status: close -> reconnecting +1ms
^C

If I comment out that one line, everything works:

/usr/src/app# DEBUG=ioredis:* node clustertest.js
Using startupNode 127.0.0.1:6400.
Using startupNode 127.0.0.1:6410.
Using startupNode 127.0.0.1:6420.
  ioredis:cluster status: [empty] -> connecting +0ms
  ioredis:redis status[127.0.0.1:6400]: [empty] -> connecting +3ms
  ioredis:redis status[127.0.0.1:6410]: [empty] -> connecting +3ms
  ioredis:redis status[127.0.0.1:6420]: [empty] -> connecting +0ms
  ioredis:cluster getting slot cache from 127.0.0.1:6420 +2ms
  ioredis:redis queue command[0] -> cluster(slots) +2ms
  ioredis:redis queue command[0] -> cluster(info) +2ms
  ioredis:redis status[127.0.0.1:6400]: connecting -> connect +6ms
  ioredis:redis write command[0] -> info() +1ms
  ioredis:redis status[127.0.0.1:6410]: connecting -> connect +2ms
  ioredis:redis write command[0] -> info() +0ms
  ioredis:redis status[127.0.0.1:6420]: connecting -> connect +0ms
  ioredis:redis write command[0] -> info() +0ms
  ioredis:redis status[127.0.0.1:6410]: connect -> ready +14ms
  ioredis:connection send 1 commands in offline queue +0ms
  ioredis:redis write command[0] -> cluster(info) +1ms
  ioredis:redis status[127.0.0.1:6400]: connect -> ready +1ms
Redis Cluster Info="cluster_state:ok\r\ncluster_slots_assigned:16384\r\ncluster_slots_ok:16384\r\ncluster_slots_pfail:0\r\ncluster_slots_fail:0\r\ncluster_known_nodes:6\r\ncluster_size:3\r\ncluster_current_epoch:3\r\ncluster_my_epoch:2\r\ncluster_stats_messages_ping_sent:7405\r\ncluster_stats_messages_pong_sent:6762\r\ncluster_stats_messages_meet_sent:1\r\ncluster_stats_messages_sent:14168\r\ncluster_stats_messages_ping_received:6758\r\ncluster_stats_messages_pong_received:7406\r\ncluster_stats_messages_meet_received:4\r\ncluster_stats_messages_received:14168\r\n"
  ioredis:redis status[127.0.0.1:6420]: connect -> ready +2ms
  ioredis:connection send 1 commands in offline queue +1ms
  ioredis:redis write command[0] -> cluster(slots) +0ms
  ioredis:redis status[127.0.0.1:6401]: [empty] -> connecting +11ms
  ioredis:redis status[127.0.0.1:6421]: [empty] -> connecting +1ms
  ioredis:redis status[127.0.0.1:6411]: [empty] -> connecting +0ms
  ioredis:cluster status: connecting -> connect +1ms
  ioredis:redis write command[0] -> cluster(info) +1ms
  ioredis:redis status[127.0.0.1:6401]: connecting -> connect +3ms
  ioredis:redis write command[0] -> info() +1ms
  ioredis:redis status[127.0.0.1:6421]: connecting -> connect +1ms
  ioredis:redis write command[0] -> info() +1ms
  ioredis:redis status[127.0.0.1:6411]: connecting -> connect +0ms
  ioredis:redis write command[0] -> info() +0ms
  ioredis:cluster status: connect -> ready +1ms
Redis Cluster is Ready.
  ioredis:redis status[127.0.0.1:6411]: connect -> ready +5ms
  ioredis:connection set the connection to readonly mode +0ms
  ioredis:redis write command[0] -> readonly() +1ms
  ioredis:redis status[127.0.0.1:6401]: connect -> ready +2ms
  ioredis:connection set the connection to readonly mode +2ms
  ioredis:redis write command[0] -> readonly() +0ms
  ioredis:redis status[127.0.0.1:6421]: connect -> ready +1ms
  ioredis:connection set the connection to readonly mode +1ms
  ioredis:redis write command[0] -> readonly() +0ms
^C
@dogabudak commented Feb 14, 2018

Same here. The Redis logs are also printing "Client closed connection" errors.

Any solutions?

stale bot commented Mar 16, 2018

This issue has been automatically marked as stale because it has not had recent activity. It will be closed after 7 days if no further activity occurs, but feel free to re-open a closed issue if needed.

@jcstanaway (Author)

This is clearly a defect. I'd try to root-cause it and submit a PR, but I'm not familiar enough with the code and unfortunately don't have the time to figure it out. While the obvious workaround is to not set that option, it is not at all clear what the implications are, as the option exists at both the cluster level and the individual server level. At a minimum, the documentation needs to be updated to clarify the behavior of these options for a Redis Cluster.

@jcstanaway (Author)

Commenting to keep the issue open.

@jcstanaway (Author)

@luin Any chance this will be addressed?

@jcstanaway (Author)

unstale

@shaharmor (Collaborator)

Hey @ccs018, I took a look at your issue.

The problem is that currently ioredis tries to refresh the slots map before actually connecting to the cluster, which causes it to fail and disconnect from the node before it has managed to connect (because you don't have the offline queue enabled).

From my understanding, redisOptions.enableOfflineQueue is not relevant in cluster mode, because when a Redis node goes offline it is also removed from the cluster nodes list, so no commands will be sent to it while it's offline.

One possible fix is to defer the refreshSlotsCache call until at least one node has connected. @luin?
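The ordering fix proposed above can be sketched as a small gate that defers the slots refresh until the first node reports a connection. This is purely an illustrative sketch, not ioredis internals; ConnectionGate, requestRefresh, and onNodeConnect are hypothetical names.

```javascript
// Hypothetical sketch of the proposed ordering: hold the slots refresh
// until at least one startup node has actually connected.
class ConnectionGate {
  constructor(refreshSlotsCache) {
    this.refreshSlotsCache = refreshSlotsCache; // callback to run once connected
    this.connected = false;
    this.pendingRefresh = false;
  }

  // Called when the cluster wants to refresh the slots map.
  requestRefresh() {
    if (this.connected) {
      this.refreshSlotsCache();
    } else {
      this.pendingRefresh = true; // remember the request; no offline queue needed
    }
  }

  // Called when any startup node emits "connect".
  onNodeConnect() {
    this.connected = true;
    if (this.pendingRefresh) {
      this.pendingRefresh = false;
      this.refreshSlotsCache(); // run the deferred refresh exactly once
    }
  }
}

module.exports = { ConnectionGate };
```

With this ordering, the CLUSTER SLOTS command is only issued against a live connection, so disabling the offline queue no longer causes the refresh to fail before connect.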

@jcstanaway (Author)

@shaharmor , thanks for taking a look at this.

the redisOptions.offlineQueue is not relevant for cluster mode

Thanks for clarifying. The documentation is not clear about which Redis node options apply in cluster mode. In my use case, queuing commands while offline would likely cause problems: multiple clients may update the same keys, and if one client queued an update while its connection was down while another client made a later update, the first client's queued write would incorrectly clobber the other client's update when it reconnected (the updates would be applied out of order).
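The clobbering scenario described above is essentially a lost-update problem. Here is a hedged sketch of one mitigation: a version check that rejects stale writes (in Redis this could be implemented server-side, e.g. with a Lua script or WATCH/MULTI). The store Map and applyVersionedWrite are illustrative names, not ioredis APIs.

```javascript
// Illustrative lost-update guard: a write must carry the version it was
// based on; if another client has written in the meantime, the stale
// (e.g. queued-while-offline) write is rejected instead of clobbering.
function applyVersionedWrite(store, key, value, expectedVersion) {
  const entry = store.get(key) || { value: undefined, version: 0 };
  if (entry.version !== expectedVersion) {
    return false; // stale update from a disconnected client; reject it
  }
  store.set(key, { value, version: entry.version + 1 });
  return true;
}

module.exports = { applyVersionedWrite };
```

The same check expressed as a Redis-side script would make the offline queue safe to use, but simply disabling the queue, as the issue author does, avoids the problem entirely.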

when the redis node will go offline it will also be removed from the cluster nodes list

I believe this is incorrect behavior. A node going offline is typically a temporary event, and ioredis should not unilaterally remove the node from the cluster nodes list. It should query the cluster for the topology, since when the node recovers ioredis does not proactively add it back to the cluster nodes list.

Consider a rolling upgrade scenario. As each node is taken down to be upgraded, ioredis is removing the node from the cluster nodes list. Eventually, there will be none left. Restarting clients is not an acceptable recovery when one is trying to design a zero-outage system - which includes upgrades.

@jcstanaway (Author)

Just saw the PR - looks like this will fix a couple of similar issues. Though I still don't believe that when a node goes offline it should be removed from the cluster nodes list. Just mark it as offline and periodically attempt to reconnect.
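The "mark it as offline and reconnect" idea could look roughly like this. It is purely illustrative (per this thread, ioredis does not behave this way), and all names are hypothetical.

```javascript
// Illustrative alternative to deleting unreachable nodes: keep the entry in
// the nodes map and flag it offline, so a periodic reconnect loop can still
// find and retry it when the node recovers (e.g. after a rolling upgrade).
function markNodeOffline(nodes, key) {
  const entry = nodes.get(key);
  if (entry) entry.online = false; // keep the entry instead of removing it
}

function offlineNodes(nodes) {
  // Candidates for the periodic reconnect attempt.
  return [...nodes.keys()].filter((key) => !nodes.get(key).online);
}

module.exports = { markNodeOffline, offlineNodes };
```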

@shaharmor (Collaborator)

The problem with that is that a cluster can sometimes have nodes that will never come back, for example with auto-scaling, where a machine comes up and, once it goes down, never returns; ioredis would then effectively leak nodes.

I agree that there are fixes that should be done in the way we handle cluster nodes. I have talked with @luin and we will take a look at it next week. Sorry for the delay.

The "moved" event will cause ioredis to re-fetch the nodes list, and with my PR it will now also do so periodically, but I agree that we should probably trigger it on a -node event as well.

@jcstanaway (Author)

Again, thanks for digging into this. I have some rather ugly code that periodically performs a CLUSTER NODES and compares the count with cluster.nodes; if the counts differ, I destroy and recreate the ioredis client. Not the cleanest or most efficient approach. Similarly, to work around the issue captured here where the client never connects, I simply have a timeout to guard against this condition and again destroy and recreate the ioredis client. Looking forward to ripping that code out and relying completely on ioredis to maintain and automatically recover connections.
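The polling workaround described above boils down to comparing the node count reported by CLUSTER NODES against the client's local view. A minimal sketch of that comparison, assuming the raw CLUSTER NODES text (one line per node) is fetched elsewhere; function names are illustrative:

```javascript
// Count nodes in raw CLUSTER NODES output (one line per known node,
// per the Redis CLUSTER NODES reply format; blank lines are ignored).
function clusterNodeCount(clusterNodesOutput) {
  return clusterNodesOutput
    .split('\n')
    .map((line) => line.trim())
    .filter((line) => line.length > 0)
    .length;
}

// True when the server-reported topology no longer matches the number of
// nodes the client knows about - the trigger for recreating the client.
function topologyDrifted(clusterNodesOutput, knownNodeCount) {
  return clusterNodeCount(clusterNodesOutput) !== knownNodeCount;
}

module.exports = { clusterNodeCount, topologyDrifted };
```

In the actual workaround, the raw text would come from something like client.cluster('nodes') on a timer, and a drift would tear down and rebuild the Cluster instance.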

@shaharmor mentioned this issue Jun 23, 2018
heri16 commented Jul 7, 2018

Any real workarounds?

luin (Collaborator) commented Jul 7, 2018

@heri16 v4.0.0-2 adds a fix for this that will refresh the slots periodically.
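For reference, a hedged sketch of opting into the periodic refresh in v4. It assumes the slotsRefreshTimeout and slotsRefreshInterval Cluster options as documented for ioredis v4; check your version's README for the exact option names and defaults, as the values below are illustrative.

```javascript
// Assumes ioredis v4+; slotsRefreshTimeout / slotsRefreshInterval are
// Cluster options documented in the v4 README (verify for your version).
const Redis = require('ioredis');

const startupNodes = [{ host: '127.0.0.1', port: 6400 }];

const cluster = new Redis.Cluster(startupNodes, {
  slotsRefreshTimeout: 1000,  // ms to wait for a CLUSTER SLOTS reply
  slotsRefreshInterval: 5000, // re-fetch the slots map periodically
});
```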

stale bot closed this as completed Aug 13, 2018
@yunnysunny
I use ioredis 4.14.1 and Node 12.13.0.
The ClusterAllFailedError: Failed to refresh slots cache. error occurred in only one of my clusters; the others worked fine.
After I restarted the affected app, it recovered from the error.

@danielkv
+1

@winningjai
+1
