
Connection leak in informer? #494

Closed
cin opened this issue Sep 4, 2020 · 28 comments

@cin commented Sep 4, 2020

kubernetes-client/javascript version: 0.12.0
node version: v14.9.0

The number of connections to the Kubernetes API seems to double every 10 minutes. We've set up an endpoint informer to monitor when IPs change so we can update a service with external IPs. We use this to discover Cassandra cluster seeds in other regions.

function startEndpointWatcher (labelSelector = '', namespace = POD_NAMESPACE) {
  const informer = k8s.makeInformer(kc, `/api/v1/namespaces/${namespace}/endpoints?labelSelector=${labelSelector}`,
    () => k8sApi.listNamespacedEndpoints(namespace, undefined, undefined, undefined, undefined, labelSelector))

  _.forEach(['add', 'update', 'delete'], event => informer.on(event, updateEndpointsFn))

  informer.on('error', err => {
    log.error('Watcher ERROR event: \n', err, '\nRestarting Watcher after 5 sec...')
    setTimeout(informer.start, 5000)
  })

  informer.start()
    .then(() => log.info('HostIPs-Endpoint-watcher successfully started'))
    .catch(err => log.error('HostIPs-Endpoint-watcher failed to start: \n', err))
}

async function updateEndpointsFn (endpointsObj) {
  try {
    if (!endpointsObj.subsets) return
    const subsets = endpointsObj.subsets[0]
    const namespace = endpointsObj.metadata.namespace

    const getAddressWithHostIP = address =>
      k8sApi.readNamespacedPodStatus(address.hostname, namespace).then(res => {
        address.ip = res.body.status.hostIP
      })
    const addressesUpdater = async addresses => await Promise.all(_.map(addresses, getAddressWithHostIP))
    await Promise.all(_.filter([subsets.addresses, subsets.notReadyAddresses]).map(addressesUpdater))

    const endpointPatch = [{ op: 'replace', path: '/subsets', value: [subsets] }]
    k8sApi.patchNamespacedEndpoints(endpointsObj.metadata.name + HOSTIP_ENDPOINTS_SUFFIX, namespace, endpointPatch,
      undefined, undefined, undefined, undefined, { headers: { 'Content-type': 'application/json-patch+json' } }
    ).then(result => {
      log.info('Successfully patched hostIPs Endpoint: ' + result.body.metadata.name)
      log.debug(result.body)
      hostipsEndpoints = result.body
    }).catch(err => log.error('Endpoints patch error', err))
  } catch (err) { log.error('UNEXPECTED ERROR: \n', err) }
}

I've tried explicitly closing the responses that I can, with no luck, so I'm assuming the leak must be in the listNamespacedEndpoints call or somewhere in the informer.

Here's logging from the pod:

9/4/2020, 6:08:41 PM      INFO  Successfully patched hostIPs Endpoint: cassandra-us-south-hostips (host-ips-service.js:57)
9/4/2020, 6:08:41 PM      INFO  Successfully patched hostIPs Endpoint: cassandra-us-south-hostips (host-ips-service.js:57)
9/4/2020, 6:18:41 PM      INFO  Successfully patched hostIPs Endpoint: cassandra-us-south-hostips (host-ips-service.js:57)
9/4/2020, 6:18:41 PM      INFO  Successfully patched hostIPs Endpoint: cassandra-us-south-hostips (host-ips-service.js:57)
9/4/2020, 6:18:42 PM      INFO  Successfully patched hostIPs Endpoint: cassandra-us-south-hostips (host-ips-service.js:57)
9/4/2020, 6:18:42 PM      INFO  Successfully patched hostIPs Endpoint: cassandra-us-south-hostips (host-ips-service.js:57)
9/4/2020, 6:28:42 PM      INFO  Successfully patched hostIPs Endpoint: cassandra-us-south-hostips (host-ips-service.js:57)
9/4/2020, 6:28:42 PM      INFO  Successfully patched hostIPs Endpoint: cassandra-us-south-hostips (host-ips-service.js:57)
9/4/2020, 6:28:42 PM      INFO  Successfully patched hostIPs Endpoint: cassandra-us-south-hostips (host-ips-service.js:57)
9/4/2020, 6:28:42 PM      INFO  Successfully patched hostIPs Endpoint: cassandra-us-south-hostips (host-ips-service.js:57)
9/4/2020, 6:28:42 PM      INFO  Successfully patched hostIPs Endpoint: cassandra-us-south-hostips (host-ips-service.js:57)
9/4/2020, 6:28:42 PM      INFO  Successfully patched hostIPs Endpoint: cassandra-us-south-hostips (host-ips-service.js:57)
9/4/2020, 6:28:42 PM      INFO  Successfully patched hostIPs Endpoint: cassandra-us-south-hostips (host-ips-service.js:57)
9/4/2020, 6:28:42 PM      INFO  Successfully patched hostIPs Endpoint: cassandra-us-south-hostips (host-ips-service.js:57)

Here's netstat output:

❯ date; k exec cassandra-prober-647dc7c5c7-gvw2k -c prober -- netstat -al | grep kube
Fri Sep  4 14:26:30 EDT 2020
tcp        0      0 cassandra-prober-:40182 kubernetes.default.:443 ESTABLISHED
tcp        0      0 cassandra-prober-:40098 kubernetes.default.:443 ESTABLISHED
tcp        0      0 cassandra-prober-:40154 kubernetes.default.:443 ESTABLISHED
tcp        0      0 cassandra-prober-:40126 kubernetes.default.:443 ESTABLISHED

❯ date; k exec cassandra-prober-647dc7c5c7-gvw2k -c prober -- netstat -al | grep kube
Fri Sep  4 14:30:16 EDT 2020
tcp        0      0 cassandra-prober-:43218 kubernetes.default.:443 ESTABLISHED
tcp        0      0 cassandra-prober-:43420 kubernetes.default.:443 ESTABLISHED
tcp        0      0 cassandra-prober-:43246 kubernetes.default.:443 ESTABLISHED
tcp        0      0 cassandra-prober-:43356 kubernetes.default.:443 ESTABLISHED
tcp        0      0 cassandra-prober-:43274 kubernetes.default.:443 ESTABLISHED
tcp        0      0 cassandra-prober-:43392 kubernetes.default.:443 ESTABLISHED
tcp        0      0 cassandra-prober-:43448 kubernetes.default.:443 ESTABLISHED
tcp        0      0 cassandra-prober-:43304 kubernetes.default.:443 ESTABLISHED

Any ideas as to what may be going on here?

@cin (Author) commented Sep 5, 2020

Here's a self-contained example that demonstrates the leak.

const k8s = require('@kubernetes/client-node')
const util = require('util')
const _ = require('lodash')

const { // initialize from environment variables
  CASSANDRA_ENDPOINT_LABELS = 'app.kubernetes.io/component=database',
  POD_NAMESPACE = 'cassandra'
} = process.env

const kc = new k8s.KubeConfig()
kc.loadFromDefault()
const k8sApi = kc.makeApiClient(k8s.CoreV1Api)

function startEndpointWatcher (labelSelector = '', namespace = POD_NAMESPACE) {
  const informer = k8s.makeInformer(kc, `/api/v1/namespaces/${namespace}/endpoints?labelSelector=${labelSelector}`,
    () => k8sApi.listNamespacedEndpoints(namespace, undefined, undefined, undefined, undefined, labelSelector))

  _.forEach(['add', 'update', 'delete'], event => informer.on(event, updateEndpointsFn))

  informer.on('error', err => {
    console.log('Watcher ERROR event: \n', err, '\nRestarting Watcher after 5 sec...')
    setTimeout(informer.start, 5000)
  })

  informer.start()
    .then(() => console.log('HostIPs-Endpoint-watcher successfully started'))
    .catch(err => console.log('HostIPs-Endpoint-watcher failed to start: \n', err))
}

async function updateEndpointsFn (endpointsObj) {
  console.log('updateEndpointFn: ' + util.inspect(endpointsObj, {depth: null}))
}

startEndpointWatcher(CASSANDRA_ENDPOINT_LABELS)

@brendandburns (Contributor) commented:

I created a PR here:

#505

Any chance you can patch it and see if it fixes things?

Thanks!

@jc-roman (Contributor) commented Oct 16, 2020

> I created a PR here:
>
> #505
>
> Any chance you can patch it and see if it fixes things?
>
> Thanks!

@brendandburns, I spent some time debugging the leak (I also tried #505, to no avail) and discovered that the problem is not a request connection being left open in the watch/informer/cache. I focused on figuring out why the informer doubles its connections and narrowed it down to an issue in the cache implementation: it is not failing to disconnect and then opening one extra connection on top.

The informer properly closes the connection once the k8s API server disconnects; the issue is that, right after, two new connections are created.

My hunch is that two different listeners each recreate a watch once the original closes. I'm working on a proof of concept to confirm this first.

I'll submit a PR once I've confirmed the fix with more tests and, if I can find the time, figured out an accompanying test to add to test_cache.ts, which at the moment doesn't have one.
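
For illustration, here is a minimal sketch, not the library's code, of the suspected mechanism: if two listeners each restart the watch when a connection ends, every expiring connection is replaced by two, so the connection count doubles each cycle. All names here are made up.

const { EventEmitter } = require('events')

let openConnections = 0
const bus = new EventEmitter()

function openWatch () {
  openConnections++
  // Simulate the API server closing the watch after a while.
  setTimeout(() => {
    openConnections--
    bus.emit('done')
  }, 1000)
}

// Two listeners both react to 'done' by re-watching, so each expired
// connection spawns two new ones: 1, 2, 4, 8, ...
bus.on('done', openWatch)
bus.on('done', openWatch)

openWatch()
setInterval(() => console.log('open connections:', openConnections), 1000)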

@DocX (Contributor) commented Oct 17, 2020

Hi. We are probably experiencing the same issue. Version 0.12.0.

We are not receiving any error events, and we are listening for the add, update, and delete events. We are not restarting the informer; informer.start is called only once. This is what happens over time:

[screenshot, 2020-10-17: number of events processed over time, by event type]

This is the memory usage:

[screenshot, 2020-10-17: memory usage over time]

And perhaps useful/interesting is the Node's async resources number:

[screenshot, 2020-10-17: Node async resources count over time]

The drops are when the process is killed by the OOM killer and restarted.

There are no changes in the Kubernetes traffic (in fact, this chart covers a weekend when nothing is happening).

Hope this helps pinpoint the issue.

@jc-roman (Contributor) commented:

Thanks for the extra data. You can clearly see the expected exponential growth: two full new async request objects are created for every one that expires. To make matters worse, when one event is sent by the Kubernetes API, the callback of every one of the ever-growing copies of the watch instance gets executed. You can see in the first chart that even though the total number of events processed grows exponentially, the ratio between the event types (update/add/delete) is maintained.

FYI: the Watch doesn't have this issue. If your use of the informer is simple enough, you can replace it with a Watch and restart it from the done callback. If you're watching single objects, you can save the resourceVersion after every event and use it when you restart the watch.
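
For anyone wanting to try this, here is a minimal sketch of that workaround, assuming the library's Watch API (watch.watch(path, queryParams, onEvent, onDone)). The path, the 5-second delay, and the startWatch/lastResourceVersion names are only illustrative, and handling of a stale resourceVersion (410 Gone) is omitted.

const k8s = require('@kubernetes/client-node')

const kc = new k8s.KubeConfig()
kc.loadFromDefault()
const watch = new k8s.Watch(kc)

let lastResourceVersion // resume point, updated on every event

function startWatch () {
  const params = lastResourceVersion ? { resourceVersion: lastResourceVersion } : {}
  watch.watch('/api/v1/namespaces/cassandra/endpoints', params,
    (type, obj) => {
      // type is ADDED / MODIFIED / DELETED
      lastResourceVersion = obj.metadata.resourceVersion
      console.log(type, obj.metadata.name)
    },
    err => {
      // done callback: the single watch connection ended; restart it
      if (err) console.log('watch error:', err)
      setTimeout(startWatch, 5000)
    }
  ).catch(err => console.log('failed to start watch:', err))
}

startWatch()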

@edqd commented Dec 16, 2020

Yep, exactly the same thing is happening in our app.
[image: RAM usage graph]
Any progress?

@brendandburns (Contributor) commented Dec 18, 2020

@edqd, a couple of things:

  1. This looks like a RAM graph, not network connections. I would not expect a growth in connections to cause that much memory usage (though I could be wrong).

  2. What version are you running? I believed this was fixed in 0.13.0, but I was wrong; it is fixed at HEAD (I hope). Any chance you can try the HEAD version? I will push a new version soon too.

Thanks

@brendandburns (Contributor) commented:

I just pushed 0.13.1, which contains (I hope) the fix that was merged in #555.

Please test that and see if it fixes things for you.

@DocX (Contributor) commented Dec 21, 2020

Hi @brendandburns,

Thanks for releasing the new version, 0.13.1. We deployed the update today, but unfortunately it does not look like it really helped :(

However, it looks like it is a little slower to rise, though we can't tell for sure:

[screenshot, 2020-12-21]

Not sure if we're doing something wrong, though. This is the code we use to register the informer:

const kube_config = new k8s.KubeConfig();
kube_config.loadFromDefault();

const k8sApi = kube_config.makeApiClient(k8s.CoreV1Api);

const listFn = () => k8sApi.listNamespacedPod("scooter-launch");

const informer = k8s.makeInformer(
  kube_config,
  "/api/v1/namespaces/scooter-launch/pods",
  listFn
);

informer.on("add", async (pod: k8s.V1Pod) => {
  statsd.increment("informer.events_by_type", { event_type: "add" });
  await this.handle_pod_change_event(pod);
});

// ... same for "update" and "delete"

informer.on("error", (err: k8s.V1Pod) => {
  setTimeout(() => {
    this.informer.start();
  }, 5000);
});

@brendandburns (Contributor) commented:

@edqd
Your code looks reasonable to me.

It's a little hard to tell, but it doesn't look to me like you are getting any errors?

If you are getting errors, it would be interesting to see if you are getting duplicate errors for some reason.

In either case, I think the informer should be doing the right thing. Do you have monitoring for the number of active TCP connections from your code?

That's the easiest way to see if the problem is fixed or not. If the number of connections is also growing exponentially, then the problem isn't resolved. If the number of connections is constant then it is a memory leak somewhere else.
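
If it helps, one rough way to get that number from inside the process is Node's undocumented process._getActiveHandles() (an internal API, so treat this as a hack rather than anything supported); the interval and the log line are placeholders for whatever metrics sink you use.

const net = require('net')

function countOpenSockets () {
  // _getActiveHandles() is internal/undocumented; it returns the handles
  // (sockets, timers, ...) currently keeping the event loop alive.
  return process._getActiveHandles()
    .filter(h => h instanceof net.Socket && !h.destroyed)
    .length
}

setInterval(() => {
  // e.g. statsd.gauge('informer.open_sockets', countOpenSockets())
  console.log('open sockets:', countOpenSockets())
}, 60 * 1000)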

@brendandburns (Contributor) commented:

So I dug into this a little more and found a bug in the caching: the cache was never emptied when a delete occurred after the connection reconnected.

It will definitely cause a memory leak, especially if you are creating and deleting a lot of pods.

I'll send a PR with the fix.
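
For context, an illustrative sketch (not the library's actual cache code) of what "emptying the cache on delete" means: the informer keeps objects keyed by namespace/name, and a DELETED watch event has to remove the entry, otherwise objects accumulate forever after reconnects.

const cache = new Map()

const key = obj => `${obj.metadata.namespace}/${obj.metadata.name}`

function onWatchEvent (type, obj) {
  if (type === 'ADDED' || type === 'MODIFIED') {
    cache.set(key(obj), obj)
  } else if (type === 'DELETED') {
    // The missing step: without this, deleted objects linger in memory.
    cache.delete(key(obj))
  }
}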

@brendandburns (Contributor) commented:

See #572

@brendandburns (Contributor) commented:

@DocX, if you want to review #572, we can get it merged and into a patch release for you to try.

@DocX (Contributor) commented Dec 21, 2020

@brendandburns Hi, thanks for the quick reply.
If you can create a release, that would be great, because I had trouble installing the package directly from git. Then we can give it another try.

I will also try to see if we can get network connection metrics. Thanks for the tip.

@brendandburns (Contributor) commented:

@DocX I need someone to review the fix, then I can push it into a release. If you're comfortable, have a look at #572; otherwise I can wait for a review from someone else.

@DocX (Contributor) commented Dec 22, 2020

We got some data on the network connections to the k8s API. There is no visible difference:

[screenshot, 2020-12-22: network connections to the k8s API over time]

@brendandburns (Contributor) commented:

@DocX thanks for the additional data.

Sadness that this doesn't seem to be fixed! I'll keep digging and see if I can add better tests...

@brendandburns (Contributor) commented:

@DocX One other thing: this code continuously re-lists from the start instead of from the current resource version. I need to optimize that (it's been a long-standing 'todo').
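
Roughly, the optimization being described looks like this sketch (illustrative only, reusing the k8sApi client and a Watch instance from the snippets above; handling of '410 Gone', where the saved version is too old and a fresh list is required, is glossed over).

let resourceVersion

async function listOnce () {
  const res = await k8sApi.listNamespacedPod('scooter-launch')
  resourceVersion = res.body.metadata.resourceVersion
}

function startWatching () {
  watch.watch('/api/v1/namespaces/scooter-launch/pods',
    { resourceVersion }, // resume point instead of re-listing from scratch
    (type, obj) => { resourceVersion = obj.metadata.resourceVersion },
    () => startWatching() // reconnect from the saved resourceVersion
  ).catch(err => console.log('watch failed:', err))
}

listOnce().then(startWatching)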

@brendandburns (Contributor) commented:

@DocX I just pushed 0.13.2, which contains the cache-clearing fix (and your pre-pack PR too). I'll see about investigating this further.

@XiaocongDong commented:

> Thanks for the extra data. You can clearly see the expected exponential growth: two full new async request objects are created for every one that expires. To make matters worse, when one event is sent by the Kubernetes API, the callback of every one of the ever-growing copies of the watch instance gets executed. You can see in the first chart that even though the total number of events processed grows exponentially, the ratio between the event types (update/add/delete) is maintained.
>
> FYI: the Watch doesn't have this issue. If your use of the informer is simple enough, you can replace it with a Watch and restart it from the done callback. If you're watching single objects, you can save the resourceVersion after every event and use it when you restart the watch.

Hi @jc-roman, I am having the same issue here. Could you provide a working example of using Watch as a replacement for the Informer?

@brendandburns (Contributor) commented:

OK, taking another stab at this (since I can't seem to repro it locally): I sent #575, which explicitly aborts and deletes the connection before initiating a new watch. If someone can patch that in and try it, it would be appreciated.
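
The idea, roughly, as a sketch under the assumption that watch.watch() resolves to a request object with an abort() method, as in the 0.13-era client; none of this is the actual PR code.

let currentRequest = null

async function restartWatch (watch, path, onEvent, onDone) {
  // Tear down any previous connection before opening a new one, so a
  // restart never leaves two watches running in parallel.
  if (currentRequest) {
    currentRequest.abort()
    currentRequest = null
  }
  currentRequest = await watch.watch(path, {}, onEvent, onDone)
}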

@XiaocongDong commented Jan 5, 2021

Hi @brendandburns, I tried this approach (manually aborting or destroying the last active request) before you made this change, but unfortunately it didn't help.

@DocX (Contributor) commented Feb 2, 2021

The latest master with #576 seems to have fixed the leak. See the network and memory usage of our app: the spikes are from before, and after that the current master is deployed:

[screenshot, 2021-02-02: network and memory usage, before and after deploying current master]

dkontorovskyy added a commit to snyk/kubernetes-monitor that referenced this issue Feb 4, 2021
In the latest release of the library, maintainers addressed the issue that might be related to the elevated errors we were seeing.
More info here kubernetes-client/javascript#494
@DocX (Contributor) commented Feb 15, 2021

@brendandburns Hello. What is the timeline for when we can expect a new release with the fix? Thank you :)

@brendandburns (Contributor) commented Feb 16, 2021

@DocX I want to rev the Kubernetes version to 1.20 and then include that in the release. ETA end of the week, hopefully sooner.

Specifically, I want to get #585 merged, then we can cut a release.

@brendandburns (Contributor) commented:

@DocX #585 has merged; I will try to cut a 0.14.0 release today.

@DocX (Contributor) commented Feb 17, 2021

Awesome, thank you. No pressure :)

@brendandburns (Contributor) commented:

I just pushed 0.14.0 to npm. I'm closing this issue now (sorry it took so long to get resolved!)

chen-keinan pushed a commit to snyk/kubernetes-monitor that referenced this issue Apr 26, 2021
In the latest release of the library, maintainers addressed the issue that might be related to the elevated errors we were seeing.
More info here kubernetes-client/javascript#494