Skip to content
This repository has been archived by the owner on May 12, 2021. It is now read-only.

[DNM] agent: Properly stop the gRPC server #448

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
58 changes: 56 additions & 2 deletions agent.go
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,7 @@ type sandbox struct {
enableGrpcTrace bool
sandboxPidNs bool
storages map[string]*sandboxStorage
stopServer chan struct{}
}

var agentFields = logrus.Fields{
Expand Down Expand Up @@ -524,6 +525,44 @@ func (s *sandbox) teardownSharedPidNs() error {
return nil
}

// waitForStopServer blocks until the stopServer channel is signalled and then
// shuts the gRPC server down. It first attempts a graceful stop bounded by a
// one-minute timeout; if that does not complete in time it falls back to
// s.stopGRPC(). If no server is set when the signal arrives it returns
// without doing anything. It is intended to run in its own goroutine.
func (s *sandbox) waitForStopServer() {
fieldLogger := agentLog.WithField("subsystem", "stopserverwatcher")

fieldLogger.Info("Waiting for stopServer signal...")

// Wait for DestroySandbox() to signal this goroutine about the need to
// stop the server. The receive unblocks once the channel is closed.
<-s.stopServer

fieldLogger.Info("stopServer signal received")

if s.server == nil {
fieldLogger.Info("No server initialized, nothing to stop")
return
}

// Logged on every return path below, whether the stop ended up being
// graceful or forced.
defer fieldLogger.Info("gRPC server stopped")

// Try to gracefully stop the server for a minute. GracefulStop() runs in
// its own goroutine so the select below can bound the wait; done is closed
// once it returns.
timeout := time.Minute
done := make(chan struct{})
go func() {
s.server.GracefulStop()
close(done)
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

uhmmm this smells like race condition

gracefulStopGRPC sets s.server to nil and stopGRPC can use it

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes let me do that a little bit better

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

done

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh that’s what that smell was....

}()

select {
case <-done:
// Graceful stop completed within the timeout: drop the server reference.
s.server = nil
return
case <-time.After(timeout):
// Graceful stop is still pending; fall through to the forced stop.
fieldLogger.WithField("timeout", timeout).Warn("Could not gracefully stop the server")
}

fieldLogger.Info("Force stopping the server now")
// NOTE(review): stopGRPC() is defined outside this view; presumably it
// force-stops the server and thereby unblocks the GracefulStop() goroutine
// above — confirm. s.server is read by that goroutine and written in this
// function with no synchronization visible here — verify there is no
// concurrent access.
s.stopGRPC()
}

func (s *sandbox) listenToUdevEvents() {
fieldLogger := agentLog.WithField("subsystem", "udevlistener")

Expand Down Expand Up @@ -810,6 +849,7 @@ func (s *sandbox) startGRPC() {
defer s.wg.Done()

var err error
var servErr error
for {
agentLog.Info("agent grpc server starts")

Expand All @@ -833,15 +873,26 @@ func (s *sandbox) startGRPC() {
}

// l is closed when Serve() returns
err = grpcServer.Serve(l)
if err != nil {
servErr = grpcServer.Serve(l)
if servErr != nil {
agentLog.WithError(err).Warn("agent grpc server quits")
}

err = s.channel.teardown()
if err != nil {
agentLog.WithError(err).Warn("agent grpc channel teardown failed")
}

// Based on the definition of grpc.Serve(), the function
// returns nil in case of a proper stop triggered by either
// grpc.GracefulStop() or grpc.Stop(). Those calls can only
// be issued by the chain of events coming from DestroySandbox
// and explicitly mean the server should not try to listen
// again, as the sandbox is being completely removed.
if servErr == nil {
agentLog.Info("agent grpc server has been explicitly stopped")
return
}
}
}()
}
Expand Down Expand Up @@ -1019,6 +1070,7 @@ func realMain() {
pciDeviceMap: make(map[string]string),
deviceWatchers: make(map[string](chan string)),
storages: make(map[string]*sandboxStorage),
stopServer: make(chan struct{}),
}

if err = s.initLogger(); err != nil {
Expand Down Expand Up @@ -1046,6 +1098,8 @@ func realMain() {
// Start gRPC server.
s.startGRPC()

go s.waitForStopServer()

go s.listenToUdevEvents()

s.wg.Wait()
Expand Down
4 changes: 4 additions & 0 deletions grpc.go
Original file line number Diff line number Diff line change
Expand Up @@ -1341,6 +1341,10 @@ func (a *agentGRPC) DestroySandbox(ctx context.Context, req *pb.DestroySandboxRe
return emptyResp, err
}

// Close stopServer channel to signal the main agent code to stop
// the server once all gRPC calls have completed.
close(a.sandbox.stopServer)

a.sandbox.hostname = ""
a.sandbox.id = ""
a.sandbox.containers = make(map[string]*container)
Expand Down
9 changes: 7 additions & 2 deletions kata-agent.service.in
Original file line number Diff line number Diff line change
@@ -1,3 +1,9 @@
#
# Copyright (c) 2018 Intel Corporation
#
# SPDX-License-Identifier: Apache-2.0
#

[Unit]
Description=Kata Containers Agent
Documentation=https://github.com/kata-containers/agent
Expand All @@ -10,5 +16,4 @@ StandardOutput=tty
Type=simple
ExecStart=@bindir@/@kata-agent@
LimitNOFILE=infinity
ExecStop=/bin/sync ; /usr/bin/systemctl --force poweroff
FailureAction=poweroff
ExecStop=/bin/sync