Skip to content

Commit

Permalink
Debugging and extension timeout (#417)
Browse files Browse the repository at this point in the history
This commit:

* Adds an extensions_timeout to the osquery invocation
* Adds stacktraces to the top level fatal error reporting
* Adds retry logic in several places

Why?

* We occasionally see issues with osquery failing to start due to an extension timeout. This has been rare, and hard to debug. This adds retry logic and improves debugging
  • Loading branch information
directionless authored Feb 7, 2019
1 parent 8de4cba commit fa45344
Show file tree
Hide file tree
Showing 11 changed files with 169 additions and 16 deletions.
3 changes: 2 additions & 1 deletion cmd/grpc.ext/grpc.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import (
"context"
"crypto/x509"
"flag"
"fmt"
"path/filepath"
"time"

Expand Down Expand Up @@ -38,7 +39,7 @@ func main() {

client, err := osquery.NewClient(*flSocketPath, timeout)
if err != nil {
logutil.Fatal(logger, "err", err, "creating osquery extension client")
logutil.Fatal(logger, "err", err, "creating osquery extension client", "stack", fmt.Sprintf("%+v", err))
}

var (
Expand Down
7 changes: 4 additions & 3 deletions cmd/launcher.ext/launcher-extension.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ package main

import (
"flag"
"fmt"
"time"

"github.com/kolide/kit/logutil"
Expand Down Expand Up @@ -31,12 +32,12 @@ func main() {
osquery.ServerTimeout(timeout),
)
if err != nil {
logutil.Fatal(logger, "err", err, "msg", "creating osquery extension server")
logutil.Fatal(logger, "err", err, "msg", "creating osquery extension server", "stack", fmt.Sprintf("%+v", err))
}

client, err := osquery.NewClient(*flSocketPath, timeout)
if err != nil {
logutil.Fatal(logger, "err", err, "creating osquery extension client")
logutil.Fatal(logger, "err", err, "creating osquery extension client", "stack", fmt.Sprintf("%+v", err))
}

var plugins []osquery.OsqueryPlugin
Expand All @@ -46,6 +47,6 @@ func main() {
server.RegisterPlugin(plugins...)

if err := server.Run(); err != nil {
logutil.Fatal(logger, "err", err)
logutil.Fatal(logger, "err", err, "stack", fmt.Sprintf("%+v", err))
}
}
5 changes: 3 additions & 2 deletions cmd/launcher/extension.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ package main
import (
"bytes"
"context"
"fmt"
"io/ioutil"

"github.com/boltdb/bolt"
Expand Down Expand Up @@ -110,12 +111,12 @@ func createExtensionRuntime(ctx context.Context, rootDirectory string, db *bolt.
return nil
},
Interrupt: func(err error) {
level.Info(logger).Log("msg", "extension interrupted", "err", err)
level.Info(logger).Log("msg", "extension interrupted", "err", err, "stack", fmt.Sprintf("%+v", err))
grpcConn.Close()
ext.Shutdown()
if runner != nil {
if err := runner.Shutdown(); err != nil {
level.Info(logger).Log("msg", "error shutting down runtime", "err", err)
level.Info(logger).Log("msg", "error shutting down runtime", "err", err, "stack", fmt.Sprintf("%+v", err))
}
}
},
Expand Down
2 changes: 1 addition & 1 deletion cmd/launcher/launcher.go
Original file line number Diff line number Diff line change
Expand Up @@ -340,7 +340,7 @@ func runLauncher(ctx context.Context, cancel func(), opts *options, logger log.L
return nil
}
}, func(err error) {
level.Info(logger).Log("msg", "interrupted", "err", err)
level.Info(logger).Log("msg", "interrupted", "err", err, "stack", fmt.Sprintf("%+v", err))
cancel()
close(sig)
})
Expand Down
3 changes: 2 additions & 1 deletion cmd/launcher/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ package main

import (
"context"
"fmt"
"os"

"github.com/go-kit/kit/log"
Expand Down Expand Up @@ -50,7 +51,7 @@ func main() {
defer cancel()

if err := runLauncher(ctx, cancel, opts, logger); err != nil {
logutil.Fatal(logger, err, "run launcher")
logutil.Fatal(logger, err, "run launcher", "stack", fmt.Sprintf("%+v", err))
}
}

Expand Down
4 changes: 3 additions & 1 deletion cmd/launcher/updater.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ package main

import (
"context"
"fmt"
"net/http"
"os"
"syscall"
Expand Down Expand Up @@ -71,7 +72,7 @@ func createUpdater(
return nil
},
Interrupt: func(err error) {
level.Info(logger).Log("msg", "updater interrupted", "err", err)
level.Info(logger).Log("msg", "updater interrupted", "err", err, "stack", fmt.Sprintf("%+v", err))
if stop != nil {
stop()
}
Expand All @@ -85,6 +86,7 @@ func launcherFinalizer(logger log.Logger, shutdownOsquery func() error) func() e
level.Info(logger).Log(
"method", "launcherFinalizer",
"err", err,
"stack", fmt.Sprintf("%+v", err),
)
}
// replace launcher
Expand Down
63 changes: 63 additions & 0 deletions pkg/backoff/backoff.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
package backoff

import (
"time"

"github.com/pkg/errors"
)

type Opt func(*Backoff)

func MaxAttempts(i int) Opt {
return func(b *Backoff) {
b.maxAttempt = i
}
}

// Backoff is a quick retry function
type Backoff struct {
count int
maxAttempt int
delay float32
runFunc func() error
}

// New returns a Backoff timer
func New(opts ...Opt) *Backoff {
b := &Backoff{
count: 0,
delay: 1,
maxAttempt: 20,
}

for _, opt := range opts {
opt(b)
}

return b
}

// Run trys to run function several times until it succeeds or times out.
func (b *Backoff) Run(runFunc func() error) error {
b.runFunc = runFunc
return b.try()
}

func (b *Backoff) try() error {
b.count += 1
err := b.runFunc()
if err != nil {
if b.count >= b.maxAttempt {
return errors.Wrap(err, "done trying")
}

// Wait for amount of time
timer := time.NewTimer(time.Second * time.Duration(b.delay))
<-timer.C

return b.try()
}

// err == nil, SUCCESS!
return nil
}
77 changes: 77 additions & 0 deletions pkg/backoff/backoff_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
package backoff

import (
"testing"

"github.com/pkg/errors"
"github.com/stretchr/testify/require"
)

func TestImmediate(t *testing.T) {
t.Parallel()

bkoff := New()
bkoff.delay = 0.0

err := bkoff.Run(willSucceed)
require.NoError(t, err)
require.Equal(t, 1, bkoff.count)
}

func TestEventual(t *testing.T) {
t.Parallel()
bkoff := New()
bkoff.delay = 0.0

tTime := &takesTime{
attempts: 0,
}

err := bkoff.Run(tTime.eventual)
require.NoError(t, err)
require.Equal(t, 5, bkoff.count)
require.Equal(t, 5, tTime.attempts)

}

func TestSlowFail(t *testing.T) {
t.Parallel()
bkoff := New()
bkoff.delay = 0.0

err := bkoff.Run(willFail)
require.Error(t, err)
require.Equal(t, 20, bkoff.count)
}

func TestMaxAttempts(t *testing.T) {
t.Parallel()
bkoff := New(MaxAttempts(5))
bkoff.delay = 0.0

err := bkoff.Run(willFail)
require.Error(t, err)
require.Equal(t, 5, bkoff.count)
}

func willSucceed() error {
return nil
}

func willFail() error {
return errors.New("nope")
}

type takesTime struct {
attempts int
}

func (t *takesTime) eventual() error {
t.attempts += 1

if t.attempts >= 5 {
return nil
}

return errors.New("not yet")
}
12 changes: 8 additions & 4 deletions pkg/osquery/extension.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ import (
"github.com/go-kit/kit/log/level"
"github.com/google/uuid"
"github.com/kolide/kit/version"
"github.com/kolide/launcher/pkg/backoff"
"github.com/kolide/launcher/pkg/service"
"github.com/kolide/osquery-go/plugin/distributed"
"github.com/kolide/osquery-go/plugin/logger"
Expand Down Expand Up @@ -320,10 +321,13 @@ func (e *Extension) Enroll(ctx context.Context) (string, bool, error) {
return "", true, errors.Wrap(err, "generating UUID")
}

enrollDetails, err := getEnrollDetails(e.osqueryClient)
if err != nil {
return "", true, errors.Wrap(err, "query enrollment details")
}
// We've seen this fail, so add some retry logic.
var enrollDetails service.EnrollmentDetails
backoff := backoff.New(backoff.MaxAttempts(5))
backoff.Run(func() error {
enrollDetails, err = getEnrollDetails(e.osqueryClient)
return err
})

// If no cached node key, enroll for new node key
keyString, invalid, err := e.serviceClient.RequestEnrollment(ctx, e.Opts.EnrollSecret, identifier, enrollDetails)
Expand Down
2 changes: 1 addition & 1 deletion pkg/osquery/platform.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,6 @@ func DetectPlatform() (OsqueryPlatform, error) {
case "linux":
return Linux, nil
default:
return Unknown, errors.New("unrecognized runtime.GOOS: " + runtime.GOOS)
return Unknown, errors.Errorf("unrecognized runtime.GOOS: %s", runtime.GOOS)
}
}
7 changes: 5 additions & 2 deletions pkg/osquery/runtime/runtime.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ import (
"golang.org/x/sync/errgroup"
"golang.org/x/time/rate"

"github.com/kolide/launcher/pkg/backoff"
"github.com/kolide/launcher/pkg/osquery/table"
)

Expand Down Expand Up @@ -130,6 +131,7 @@ func createOsquerydCommand(osquerydBinary string, paths *osqueryFilePaths, confi
fmt.Sprintf("--database_path=%s", paths.databasePath),
fmt.Sprintf("--extensions_socket=%s", paths.extensionSocketPath),
fmt.Sprintf("--extensions_autoload=%s", paths.extensionAutoloadPath),
"--extensions_timeout=10",
fmt.Sprintf("--config_plugin=%s", configPlugin),
fmt.Sprintf("--logger_plugin=%s", loggerPlugin),
fmt.Sprintf("--distributed_plugin=%s", distributedPlugin),
Expand Down Expand Up @@ -553,8 +555,9 @@ func (r *Runner) launchOsqueryInstance() error {

// Launch the extension manager server asynchronously.
o.errgroup.Go(func() error {
time.Sleep(1 * time.Second)
if err := o.extensionManagerServer.Start(); err != nil {
// We see the extention manager being slow to start. Implement a simple re-try routine
backoff := backoff.New()
if err := backoff.Run(o.extensionManagerServer.Start); err != nil {
return errors.Wrap(err, "running extension server")
}
return errors.New("extension manager server exited")
Expand Down

0 comments on commit fa45344

Please sign in to comment.