Skip to content

Commit

Permalink
pkg/osquery/runtime improvements, largely around improving test flaki…
Browse files Browse the repository at this point in the history
…ness (#1798)
  • Loading branch information
RebeccaMahany authored Jul 25, 2024
1 parent d26319c commit 38af721
Show file tree
Hide file tree
Showing 5 changed files with 133 additions and 50 deletions.
3 changes: 1 addition & 2 deletions pkg/osquery/runtime/osqueryinstance.go
Original file line number Diff line number Diff line change
Expand Up @@ -596,8 +596,7 @@ func (o *OsqueryInstance) StartOsqueryExtensionManagerServer(name string, socket
<-o.doneCtx.Done()

o.slogger.Log(context.TODO(), slog.LevelDebug,
"exiting errgroup",
"errgroup", "starting extension shutdown",
"starting extension shutdown",
"extension_name", name,
)

Expand Down
27 changes: 18 additions & 9 deletions pkg/osquery/runtime/runner.go
Original file line number Diff line number Diff line change
Expand Up @@ -481,13 +481,6 @@ func (r *Runner) launchOsqueryInstance() error {
span.AddEvent("extension_server_created")
}

if err := o.stats.Connected(o); err != nil {
r.slogger.Log(ctx, slog.LevelWarn,
"could not set connection time for osquery instance history",
"err", err,
)
}

// Now spawn an extension manager for the tables. We need to
// start this one in the background, because the runner.Start
// function needs to return promptly enough for osquery to use
Expand Down Expand Up @@ -517,6 +510,15 @@ func (r *Runner) launchOsqueryInstance() error {
return nil
})

// All done with osquery setup! Mark instance as connected, then proceed
// with setting up remaining errgroups.
if err := o.stats.Connected(o); err != nil {
r.slogger.Log(ctx, slog.LevelWarn,
"could not set connection time for osquery instance history",
"err", err,
)
}

// Health check on interval
o.errgroup.Go(func() error {
defer r.slogger.Log(ctx, slog.LevelInfo,
Expand Down Expand Up @@ -612,9 +614,16 @@ func (r *Runner) launchOsqueryInstance() error {
)

<-o.doneCtx.Done()
if err := os.Remove(paths.pidfilePath); err != nil {
// We do a couple retries -- on Windows, the PID file may still be in use
// and therefore unable to be removed.
if err := backoff.WaitFor(func() error {
if err := os.Remove(paths.pidfilePath); err != nil && !os.IsNotExist(err) {
return fmt.Errorf("removing PID file: %w", err)
}
return nil
}, 5*time.Second, 500*time.Millisecond); err != nil {
r.slogger.Log(ctx, slog.LevelInfo,
"could not remove PID file",
"could not remove PID file, despite retries",
"pid_file", paths.pidfilePath,
"err", err,
)
Expand Down
2 changes: 1 addition & 1 deletion pkg/osquery/runtime/runtime_helpers_windows.go
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ func SocketPath(rootDir string) string {
// launcher and osquery. We would like to be able to run multiple
// launchers.
//
// We could use something based on the laumcher root, but given the
// We could use something based on the launcher root, but given the
// context this runs in a ulid seems simpler.
return fmt.Sprintf(`\\.\pipe\kolide-osquery-%s`, ulid.New())
}
Expand Down
22 changes: 14 additions & 8 deletions pkg/osquery/runtime/runtime_posix_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -78,11 +78,11 @@ func TestOsquerySlowStart(t *testing.T) {
}),
)
go runner.Run()
waitHealthy(t, runner)
waitHealthy(t, runner, &logBytes)

// ensure that we actually had to wait on the socket
require.Contains(t, logBytes.String(), "osquery extension socket not created yet")
require.NoError(t, runner.Shutdown())
waitShutdown(t, runner, &logBytes)
}

// TestExtensionSocketPath tests that the launcher can start osqueryd with a custom extension socket path.
Expand All @@ -94,11 +94,17 @@ func TestExtensionSocketPath(t *testing.T) {
require.NoError(t, err)
defer rmRootDirectory()

var logBytes threadsafebuffer.ThreadSafeBuffer
slogger := slog.New(slog.NewTextHandler(&logBytes, &slog.HandlerOptions{
AddSource: true,
Level: slog.LevelDebug,
}))

k := typesMocks.NewKnapsack(t)
k.On("OsqueryHealthcheckStartupDelay").Return(0 * time.Second).Maybe()
k.On("WatchdogEnabled").Return(false)
k.On("RegisterChangeObserver", mock.Anything, mock.Anything, mock.Anything, mock.Anything, mock.Anything)
k.On("Slogger").Return(multislogger.NewNopLogger())
k.On("Slogger").Return(slogger)
k.On("LatestOsquerydPath", mock.Anything).Return(testOsqueryBinaryDirectory)
store, err := storageci.NewStore(t, multislogger.NewNopLogger(), storage.KatcConfigStore.String())
require.NoError(t, err)
Expand All @@ -113,7 +119,7 @@ func TestExtensionSocketPath(t *testing.T) {
)
go runner.Run()

waitHealthy(t, runner)
waitHealthy(t, runner, &logBytes)

// wait for the launcher-provided extension to register
time.Sleep(2 * time.Second)
Expand All @@ -127,21 +133,21 @@ func TestExtensionSocketPath(t *testing.T) {
assert.Equal(t, int32(0), resp.Status.Code)
assert.Equal(t, "OK", resp.Status.Message)

require.NoError(t, runner.Shutdown())
waitShutdown(t, runner, &logBytes)
}

// TestRestart tests that the launcher can restart the osqueryd process.
// This test causes time outs on windows, so it is only run on non-windows platforms.
// Should investigate why this is the case.
func TestRestart(t *testing.T) {
t.Parallel()
runner, teardown := setupOsqueryInstanceForTests(t)
runner, logBytes, teardown := setupOsqueryInstanceForTests(t)
defer teardown()

previousStats := runner.instance.stats

require.NoError(t, runner.Restart())
waitHealthy(t, runner)
waitHealthy(t, runner, logBytes)

require.NotEmpty(t, runner.instance.stats.StartTime, "start time should be set on latest instance stats after restart")
require.NotEmpty(t, runner.instance.stats.ConnectTime, "connect time should be set on latest instance stats after restart")
Expand All @@ -152,7 +158,7 @@ func TestRestart(t *testing.T) {
previousStats = runner.instance.stats

require.NoError(t, runner.Restart())
waitHealthy(t, runner)
waitHealthy(t, runner, logBytes)

require.NotEmpty(t, runner.instance.stats.StartTime, "start time should be added to latest instance stats after restart")
require.NotEmpty(t, runner.instance.stats.ConnectTime, "connect time should be added to latest instance stats after restart")
Expand Down
Loading

0 comments on commit 38af721

Please sign in to comment.