Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

pkg/osquery/runtime improvements, largely around improving test flakiness #1798

Merged
merged 9 commits into from
Jul 25, 2024
3 changes: 1 addition & 2 deletions pkg/osquery/runtime/osqueryinstance.go
Original file line number Diff line number Diff line change
Expand Up @@ -596,8 +596,7 @@ func (o *OsqueryInstance) StartOsqueryExtensionManagerServer(name string, socket
<-o.doneCtx.Done()

o.slogger.Log(context.TODO(), slog.LevelDebug,
"exiting errgroup",
"errgroup", "starting extension shutdown",
"starting extension shutdown",
"extension_name", name,
)

Expand Down
27 changes: 18 additions & 9 deletions pkg/osquery/runtime/runner.go
Original file line number Diff line number Diff line change
Expand Up @@ -481,13 +481,6 @@ func (r *Runner) launchOsqueryInstance() error {
span.AddEvent("extension_server_created")
}

if err := o.stats.Connected(o); err != nil {
r.slogger.Log(ctx, slog.LevelWarn,
"could not set connection time for osquery instance history",
"err", err,
)
}

// Now spawn an extension manager for the tables. We need to
// start this one in the background, because the runner.Start
// function needs to return promptly enough for osquery to use
Expand Down Expand Up @@ -517,6 +510,15 @@ func (r *Runner) launchOsqueryInstance() error {
return nil
})

// All done with osquery setup! Mark instance as connected, then proceed
// with setting up remaining errgroups.
if err := o.stats.Connected(o); err != nil {
r.slogger.Log(ctx, slog.LevelWarn,
"could not set connection time for osquery instance history",
"err", err,
)
}

// Health check on interval
o.errgroup.Go(func() error {
defer r.slogger.Log(ctx, slog.LevelInfo,
Expand Down Expand Up @@ -612,9 +614,16 @@ func (r *Runner) launchOsqueryInstance() error {
)

<-o.doneCtx.Done()
if err := os.Remove(paths.pidfilePath); err != nil {
// We do a couple retries -- on Windows, the PID file may still be in use
// and therefore unable to be removed.
if err := backoff.WaitFor(func() error {
if err := os.Remove(paths.pidfilePath); err != nil && !os.IsNotExist(err) {
return fmt.Errorf("removing PID file: %w", err)
}
return nil
}, 5*time.Second, 500*time.Millisecond); err != nil {
r.slogger.Log(ctx, slog.LevelInfo,
"could not remove PID file",
"could not remove PID file, despite retries",
"pid_file", paths.pidfilePath,
"err", err,
)
Expand Down
2 changes: 1 addition & 1 deletion pkg/osquery/runtime/runtime_helpers_windows.go
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ func SocketPath(rootDir string) string {
// launcher and osquery. We would like to be able to run multiple
// launchers.
//
// We could use something based on the laumcher root, but given the
// We could use something based on the launcher root, but given the
// context this runs in a ulid seems simpler.
return fmt.Sprintf(`\\.\pipe\kolide-osquery-%s`, ulid.New())
}
Expand Down
22 changes: 14 additions & 8 deletions pkg/osquery/runtime/runtime_posix_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -78,11 +78,11 @@ func TestOsquerySlowStart(t *testing.T) {
}),
)
go runner.Run()
waitHealthy(t, runner)
waitHealthy(t, runner, &logBytes)

// ensure that we actually had to wait on the socket
require.Contains(t, logBytes.String(), "osquery extension socket not created yet")
require.NoError(t, runner.Shutdown())
waitShutdown(t, runner, &logBytes)
}

// TestExtensionSocketPath tests that the launcher can start osqueryd with a custom extension socket path.
Expand All @@ -94,11 +94,17 @@ func TestExtensionSocketPath(t *testing.T) {
require.NoError(t, err)
defer rmRootDirectory()

var logBytes threadsafebuffer.ThreadSafeBuffer
slogger := slog.New(slog.NewTextHandler(&logBytes, &slog.HandlerOptions{
AddSource: true,
Level: slog.LevelDebug,
}))

k := typesMocks.NewKnapsack(t)
k.On("OsqueryHealthcheckStartupDelay").Return(0 * time.Second).Maybe()
k.On("WatchdogEnabled").Return(false)
k.On("RegisterChangeObserver", mock.Anything, mock.Anything, mock.Anything, mock.Anything, mock.Anything)
k.On("Slogger").Return(multislogger.NewNopLogger())
k.On("Slogger").Return(slogger)
k.On("LatestOsquerydPath", mock.Anything).Return(testOsqueryBinaryDirectory)
store, err := storageci.NewStore(t, multislogger.NewNopLogger(), storage.KatcConfigStore.String())
require.NoError(t, err)
Expand All @@ -113,7 +119,7 @@ func TestExtensionSocketPath(t *testing.T) {
)
go runner.Run()

waitHealthy(t, runner)
waitHealthy(t, runner, &logBytes)

// wait for the launcher-provided extension to register
time.Sleep(2 * time.Second)
Expand All @@ -127,21 +133,21 @@ func TestExtensionSocketPath(t *testing.T) {
assert.Equal(t, int32(0), resp.Status.Code)
assert.Equal(t, "OK", resp.Status.Message)

require.NoError(t, runner.Shutdown())
waitShutdown(t, runner, &logBytes)
}

// TestRestart tests that the launcher can restart the osqueryd process.
// This test causes time outs on windows, so it is only run on non-windows platforms.
// Should investigate why this is the case.
func TestRestart(t *testing.T) {
t.Parallel()
runner, teardown := setupOsqueryInstanceForTests(t)
runner, logBytes, teardown := setupOsqueryInstanceForTests(t)
defer teardown()

previousStats := runner.instance.stats

require.NoError(t, runner.Restart())
waitHealthy(t, runner)
waitHealthy(t, runner, logBytes)

require.NotEmpty(t, runner.instance.stats.StartTime, "start time should be set on latest instance stats after restart")
require.NotEmpty(t, runner.instance.stats.ConnectTime, "connect time should be set on latest instance stats after restart")
Expand All @@ -152,7 +158,7 @@ func TestRestart(t *testing.T) {
previousStats = runner.instance.stats

require.NoError(t, runner.Restart())
waitHealthy(t, runner)
waitHealthy(t, runner, logBytes)

require.NotEmpty(t, runner.instance.stats.StartTime, "start time should be added to latest instance stats after restart")
require.NotEmpty(t, runner.instance.stats.ConnectTime, "connect time should be added to latest instance stats after restart")
Expand Down
Loading
Loading