Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Check for and log information about stale osquery database lock files #2006

Merged
merged 19 commits into from
Dec 30, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
83 changes: 83 additions & 0 deletions pkg/osquery/runtime/osqueryinstance.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ import (
"github.com/osquery/osquery-go/plugin/config"
"github.com/osquery/osquery-go/plugin/distributed"
osquerylogger "github.com/osquery/osquery-go/plugin/logger"
"github.com/shirou/gopsutil/v3/process"
"go.opentelemetry.io/otel/attribute"
"go.opentelemetry.io/otel/trace"

Expand Down Expand Up @@ -249,6 +250,15 @@ func (i *OsqueryInstance) Launch() error {
return fmt.Errorf("could not calculate osquery file paths: %w", err)
}

// Check to see whether lock files exist that could prevent osquery from starting up
if _, err := i.detectStaleDatabaseLock(ctx, paths); err != nil {
// A detection error shouldn't prevent us from creating the process -- log it and move on
i.slogger.Log(ctx, slog.LevelInfo,
"error detecting stale database lock",
"err", err,
)
}

// Populate augeas lenses, if requested
if i.opts.augeasLensFunc != nil {
if err := os.MkdirAll(paths.augeasPath, 0755); err != nil {
Expand Down Expand Up @@ -702,6 +712,79 @@ func calculateOsqueryPaths(rootDirectory string, registrationId string, runId st
return osqueryFilePaths, nil
}

// detectStaleDatabaseLock checks for the presence of a lock file in the given database.
// We have noticed the lock file occasionally hanging around -- we think this can happen
// when osquery doesn't get adequate time to shut down gracefully. The presence of the lock
// file will prevent osquery from starting up entirely. For now, we just detect this stale
// lock file, and log information about it. In the future we will attempt to remediate.
// See: https://github.com/kolide/launcher/issues/2004.
func (i *OsqueryInstance) detectStaleDatabaseLock(ctx context.Context, paths *osqueryFilePaths) (bool, error) {
lockFilePath := filepath.Join(paths.databasePath, "LOCK")

lockFileInfo, err := os.Stat(lockFilePath)
if os.IsNotExist(err) {
// No lock file, nothing to do here
return false, nil
}
if err != nil {
return false, fmt.Errorf("determining whether lock file exists: %w", err)
}

infoToLog := []any{
"lockfile_path", lockFilePath,
"lockfile_modtime", lockFileInfo.ModTime().UTC().String(),
}

defer func() {
i.slogger.Log(ctx, slog.LevelInfo,
"detected stale osquery db lock file",
infoToLog...,
)
}()

// Check to see whether the process holding the file still exists
processes, err := getProcessesHoldingFile(ctx, lockFilePath)
if err != nil {
infoToLog = append(infoToLog, "err", err)
return false, fmt.Errorf("getting process holding file: %w", err)
}

// Grab more info to log from the processes using the lockfile
processStrs := make([]string, len(processes))
for i, p := range processes {
processStrs[i] = processStr(ctx, p)
}
infoToLog = append(infoToLog, "processes", processStrs)

return true, nil
}

func processStr(ctx context.Context, p *process.Process) string {
name := "unknown"
processOwner := "unknown"
runningStatus := "unknown"
cmdline := "unknown"

if gotName, err := p.NameWithContext(ctx); err == nil {
name = gotName
}
if gotUsername, err := p.UsernameWithContext(ctx); err == nil {
processOwner = gotUsername
}
if gotIsRunning, err := p.IsRunningWithContext(ctx); err == nil {
if gotIsRunning {
runningStatus = "running"
} else {
runningStatus = "not running"
}
}
if gotCmdline, err := p.CmdlineWithContext(ctx); err == nil {
cmdline = gotCmdline
}

return fmt.Sprintf("process with name `%s` and PID %d belonging to user `%s` has current status `%s` (%s)", name, p.Pid, processOwner, runningStatus, cmdline)
}

// createOsquerydCommand uses osqueryOptions to return an *exec.Cmd
// which will launch a properly configured osqueryd process.
func (i *OsqueryInstance) createOsquerydCommand(osquerydBinary string, paths *osqueryFilePaths) (*exec.Cmd, error) {
Expand Down
20 changes: 20 additions & 0 deletions pkg/osquery/runtime/osqueryinstance_other_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
//go:build !windows
// +build !windows

package runtime

import (
"syscall"
"testing"

"github.com/stretchr/testify/require"
)

func createLockFile(t *testing.T, fileToLock string) {
lockFile, err := syscall.Open(fileToLock, syscall.O_CREAT|syscall.O_RDONLY, 0600)
require.NoError(t, err)
require.NoError(t, syscall.Flock(lockFile, syscall.LOCK_EX|syscall.LOCK_NB))
t.Cleanup(func() {
syscall.Close(lockFile)
})
}
103 changes: 103 additions & 0 deletions pkg/osquery/runtime/osqueryinstance_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,9 @@ package runtime

import (
"context"
"errors"
"fmt"
"os"
"path/filepath"
"runtime"
"strings"
Expand All @@ -12,8 +14,10 @@ import (
"github.com/kolide/kit/ulid"
"github.com/kolide/launcher/ee/agent/types"
typesMocks "github.com/kolide/launcher/ee/agent/types/mocks"
"github.com/kolide/launcher/pkg/backoff"
"github.com/kolide/launcher/pkg/log/multislogger"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/mock"
"github.com/stretchr/testify/require"
)

Expand Down Expand Up @@ -235,3 +239,102 @@ func Test_healthcheckWithRetries(t *testing.T) {
// No client available, so healthcheck should fail despite retries
require.Error(t, i.healthcheckWithRetries(context.TODO(), 5, 100*time.Millisecond))
}

func TestLaunch(t *testing.T) {
t.Parallel()

logBytes, slogger := setUpTestSlogger()
rootDirectory := testRootDirectory(t)

k := typesMocks.NewKnapsack(t)
k.On("WatchdogEnabled").Return(true)
k.On("WatchdogMemoryLimitMB").Return(150)
k.On("WatchdogUtilizationLimitPercent").Return(20)
k.On("WatchdogDelaySec").Return(120)
k.On("OsqueryFlags").Return([]string{"verbose=true"})
k.On("OsqueryVerbose").Return(true)
k.On("Slogger").Return(slogger)
k.On("RootDirectory").Return(rootDirectory)
k.On("LoggingInterval").Return(1 * time.Second)
k.On("LogMaxBytesPerBatch").Return(500)
k.On("Transport").Return("jsonrpc")
setUpMockStores(t, k)
k.On("ReadEnrollSecret").Return("", nil)
k.On("LatestOsquerydPath", mock.Anything).Return(testOsqueryBinary)
k.On("OsqueryHealthcheckStartupDelay").Return(10 * time.Second)

i := newInstance(types.DefaultRegistrationID, k, mockServiceClient())
go i.Launch()

// Wait for the instance to become healthy
require.NoError(t, backoff.WaitFor(func() error {
// Instance self-reports as healthy
if err := i.Healthy(); err != nil {
return fmt.Errorf("instance not healthy: %w", err)
}

// Confirm instance setup is complete
if i.stats == nil || i.stats.ConnectTime == "" {
return errors.New("no connect time set yet")
}

// Good to go
return nil
}, 30*time.Second, 1*time.Second), fmt.Sprintf("instance not healthy by %s: instance logs:\n\n%s", time.Now().String(), logBytes.String()))

// Now wait for full shutdown
i.BeginShutdown()
shutdownErr := make(chan error)
go func() {
shutdownErr <- i.WaitShutdown()
}()

select {
case err := <-shutdownErr:
require.True(t, errors.Is(err, context.Canceled), fmt.Sprintf("instance logs:\n\n%s", logBytes.String()))
case <-time.After(1 * time.Minute):
t.Error("instance did not shut down within timeout", fmt.Sprintf("instance logs: %s", logBytes.String()))
t.FailNow()
}

k.AssertExpectations(t)
}

func Test_detectStaleDatabaseLock(t *testing.T) {
t.Parallel()

if runtime.GOOS == "windows" && os.Getenv("GITHUB_ACTIONS") == "true" {
t.Skip("Skipping test on GitHub Actions -- test only works locally on Windows")
}

_, slogger := setUpTestSlogger()
rootDirectory := testRootDirectory(t)

k := typesMocks.NewKnapsack(t)
k.On("Slogger").Return(slogger)
k.On("RootDirectory").Return(rootDirectory)
setUpMockStores(t, k)

i := newInstance(types.DefaultRegistrationID, k, mockServiceClient())

// Calculate paths
paths, err := calculateOsqueryPaths(i.knapsack.RootDirectory(), i.registrationId, i.runId, i.opts)
require.NoError(t, err)

// Check for stale database lock -- there shouldn't be one
detected, err := i.detectStaleDatabaseLock(context.TODO(), paths)
require.NoError(t, err)
require.False(t, detected)

// Create a lock file
lockFilePath := filepath.Join(rootDirectory, "osquery.db", "LOCK")
require.NoError(t, os.MkdirAll(filepath.Dir(lockFilePath), 0700)) // drwx
createLockFile(t, lockFilePath)

// Check for a stale database lock again -- now, we should find it
detectedRetry, err := i.detectStaleDatabaseLock(context.TODO(), paths)
require.NoError(t, err)
require.True(t, detectedRetry)

k.AssertExpectations(t)
}
40 changes: 40 additions & 0 deletions pkg/osquery/runtime/osqueryinstance_windows_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,9 @@ package runtime

import (
"strings"
"syscall"
"testing"
"unsafe"

"github.com/kolide/launcher/ee/agent/types"
typesMocks "github.com/kolide/launcher/ee/agent/types/mocks"
Expand Down Expand Up @@ -50,3 +52,41 @@ func TestCreateOsqueryCommandEnvVars(t *testing.T) {

k.AssertExpectations(t)
}

func createLockFile(t *testing.T, fileToLock string) {
lockedFileName, err := syscall.UTF16PtrFromString(fileToLock)
require.NoError(t, err)

handle, err := syscall.CreateFile(
lockedFileName,
syscall.GENERIC_READ,
syscall.FILE_SHARE_READ|syscall.FILE_SHARE_WRITE|syscall.FILE_SHARE_DELETE,
nil,
syscall.CREATE_ALWAYS,
syscall.FILE_ATTRIBUTE_NORMAL,
0,
)
require.NoError(t, err)

t.Cleanup(func() {
syscall.Close(handle)
})

// See: https://learn.microsoft.com/en-us/windows/win32/api/fileapi/nf-fileapi-lockfileex
modkernel32 := syscall.NewLazyDLL("kernel32.dll")
procLockFileEx := modkernel32.NewProc("LockFileEx")
overlapped := &syscall.Overlapped{
HEvent: 0,
}

r1, _, e1 := syscall.SyscallN(
procLockFileEx.Addr(),
uintptr(handle),
uintptr(2), // LOCKFILE_EXCLUSIVE_LOCK
uintptr(0), // Reserved parameter; must be set to zero.
uintptr(1), // nNumberOfBytesToLockLow -- the low-order 32 bits of the length of the byte range to lock.
uintptr(0), // nNumberOfBytesToLockHigh -- the high-order 32 bits of the length of the byte range to lock.
uintptr(unsafe.Pointer(overlapped)),
)
require.False(t, r1 == 0, error(e1))
}
45 changes: 45 additions & 0 deletions pkg/osquery/runtime/runtime_helpers.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,17 @@
package runtime

import (
"context"
"errors"
"fmt"
"os/exec"
"path/filepath"
"strconv"
"strings"
"syscall"

"github.com/kolide/launcher/ee/allowedcmd"
"github.com/shirou/gopsutil/v3/process"
)

func setpgid() *syscall.SysProcAttr {
Expand All @@ -33,3 +40,41 @@ func platformArgs() []string {
func isExitOk(_ error) bool {
return false
}

func getProcessesHoldingFile(ctx context.Context, pathToFile string) ([]*process.Process, error) {
cmd, err := allowedcmd.Lsof(ctx, "-t", pathToFile)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wish we had a better option, but I concur we do not.

if err != nil {
return nil, fmt.Errorf("creating lsof command: %w", err)
}

out, err := cmd.CombinedOutput()
if err != nil {
return nil, fmt.Errorf("running lsof: %w", err)
}

pidStr := strings.TrimSpace(string(out))
if pidStr == "" {
return nil, errors.New("no process found using file via lsof")
}

pidStrs := strings.Split(strings.TrimSpace(string(out)), "\n")
if len(pidStrs) == 0 {
return nil, errors.New("no processes found using file via lsof")
}

processes := make([]*process.Process, 0)
for _, pidStr := range pidStrs {
pid, err := strconv.ParseInt(pidStr, 10, 32)
if err != nil {
return nil, fmt.Errorf("invalid pid %s: %w", pidStr, err)
}

p, err := process.NewProcess(int32(pid))
if err != nil {
return nil, fmt.Errorf("getting process for %d: %w", pid, err)
}
processes = append(processes, p)
}

return processes, nil
}
Loading
Loading