Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add a file lock to the data directory on startup to prevent multiple agents. #18483

Merged
merged 3 commits into from
May 14, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 50 additions & 0 deletions x-pack/elastic-agent/pkg/agent/application/locker.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
// Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
// or more contributor license agreements. Licensed under the Elastic License;
// you may not use this file except in compliance with the Elastic License.

package application

import (
"fmt"
"os"
"path/filepath"

"github.com/gofrs/flock"
)

const (
	// lockFileName is the base name of the lock file kept in the data directory.
	lockFileName = "agent.lock"
)

var (
	// ErrAppAlreadyRunning is the sentinel error reported when the lock cannot
	// be taken because another elastic-agent is already holding it.
	ErrAppAlreadyRunning = fmt.Errorf("another elastic-agent is already running")
)

// AppLocker wraps a file lock on the agent.lock file inside a directory.
// Create one with NewAppLocker, acquire the lock with TryLock and release it
// with Unlock.
type AppLocker struct {
// lock is the flock handle for the lock file path.
lock *flock.Flock
}

// NewAppLocker returns an AppLocker for the agent.lock file inside dir.
//
// The directory, including any missing parents, is created on a best-effort
// basis with mode 0755. No lock is taken here; call TryLock to acquire it.
func NewAppLocker(dir string) *AppLocker {
	// MkdirAll does nothing when dir already exists, so no prior Stat is
	// needed and there is no gap between checking and creating. The error is
	// intentionally ignored: this constructor has no error return, and a
	// directory that is still unusable is expected to surface as an error
	// from TryLock instead.
	_ = os.MkdirAll(dir, 0755)

	return &AppLocker{
		lock: flock.New(filepath.Join(dir, lockFileName)),
	}
}

// TryLock makes a single attempt to take the file lock.
//
// It returns nil when the lock was acquired, the underlying error when the
// attempt itself failed, and ErrAppAlreadyRunning when the attempt succeeded
// but the lock was not obtained.
func (a *AppLocker) TryLock() error {
	acquired, err := a.lock.TryLock()
	switch {
	case err != nil:
		return err
	case acquired:
		return nil
	default:
		return ErrAppAlreadyRunning
	}
}

// Unlock releases the lock file by delegating to the underlying flock and
// returns whatever error that call reports.
func (a *AppLocker) Unlock() error {
return a.lock.Unlock()
}
29 changes: 29 additions & 0 deletions x-pack/elastic-agent/pkg/agent/application/locker_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
// Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
// or more contributor license agreements. Licensed under the Elastic License;
// you may not use this file except in compliance with the Elastic License.

package application

import (
"io/ioutil"
"os"
"testing"

"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)

func TestAppLocker(t *testing.T) {
tmp, _ := ioutil.TempDir("", "locker")
defer os.RemoveAll(tmp)

locker1 := NewAppLocker(tmp)
locker2 := NewAppLocker(tmp)

require.NoError(t, locker1.TryLock())
assert.Error(t, locker2.TryLock())
require.NoError(t, locker1.Unlock())
require.NoError(t, locker2.TryLock())
assert.Error(t, locker1.TryLock())
require.NoError(t, locker2.Unlock())
}
27 changes: 15 additions & 12 deletions x-pack/elastic-agent/pkg/agent/application/periodic.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,21 +23,24 @@ type periodic struct {
}

func (p *periodic) Start() error {
if err := p.work(); err != nil {
p.log.Debugf("Failed to read configuration, error: %s", err)
}

for {
select {
case <-p.done:
break
case <-time.After(p.period):
}

go func() {
if err := p.work(); err != nil {
p.log.Debugf("Failed to read configuration, error: %s", err)
}
}

for {
select {
case <-p.done:
break
case <-time.After(p.period):
}

if err := p.work(); err != nil {
p.log.Debugf("Failed to read configuration, error: %s", err)
}
}
}()
return nil
}

func (p *periodic) work() error {
Expand Down
7 changes: 7 additions & 0 deletions x-pack/elastic-agent/pkg/agent/cmd/run.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ import (
"github.com/spf13/cobra"

"github.com/elastic/beats/v7/x-pack/elastic-agent/pkg/agent/application"
"github.com/elastic/beats/v7/x-pack/elastic-agent/pkg/agent/application/paths"
"github.com/elastic/beats/v7/x-pack/elastic-agent/pkg/agent/errors"
"github.com/elastic/beats/v7/x-pack/elastic-agent/pkg/cli"
"github.com/elastic/beats/v7/x-pack/elastic-agent/pkg/config"
Expand Down Expand Up @@ -47,6 +48,12 @@ func run(flags *globalFlags, streams *cli.IOStreams) error {
return err
}

locker := application.NewAppLocker(paths.Data())
if err := locker.TryLock(); err != nil {
return err
}
defer locker.Unlock()
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you make sure this is handled even if we are killed? Defer statements are skipped if SIGINT or SIGTERM is received, and that can prevent us from restarting.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have verified that this does get called in all the cases defined below with signals.

signals := make(chan os.Signal, 1)
signal.Notify(signals, syscall.SIGINT, syscall.SIGKILL, syscall.SIGTERM, syscall.SIGQUIT)

<-signals

So the defer does get called. I did find a bug in periodic that was preventing app.Start from returning to catch the signals. I have fixed that in my most recent commit.


app, err := application.New(logger, pathConfigFile)
if err != nil {
return err
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ func NewDownloader(config *artifact.Config) *Downloader {
func (e *Downloader) Download(_ context.Context, programName, version string) (string, error) {
// create a destination directory root/program
destinationDir := filepath.Join(e.config.TargetDirectory, programName)
if err := os.MkdirAll(destinationDir, os.ModeDir); err != nil {
if err := os.MkdirAll(destinationDir, 0755); err != nil {
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a drive-by fix: on macOS, os.ModeDir does not create the directory with the proper permissions, so 0755 must be used.

return "", errors.New(err, "creating directory for downloaded artifact failed", errors.TypeFilesystem, errors.M(errors.MetaKeyPath, destinationDir))
}

Expand Down