Skip to content

Commit

Permalink
backport of commit d626fed
Browse files Browse the repository at this point in the history
  • Loading branch information
elprans committed Mar 27, 2023
1 parent 8bf0677 commit f2d722f
Show file tree
Hide file tree
Showing 136 changed files with 1,305 additions and 1,373 deletions.
3 changes: 0 additions & 3 deletions .changelog/16099.txt

This file was deleted.

3 changes: 0 additions & 3 deletions .changelog/16221.txt

This file was deleted.

3 changes: 0 additions & 3 deletions .changelog/16243.txt

This file was deleted.

3 changes: 0 additions & 3 deletions .changelog/16463.txt

This file was deleted.

3 changes: 3 additions & 0 deletions .changelog/16583.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
```release-note:bug
server: Added verification of cron jobs already running before forcing new evals right after leader change
```
3 changes: 3 additions & 0 deletions .changelog/16609.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
```release-note:bug
scheduler: Fix reconciliation of reconnecting allocs when the replacement allocations are not running
```
3 changes: 3 additions & 0 deletions .changelog/16672.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
```release-note:improvement
fingerprint/cpu: correctly fingerprint P/E cores of Apple Silicon chips
```
2 changes: 1 addition & 1 deletion .release/ci.hcl
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ event "prepare" {
}

notification {
on = "always"
on = "fail"
}
}

Expand Down
182 changes: 0 additions & 182 deletions CHANGELOG.md

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion api/agent.go
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,7 @@ func (a *Agent) Region() (string, error) {

// Join is used to instruct a server node to join another server
// via the gossip protocol. Multiple addresses may be specified.
// We attempt to join all the hosts in the list. Returns the
// We attempt to join all of the hosts in the list. Returns the
// number of nodes successfully joined and any error. If one or
// more nodes have a successful result, no error is returned.
func (a *Agent) Join(addrs ...string) (int, error) {
Expand Down
40 changes: 1 addition & 39 deletions api/agent_test.go
Original file line number Diff line number Diff line change
@@ -1,15 +1,13 @@
package api

import (
"fmt"
"sort"
"strings"
"testing"
"time"

"github.com/hashicorp/nomad/api/internal/testutil"
"github.com/shoenig/test/must"
"github.com/shoenig/test/wait"
)

func TestAgent_Self(t *testing.T) {
Expand Down Expand Up @@ -112,43 +110,7 @@ func TestAgent_ForceLeave(t *testing.T) {
err := a.ForceLeave("nope")
must.NoError(t, err)

// Force-leave on an existing node
_, s2 := makeClient(t, nil, func(c *testutil.TestServerConfig) {
c.Server.BootstrapExpect = 0
})
defer s2.Stop()
// Create a new node to join
n, err := a.Join(s2.SerfAddr)
must.NoError(t, err)
must.One(t, n)

membersBefore, err := a.MembersOpts(&QueryOptions{})
must.Eq(t, membersBefore.Members[1].Status, "alive")

err = a.ForceLeave(membersBefore.Members[1].Name)
must.NoError(t, err)

time.Sleep(3 * time.Second)

f := func() error {
membersAfter, err := a.MembersOpts(&QueryOptions{})
if err != nil {
return err
}
for _, node := range membersAfter.Members {
if node.Name == membersBefore.Members[1].Name {
if node.Status != "leaving" {
return fmt.Errorf("node did not leave")
}
}
}
return nil
}
must.Wait(t, wait.InitialSuccess(
wait.ErrorFunc(f),
wait.Timeout(3*time.Second),
wait.Gap(100*time.Millisecond),
))
// TODO: test force-leave on an existing node
}

func (a *AgentMember) String() string {
Expand Down
2 changes: 1 addition & 1 deletion api/go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ require (
github.com/hashicorp/go-rootcerts v1.0.2
github.com/mitchellh/go-testing-interface v1.14.1
github.com/mitchellh/mapstructure v1.5.0
github.com/shoenig/test v0.6.2
github.com/shoenig/test v0.6.3
golang.org/x/exp v0.0.0-20230108222341-4b8118a2686a
)

Expand Down
4 changes: 2 additions & 2 deletions api/go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,8 @@ github.com/mitchellh/mapstructure v1.5.0 h1:jeMsZIYE/09sWLaz43PL7Gy6RuMjD2eJVyua
github.com/mitchellh/mapstructure v1.5.0/go.mod h1:bFUtVrKA4DC2yAKiSyO/QUcy7e+RRV2QTWOzhPopBRo=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/shoenig/test v0.6.2 h1:tdq+WGnznwE5xcOMXkqqXuudK75RkSGBazBGcP1lX6w=
github.com/shoenig/test v0.6.2/go.mod h1:byHiCGXqrVaflBLAMq/srcZIHynQPQgeyvkvXnjqq0k=
github.com/shoenig/test v0.6.3 h1:GVXWJFk9PiOjN0KoJ7VrJGH6uLPnqxR7/fe3HUPfE0c=
github.com/shoenig/test v0.6.3/go.mod h1:byHiCGXqrVaflBLAMq/srcZIHynQPQgeyvkvXnjqq0k=
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw=
github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo=
Expand Down
2 changes: 1 addition & 1 deletion api/nodes.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ func (c *Client) Nodes() *Nodes {
return &Nodes{client: c}
}

// List is used to list out all the nodes
// List is used to list out all of the nodes
func (n *Nodes) List(q *QueryOptions) ([]*NodeListStub, *QueryMeta, error) {
var resp NodeIndexSort
qm, err := n.client.query("/v1/nodes", &resp, q)
Expand Down
20 changes: 0 additions & 20 deletions ci/skip_non_root.go

This file was deleted.

1 change: 0 additions & 1 deletion client/allocrunner/taskrunner/artifact_hook_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -160,7 +160,6 @@ func TestTaskRunner_ArtifactHook_PartialDone(t *testing.T) {
// TestTaskRunner_ArtifactHook_ConcurrentDownloadSuccess asserts that the artifact hook
// download multiple files concurrently. this is a successful test without any errors.
func TestTaskRunner_ArtifactHook_ConcurrentDownloadSuccess(t *testing.T) {
ci.SkipTestWithoutRootAccess(t)
t.Parallel()

me := &mockEmitter{}
Expand Down
1 change: 0 additions & 1 deletion client/allocrunner/taskrunner/task_runner_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1729,7 +1729,6 @@ func TestTaskRunner_DeriveToken_Unrecoverable(t *testing.T) {
// TestTaskRunner_Download_RawExec asserts that downloaded artifacts may be
// executed in a driver without filesystem isolation.
func TestTaskRunner_Download_RawExec(t *testing.T) {
ci.SkipTestWithoutRootAccess(t)
ci.Parallel(t)

ts := httptest.NewServer(http.FileServer(http.Dir(filepath.Dir("."))))
Expand Down
155 changes: 99 additions & 56 deletions client/fingerprint/cpu.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ import (

"github.com/hashicorp/nomad/lib/cpuset"

log "github.com/hashicorp/go-hclog"
"github.com/hashicorp/go-hclog"
"github.com/hashicorp/nomad/helper/stats"
"github.com/hashicorp/nomad/nomad/structs"
)
Expand All @@ -22,85 +22,128 @@ const (
// CPUFingerprint is used to fingerprint the CPU
type CPUFingerprint struct {
StaticFingerprinter
logger log.Logger
logger hclog.Logger

// accumulates result in these resource structs
resources *structs.Resources
nodeResources *structs.NodeResources
}

// NewCPUFingerprint is used to create a CPU fingerprint
func NewCPUFingerprint(logger log.Logger) Fingerprint {
f := &CPUFingerprint{logger: logger.Named("cpu")}
return f
func NewCPUFingerprint(logger hclog.Logger) Fingerprint {
return &CPUFingerprint{
logger: logger.Named("cpu"),
resources: new(structs.Resources), // COMPAT (to be removed after 0.10)
nodeResources: new(structs.NodeResources),
}
}

func (f *CPUFingerprint) Fingerprint(req *FingerprintRequest, resp *FingerprintResponse) error {
cfg := req.Config
setResourcesCPU := func(totalCompute int, totalCores uint16, reservableCores []uint16) {
// COMPAT(0.10): Remove in 0.10
resp.Resources = &structs.Resources{
CPU: totalCompute,
}
func (f *CPUFingerprint) Fingerprint(request *FingerprintRequest, response *FingerprintResponse) error {
f.initialize()

resp.NodeResources = &structs.NodeResources{
Cpu: structs.NodeCpuResources{
CpuShares: int64(totalCompute),
TotalCpuCores: totalCores,
ReservableCpuCores: reservableCores,
},
}
}
f.setModelName(response)

f.setFrequency(response)

f.setCoreCount(response)

f.setReservableCores(request, response)

f.setTotalCompute(request, response)

f.setResponseResources(response)

response.Detected = true

return nil
}

func (f *CPUFingerprint) initialize() {
if err := stats.Init(); err != nil {
f.logger.Warn("failed initializing stats collector", "error", err)
}
}

func (f *CPUFingerprint) setModelName(response *FingerprintResponse) {
if modelName := stats.CPUModelName(); modelName != "" {
resp.AddAttribute("cpu.modelname", modelName)
response.AddAttribute("cpu.modelname", modelName)
f.logger.Debug("detected CPU model", "name", modelName)
}
}

func (*CPUFingerprint) frequency(mhz uint64) string {
return fmt.Sprintf("%.0f", float64(mhz))
}

if mhz := stats.CPUMHzPerCore(); mhz > 0 {
resp.AddAttribute("cpu.frequency", fmt.Sprintf("%.0f", mhz))
f.logger.Debug("detected cpu frequency", "MHz", log.Fmt("%.0f", mhz))
func (f *CPUFingerprint) setFrequency(response *FingerprintResponse) {
power, efficiency := stats.CPUMHzPerCore()
switch {
case efficiency > 0:
response.AddAttribute("cpu.frequency.efficiency", f.frequency(efficiency))
response.AddAttribute("cpu.frequency.power", f.frequency(power))
f.logger.Debug("detected CPU efficiency core speed", "mhz", efficiency)
f.logger.Debug("detected CPU power core speed", "mhz", power)
case power > 0:
response.AddAttribute("cpu.frequency", f.frequency(power))
f.logger.Debug("detected CPU frequency", "mhz", power)
}
}

func (*CPUFingerprint) cores(count int) string {
return strconv.Itoa(count)
}

var numCores int
if numCores = stats.CPUNumCores(); numCores > 0 {
resp.AddAttribute("cpu.numcores", strconv.Itoa(numCores))
f.logger.Debug("detected core count", "cores", numCores)
func (f *CPUFingerprint) setCoreCount(response *FingerprintResponse) {
power, efficiency := stats.CPUNumCores()
switch {
case efficiency > 0:
response.AddAttribute("cpu.numcores.efficiency", f.cores(efficiency))
response.AddAttribute("cpu.numcores.power", f.cores(power))
f.logger.Debug("detected CPU efficiency core count", "cores", efficiency)
f.logger.Debug("detected CPU power core count", "cores", power)
case power > 0:
response.AddAttribute("cpu.numcores", f.cores(power))
f.logger.Debug("detected CPU core count", power)
}
f.nodeResources.Cpu.TotalCpuCores = uint16(power + efficiency)
}

var reservableCores []uint16
if req.Config.ReservableCores != nil {
reservableCores = req.Config.ReservableCores
f.logger.Debug("reservable cores set by config", "cpuset", reservableCores)
func (f *CPUFingerprint) setReservableCores(request *FingerprintRequest, response *FingerprintResponse) {
reservable := request.Config.ReservableCores
if len(reservable) > 0 {
f.logger.Debug("reservable cores set by config", "cpuset", reservable)
} else {
if cores, err := f.deriveReservableCores(req); err != nil {
f.logger.Warn("failed to detect set of reservable cores", "error", err)
} else {
if req.Node.ReservedResources != nil {
reservableCores = cpuset.New(cores...).Difference(cpuset.New(req.Node.ReservedResources.Cpu.ReservedCpuCores...)).ToSlice()
cgroupParent := request.Config.CgroupParent
if reservable = f.deriveReservableCores(cgroupParent); reservable != nil {
if request.Node.ReservedResources != nil {
forNode := request.Node.ReservedResources.Cpu.ReservedCpuCores
reservable = cpuset.New(reservable...).Difference(cpuset.New(forNode...)).ToSlice()
f.logger.Debug("client configuration reserves these cores for node", "cores", forNode)
}
f.logger.Debug("detected reservable cores", "cpuset", reservableCores)
f.logger.Debug("set of reservable cores available for tasks", "cores", reservable)
}
}
resp.AddAttribute("cpu.reservablecores", strconv.Itoa(len(reservableCores)))

tt := int(stats.TotalTicksAvailable())
if cfg.CpuCompute > 0 {
f.logger.Debug("using user specified cpu compute", "cpu_compute", cfg.CpuCompute)
tt = cfg.CpuCompute
}
response.AddAttribute("cpu.reservablecores", strconv.Itoa(len(reservable)))
f.nodeResources.Cpu.ReservableCpuCores = reservable
}

// If we cannot detect the cpu total compute, fallback to a very low default
// value and log a message about configuring cpu_total_compute. This happens
// on Graviton instances where CPU information is unavailable. In that case,
// the env_aws fingerprinter updates the value with correct information.
if tt == 0 {
f.logger.Info("fallback to default cpu total compute, set client config option cpu_total_compute to override")
tt = defaultCPUTicks
func (f *CPUFingerprint) setTotalCompute(request *FingerprintRequest, response *FingerprintResponse) {
var ticks uint64
switch {
case request.Config.CpuCompute > 0:
ticks = uint64(request.Config.CpuCompute)
case stats.TotalTicksAvailable() > 0:
ticks = stats.TotalTicksAvailable()
default:
ticks = defaultCPUTicks
}
response.AddAttribute("cpu.totalcompute", fmt.Sprintf("%d", ticks))
f.resources.CPU = int(ticks)
f.nodeResources.Cpu.CpuShares = int64(ticks)
}

resp.AddAttribute("cpu.totalcompute", fmt.Sprintf("%d", tt))
setResourcesCPU(tt, uint16(numCores), reservableCores)
resp.Detected = true

return nil
func (f *CPUFingerprint) setResponseResources(response *FingerprintResponse) {
response.Resources = f.resources
response.NodeResources = f.nodeResources
}
Loading

0 comments on commit f2d722f

Please sign in to comment.