Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

9.3.0: Add missing check for SenderStatusCertMiss and more robust cert fetch #3031

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/CONFIG-PROPERTIES.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
| ---- | ---- | ------- | ----------- |
| app.allow.vnc | boolean | false | allow access to the app using the VNC tcp port |
| timer.config.interval | integer in seconds | 60 | how frequently device gets config |
| timer.cert.interval | integer in seconds | 1 day (24*3600) | how frequently device checks for new controller certificates |
| timer.metric.interval | integer in seconds | 60 | how frequently device reports metrics |
| timer.metric.diskscan.interval | integer in seconds | 300 | how frequently device should scan the disk for metrics |
| timer.location.cloud.interval | integer in seconds | 1 hour | how frequently device reports geographic location information to controller |
Expand Down
91 changes: 71 additions & 20 deletions pkg/pillar/cmd/zedagent/handlecertconfig.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ import (
zconfig "github.com/lf-edge/eve/api/go/config"
"github.com/lf-edge/eve/api/go/evecommon"
"github.com/lf-edge/eve/pkg/pillar/agentlog"
"github.com/lf-edge/eve/pkg/pillar/flextimer"
"github.com/lf-edge/eve/pkg/pillar/types"
"github.com/lf-edge/eve/pkg/pillar/zedcloud"
"google.golang.org/protobuf/proto"
Expand All @@ -38,13 +39,14 @@ type cipherContext struct {
var controllerCertHash []byte

// parse and update controller certs
func parseControllerCerts(ctx *zedagentContext, contents []byte) {
func parseControllerCerts(ctx *zedagentContext, contents []byte) (changed bool, err error) {
log.Functionf("Started parsing controller certs")
cfgConfig := &zcert.ZControllerCert{}
err := proto.Unmarshal(contents, cfgConfig)
err = proto.Unmarshal(contents, cfgConfig)
if err != nil {
log.Errorf("parseControllerCerts(): Unmarshal error %v", err)
return
err = fmt.Errorf("parseControllerCerts(): Unmarshal error %w", err)
log.Error(err)
return false, err
}

cfgCerts := cfgConfig.GetCerts()
Expand All @@ -54,7 +56,7 @@ func parseControllerCerts(ctx *zedagentContext, contents []byte) {
}
newHash := h.Sum(nil)
if bytes.Equal(newHash, controllerCertHash) {
return
return false, nil
}
log.Functionf("parseControllerCerts: Applying updated config "+
"Last Sha: % x, "+
Expand All @@ -80,6 +82,7 @@ func parseControllerCerts(ctx *zedagentContext, contents []byte) {
if !found {
log.Functionf("parseControllerCerts: deleting %s", config.Key())
unpublishControllerCert(ctx.getconfigCtx, config.Key())
changed = true
}
}

Expand All @@ -95,9 +98,11 @@ func parseControllerCerts(ctx *zedagentContext, contents []byte) {
CertHash: cfgConfig.GetCertHash(),
}
publishControllerCert(ctx.getconfigCtx, *cert)
changed = true
}
}
log.Functionf("parsing controller certs done")
return changed, nil
}

// look up controller cert
Expand Down Expand Up @@ -170,7 +175,7 @@ func handleEdgeNodeCertDelete(ctxArg interface{}, key string,
func controllerCertsTask(ctx *zedagentContext, triggerCerts <-chan struct{}) {

log.Functionln("starting controller certificate fetch task")
getCertsFromController(ctx)
retry := !getCertsFromController(ctx, "initial")

wdName := agentName + "ccerts"

Expand All @@ -179,27 +184,66 @@ func controllerCertsTask(ctx *zedagentContext, triggerCerts <-chan struct{}) {
ctx.ps.StillRunning(wdName, warningTime, errorTime)
ctx.ps.RegisterFileWatchdog(wdName)

// Run a timer for extra safety to handle controller certificates updates
// If we failed with the initial we have a short timer, otherwise
// the configurable one.
const shortTime = 120 // Two minutes
certInterval := ctx.globalConfig.GlobalValueInt(types.CertInterval)
if retry {
log.Noticef("Initial getCertsFromController failed; switching to short timer")
certInterval = shortTime
}
interval := time.Duration(certInterval) * time.Second
max := float64(interval)
min := max * 0.3
periodicTicker := flextimer.NewRangeTicker(time.Duration(min),
time.Duration(max))
ctx.getconfigCtx.certTickerHandle = periodicTicker

for {
success := true
select {
case <-triggerCerts:
start := time.Now()
getCertsFromController(ctx)
success = getCertsFromController(ctx, "triggered")
ctx.ps.CheckMaxTimeTopic(wdName, "publishCerts", start,
warningTime, errorTime)

case <-periodicTicker.C:
start := time.Now()
success = getCertsFromController(ctx, "periodic")
ctx.ps.CheckMaxTimeTopic(wdName, "publishCerts", start,
warningTime, errorTime)

case <-stillRunning.C:
}
ctx.ps.StillRunning(wdName, warningTime, errorTime)
if retry && success {
log.Noticef("getCertsFromController succeeded; switching to long timer %d seconds",
ctx.globalConfig.GlobalValueInt(types.CertInterval))
updateCertTimer(ctx.globalConfig.GlobalValueInt(types.CertInterval),
ctx.getconfigCtx.certTickerHandle)
retry = false
} else if !retry && !success {
log.Noticef("getCertsFromController failed; switching to short timer")
updateCertTimer(shortTime,
ctx.getconfigCtx.certTickerHandle)
retry = true
}
}
}

// prepare the certs list proto message
func getCertsFromController(ctx *zedagentContext) bool {
// Fetch and verify the controller certificates. Returns true if certificates have
// not changed or the update was successfully applied.
// False is returned if the function failed to fetch/verify/unmarshal certs.
func getCertsFromController(ctx *zedagentContext, desc string) (success bool) {
log.Functionf("getCertsFromController started for %s", desc)
certURL := zedcloud.URLPathString(serverNameAndPort,
zedcloudCtx.V2API, nilUUID, "certs")

// not V2API
if !zedcloud.UseV2API() {
log.Noticef("getCertsFromController not V2API!")
return false
}

Expand All @@ -211,13 +255,13 @@ func getCertsFromController(ctx *zedagentContext) bool {
if err != nil {
switch senderStatus {
case types.SenderStatusUpgrade:
log.Functionf("getCertsFromController: Controller upgrade in progress")
log.Noticef("getCertsFromController: Controller upgrade in progress")
case types.SenderStatusRefused:
log.Functionf("getCertsFromController: Controller returned ECONNREFUSED")
log.Noticef("getCertsFromController: Controller returned ECONNREFUSED")
case types.SenderStatusCertInvalid:
log.Warnf("getCertsFromController: Controller certificate invalid time")
case types.SenderStatusCertMiss:
log.Functionf("getCertsFromController: Controller certificate miss")
log.Noticef("getCertsFromController: Controller certificate miss")
default:
log.Errorf("getCertsFromController failed: %s", err)
}
Expand Down Expand Up @@ -247,23 +291,30 @@ func getCertsFromController(ctx *zedagentContext) bool {
}

// validate the certificate message payload
certBytes, ret := zedcloud.VerifyProtoSigningCertChain(log, contents)
signingCertBytes, ret := zedcloud.VerifyProtoSigningCertChain(log, contents)
if ret != nil {
log.Errorf("getCertsFromController: verify err %v", ret)
return false
}

// manage the certificates through pubsub
changed, err := parseControllerCerts(ctx, contents)
if err != nil {
// Note that err is already logged.
return false
}
if !changed {
return true
}

// write the signing cert to file
if err := zedcloud.SaveServerSigningCert(zedcloudCtx, certBytes); err != nil {
if err := zedcloud.SaveServerSigningCert(zedcloudCtx, signingCertBytes); err != nil {
errStr := fmt.Sprintf("%v", err)
log.Errorf("getCertsFromController: " + errStr)
return false
}

// manage the certificates through pubsub
parseControllerCerts(ctx, contents)

log.Functionf("getCertsFromController: success")
log.Noticef("getCertsFromController: success for %s", desc)
return true
}

Expand Down Expand Up @@ -411,7 +462,7 @@ func handleControllerCertsSha(ctx *zedagentContext,

certHash := config.GetControllercertConfighash()
if certHash != ctx.cipherCtx.cfgControllerCertHash {
log.Functionf("handleControllerCertsSha trigger due to controller %v vs current %v",
log.Noticef("handleControllerCertsSha trigger due to controller %v vs current %v",
certHash, ctx.cipherCtx.cfgControllerCertHash)
ctx.cipherCtx.cfgControllerCertHash = certHash
triggerControllerCertEvent(ctx)
Expand All @@ -421,7 +472,7 @@ func handleControllerCertsSha(ctx *zedagentContext,
// controller certificate pull trigger function
func triggerControllerCertEvent(ctxPtr *zedagentContext) {

log.Function("Trigger for Controller Certs")
log.Noticef("Trigger for Controller Certs")
select {
case ctxPtr.cipherCtx.triggerControllerCerts <- struct{}{}:
// Do nothing more
Expand Down
34 changes: 33 additions & 1 deletion pkg/pillar/cmd/zedagent/handleconfig.go
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@ type getconfigContext struct {
updateInprogress bool
readSavedConfig bool // Did we already read it?
configTickerHandle interface{}
certTickerHandle interface{}
metricsTickerHandle interface{}
locationCloudTickerHandle interface{}
locationAppTickerHandle interface{}
Expand Down Expand Up @@ -422,6 +423,25 @@ func updateConfigTimer(configInterval uint32, tickerHandle interface{}) {
flextimer.TickNow(tickerHandle)
}

// updateCertTimer updates the controller certificate range ticker referred to
// by tickerHandle so that its range becomes [0.3*configInterval, configInterval]
// seconds, and then forces an immediate tick.
//
// It is called when the CertInterval global config item changes (the caller is
// assumed to have verified that the interval has changed), and by the
// controller certificate task when it switches between its short retry
// interval and the configured interval.
//
// A nil tickerHandle is logged as a warning and otherwise ignored.
func updateCertTimer(configInterval uint32, tickerHandle interface{}) {
	if tickerHandle == nil {
		// Happens if we have a GlobalConfig setting in /persist/
		log.Warnf("updateCertTimer: no certTickerHandle yet")
		return
	}
	interval := time.Duration(configInterval) * time.Second
	log.Functionf("updateCertTimer() change to %v", interval)
	// The lower bound of the range is 30% of the requested interval.
	maxInterval := float64(interval)
	minInterval := maxInterval * 0.3
	flextimer.UpdateRangeTicker(tickerHandle,
		time.Duration(minInterval), time.Duration(maxInterval))
	// Force an immediate timeout since timer could have decreased
	flextimer.TickNow(tickerHandle)
}

// Start by trying all the free management ports and then all the non-free
// until one succeeds in communicating with the cloud.
// We use the iteration argument to start at a different point each time.
Expand Down Expand Up @@ -472,7 +492,12 @@ func getLatestConfig(getconfigCtx *getconfigContext, url string,
log.Errorf("getLatestConfig failed: %s", err)
}
switch senderStatus {
case types.SenderStatusUpgrade, types.SenderStatusRefused, types.SenderStatusCertInvalid, types.SenderStatusNotFound:
case types.SenderStatusCertInvalid:
// trigger to acquire new controller certs from cloud
log.Noticef("%s trigger", senderStatus.String())
triggerControllerCertEvent(ctx)
fallthrough
case types.SenderStatusUpgrade, types.SenderStatusRefused, types.SenderStatusNotFound:
newCount = types.LedBlinkConnectedToController // Almost connected to controller!
// Don't treat as upgrade failure
if getconfigCtx.updateInprogress {
Expand All @@ -481,6 +506,7 @@ func getLatestConfig(getconfigCtx *getconfigContext, url string,
}
case types.SenderStatusCertMiss:
// trigger to acquire new controller certs from cloud
log.Noticef("%s trigger", senderStatus.String())
triggerControllerCertEvent(ctx)
}
if getconfigCtx.ledBlinkCount == types.LedBlinkOnboarded {
Expand Down Expand Up @@ -564,6 +590,12 @@ func getLatestConfig(getconfigCtx *getconfigContext, url string,
url, contents, false, senderStatus)
if err != nil {
log.Errorf("RemoveAndVerifyAuthContainer failed: %s", err)
switch senderStatus {
case types.SenderStatusCertMiss, types.SenderStatusCertInvalid:
// trigger to acquire new controller certs from cloud
log.Noticef("%s trigger", senderStatus.String())
triggerControllerCertEvent(ctx)
}
// Inform ledmanager about problem
utils.UpdateLedManagerConfig(log, types.LedBlinkInvalidAuthContainer)
getconfigCtx.ledBlinkCount = types.LedBlinkInvalidAuthContainer
Expand Down
7 changes: 7 additions & 0 deletions pkg/pillar/cmd/zedagent/parseconfig.go
Original file line number Diff line number Diff line change
Expand Up @@ -2347,6 +2347,8 @@ func parseConfigItems(ctx *getconfigContext, config *zconfig.EdgeDevConfig,
// Set GlobalStatus Values from GlobalConfig.
oldConfigInterval := oldGlobalConfig.GlobalValueInt(types.ConfigInterval)
newConfigInterval := newGlobalConfig.GlobalValueInt(types.ConfigInterval)
oldCertInterval := oldGlobalConfig.GlobalValueInt(types.CertInterval)
newCertInterval := newGlobalConfig.GlobalValueInt(types.CertInterval)

oldMetricInterval := oldGlobalConfig.GlobalValueInt(types.MetricInterval)
newMetricInterval := newGlobalConfig.GlobalValueInt(types.MetricInterval)
Expand All @@ -2366,6 +2368,11 @@ func parseConfigItems(ctx *getconfigContext, config *zconfig.EdgeDevConfig,
updateConfigTimer(newConfigInterval, ctx.configTickerHandle)
updateConfigTimer(newConfigInterval, ctx.localProfileTickerHandle)
}
if newCertInterval != oldCertInterval {
log.Functionf("parseConfigItems: %s change from %d to %d",
"CertInterval", oldCertInterval, newCertInterval)
updateCertTimer(newCertInterval, ctx.certTickerHandle)
}
if newMetricInterval != oldMetricInterval {
log.Functionf("parseConfigItems: %s change from %d to %d",
"MetricInterval", oldMetricInterval, newMetricInterval)
Expand Down
3 changes: 2 additions & 1 deletion pkg/pillar/cmd/zedagent/reportinfo.go
Original file line number Diff line number Diff line change
Expand Up @@ -1079,7 +1079,8 @@ func getState(ctx *zedagentContext) info.ZDeviceState {
if ctx.poweroffCmd || ctx.devicePoweroff {
return info.ZDeviceState_ZDEVICE_STATE_POWERING_OFF
}
if ctx.getconfigCtx != nil && ctx.getconfigCtx.configReceived {
if ctx.getconfigCtx != nil && (ctx.getconfigCtx.configReceived ||
ctx.getconfigCtx.readSavedConfig) {
return info.ZDeviceState_ZDEVICE_STATE_ONLINE
}
return info.ZDeviceState_ZDEVICE_STATE_BOOTING
Expand Down
41 changes: 41 additions & 0 deletions pkg/pillar/types/global.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,42 @@ const (
SenderStatusDebug // Not a failure
)

// String returns the name of the SenderResult constant as ASCII text.
// A value that matches none of the known constants is rendered as
// "Unknown status <n>", where <n> is the numeric value.
func (status SenderResult) String() string {
	var name string
	switch status {
	case SenderStatusNone:
		name = "SenderStatusNone"
	case SenderStatusRefused:
		name = "SenderStatusRefused"
	case SenderStatusUpgrade:
		name = "SenderStatusUpgrade"
	case SenderStatusCertInvalid:
		name = "SenderStatusCertInvalid"
	case SenderStatusCertMiss:
		name = "SenderStatusCertMiss"
	case SenderStatusSignVerifyFail:
		name = "SenderStatusSignVerifyFail"
	case SenderStatusAlgoFail:
		name = "SenderStatusAlgoFail"
	case SenderStatusHashSizeError:
		name = "SenderStatusHashSizeError"
	case SenderStatusCertUnknownAuthority:
		name = "SenderStatusCertUnknownAuthority"
	case SenderStatusCertUnknownAuthorityProxy:
		name = "SenderStatusCertUnknownAuthorityProxy"
	case SenderStatusNotFound:
		name = "SenderStatusNotFound"
	case SenderStatusForbidden:
		name = "SenderStatusForbidden"
	case SenderStatusFailed:
		name = "SenderStatusFailed"
	case SenderStatusDebug:
		name = "SenderStatusDebug"
	default:
		// %d formats the underlying number and does not call String again.
		name = fmt.Sprintf("Unknown status %d", status)
	}
	return name
}

const (
// MinuteInSec is number of seconds in a minute
MinuteInSec = 60
Expand Down Expand Up @@ -118,6 +154,8 @@ const (
// Int Items
// ConfigInterval global setting key
ConfigInterval GlobalSettingKey = "timer.config.interval"
// CertInterval global setting key; check for controller cert update
CertInterval GlobalSettingKey = "timer.cert.interval"
// MetricInterval global setting key
MetricInterval GlobalSettingKey = "timer.metric.interval"
// DiskScanMetricInterval global setting key
Expand Down Expand Up @@ -733,6 +771,9 @@ func NewConfigItemSpecMap() ConfigItemSpecMap {
// too long to get next config and is practically unreachable for any config
// changes or reboot through cloud.
configItemSpecMap.AddIntItem(ConfigInterval, 60, 5, HourInSec)
// Additional safety to periodically fetch the controller certificate
// Useful for odd cases when the triggered updates do not work.
configItemSpecMap.AddIntItem(CertInterval, 24*HourInSec, 60, 0xFFFFFFFF)
// timer.metric.diskscan.interval (seconds)
// Shorter interval can lead to device scanning the disk frequently which is a costly operation.
configItemSpecMap.AddIntItem(DiskScanMetricInterval, 300, 5, HourInSec)
Expand Down
1 change: 1 addition & 0 deletions pkg/pillar/types/global_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,7 @@ func TestNewConfigItemSpecMap(t *testing.T) {
gsKeys := []GlobalSettingKey{
// Int Items
ConfigInterval,
CertInterval,
MetricInterval,
LocationCloudInterval,
LocationAppInterval,
Expand Down
2 changes: 2 additions & 0 deletions pkg/pillar/zedcloud/authen.go
Original file line number Diff line number Diff line change
Expand Up @@ -568,6 +568,8 @@ func SaveServerSigningCert(ctx *ZedCloudContext, certByte []byte) error {
ctx.log.Errorf("SaveServerSignCert: %v", err)
return err
}
// Clear cached
ClearCloudCert(ctx)
return nil
}

Expand Down