Skip to content

Commit

Permalink
Merge pull request #29269 from kannon92/fix-machine-phase-deletion
Browse files Browse the repository at this point in the history
OCPBUGS-44244: use node node in deleted machine phases
  • Loading branch information
openshift-merge-bot[bot] authored Nov 12, 2024
2 parents e29afe5 + de21794 commit 5e59368
Show file tree
Hide file tree
Showing 8 changed files with 464 additions and 163 deletions.
44 changes: 44 additions & 0 deletions pkg/monitortestlibrary/utility/utility.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
package utility

import (
"fmt"
"os"
"regexp"
"time"
)

// SystemdJournalLogTime returns Now if there is trouble reading the time. This will stack the event intervals without
// parsable times at the end of the run, which will be more clearly visible as a problem than not reporting them.
func SystemdJournalLogTime(logLine string) time.Time {
var kubeletTimeRegex = regexp.MustCompile(`^(?P<MONTH>\S+)\s(?P<DAY>\S+)\s(?P<TIME>\S+)`)
kubeletTimeRegex.MatchString(logLine)
if !kubeletTimeRegex.MatchString(logLine) {
return time.Now()
}

month := ""
day := ""
year := fmt.Sprintf("%d", time.Now().Year())
timeOfDay := ""
subMatches := kubeletTimeRegex.FindStringSubmatch(logLine)
subNames := kubeletTimeRegex.SubexpNames()
for i, name := range subNames {
switch name {
case "MONTH":
month = subMatches[i]
case "DAY":
day = subMatches[i]
case "TIME":
timeOfDay = subMatches[i]
}
}

timeString := fmt.Sprintf("%s %s %s %s UTC", day, month, year, timeOfDay)
ret, err := time.Parse("02 Jan 2006 15:04:05.999999999 MST", timeString)
if err != nil {
fmt.Fprintf(os.Stderr, "Failure parsing time format: %v for %q\n", err, timeString)
return time.Now()
}

return ret
}
15 changes: 13 additions & 2 deletions pkg/monitortests/machines/watchmachines/machines.go
Original file line number Diff line number Diff line change
Expand Up @@ -83,15 +83,26 @@ func startMachineMonitoring(ctx context.Context, m monitorapi.RecorderWriter, cl
// this is last so machine deleted shows up last when queried
func(machine, oldMachine *machine.Machine) []monitorapi.Interval {
var intervals []monitorapi.Interval
if machine != nil {
if machine == nil {
return intervals
}
nodeName := "<unknown>"
oldHasNodeRef := oldMachine != nil && oldMachine.Status.NodeRef != nil
newHasNodeRef := machine.Status.NodeRef != nil
if oldHasNodeRef {
nodeName = oldMachine.Status.NodeRef.Name
}

if newHasNodeRef {
nodeName = machine.Status.NodeRef.Name
}

now := time.Now()
intervals = append(intervals,
monitorapi.NewInterval(monitorapi.SourceMachine, monitorapi.Info).
Locator(monitorapi.NewLocator().MachineFromName(oldMachine.Name)).
Locator(monitorapi.NewLocator().MachineFromName(machine.Name)).
Message(monitorapi.NewMessage().Reason(monitorapi.MachineDeletedInAPI).
WithAnnotation(monitorapi.AnnotationNode, nodeName).
HumanMessage("Machine deleted")).
Build(now, now))
return intervals
Expand Down
5 changes: 5 additions & 0 deletions pkg/monitortests/machines/watchmachines/monitortest.go
Original file line number Diff line number Diff line change
Expand Up @@ -70,12 +70,14 @@ func (*machineWatcher) ConstructComputedIntervals(ctx context.Context, startingI
})
for _, phaseChange := range phaseChanges {
previousPhase := phaseChange.Message.Annotations[monitorapi.AnnotationPreviousPhase]
nodeName := phaseChange.Message.Annotations[monitorapi.AnnotationNode]
constructedIntervals = append(constructedIntervals,
monitorapi.NewInterval(monitorapi.SourceMachine, monitorapi.Info).
Locator(phaseChange.Locator).
Message(monitorapi.NewMessage().Reason(monitorapi.MachinePhase).
Constructed(monitorapi.ConstructionOwnerMachineLifecycle).
WithAnnotation(monitorapi.AnnotationPhase, previousPhase).
WithAnnotation(monitorapi.AnnotationNode, nodeName).
HumanMessage(fmt.Sprintf("Machine is in %q", previousPhase))).
Display().
Build(previousChangeTime, phaseChange.From),
Expand All @@ -86,11 +88,13 @@ func (*machineWatcher) ConstructComputedIntervals(ctx context.Context, startingI
}

deletionTime := time.Time{}
nodeName := "unknown"
deletedIntervals := monitorapi.Intervals(allMachineChanges).Filter(func(eventInterval monitorapi.Interval) bool {
return eventInterval.Message.Reason == monitorapi.MachineDeletedInAPI
})
if len(deletedIntervals) > 0 {
deletionTime = deletedIntervals[0].To
nodeName = deletedIntervals[0].Message.Annotations[monitorapi.AnnotationNode]
}
if len(lastPhase) > 0 {
constructedIntervals = append(constructedIntervals,
Expand All @@ -99,6 +103,7 @@ func (*machineWatcher) ConstructComputedIntervals(ctx context.Context, startingI
Message(monitorapi.NewMessage().Reason(monitorapi.MachinePhase).
Constructed(monitorapi.ConstructionOwnerMachineLifecycle).
WithAnnotation(monitorapi.AnnotationPhase, lastPhase).
WithAnnotation(monitorapi.AnnotationNode, nodeName).
HumanMessage(fmt.Sprintf("Machine is in %q", lastPhase))).
Display().
Build(previousChangeTime, deletionTime),
Expand Down
60 changes: 12 additions & 48 deletions pkg/monitortests/node/kubeletlogcollector/node.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ import (
utilerrors "k8s.io/apimachinery/pkg/util/errors"

"github.com/openshift/origin/pkg/monitor/monitorapi"
"github.com/openshift/origin/pkg/monitortestlibrary/utility"
"k8s.io/client-go/kubernetes"
)

Expand Down Expand Up @@ -132,7 +133,7 @@ func unreasonablyLongPollInterval(logLine string, nodeLocator monitorapi.Locator
return nil
}

toTime := systemdJournalLogTime(logLine)
toTime := utility.SystemdJournalLogTime(logLine)

// Extract the number of millis and use it for the interval, starting from the point we logged
// and looking backwards.
Expand Down Expand Up @@ -183,7 +184,7 @@ func tooManyNetlinkEvents(logLine string, nodeLocator monitorapi.Locator) monito
return nil
}

logTime := systemdJournalLogTime(logLine)
logTime := utility.SystemdJournalLogTime(logLine)

message := logLine[strings.Index(logLine, "NetworkManager"):]
return monitorapi.Intervals{
Expand Down Expand Up @@ -218,7 +219,7 @@ func readinessFailure(nodeName, logLine string) monitorapi.Intervals {
}

containerRef := probeProblemToContainerReference(logLine)
failureTime := systemdJournalLogTime(logLine)
failureTime := utility.SystemdJournalLogTime(logLine)
return monitorapi.Intervals{
monitorapi.NewInterval(monitorapi.SourceKubeletLog, monitorapi.Info).
Locator(containerRef).
Expand All @@ -245,7 +246,7 @@ func readinessError(nodeName, logLine string) monitorapi.Intervals {
message, _ = strconv.Unquote(`"` + message + `"`)

containerRef := probeProblemToContainerReference(logLine)
failureTime := systemdJournalLogTime(logLine)
failureTime := utility.SystemdJournalLogTime(logLine)
return monitorapi.Intervals{
monitorapi.NewInterval(monitorapi.SourceKubeletLog, monitorapi.Info).
Locator(containerRef).
Expand All @@ -272,7 +273,7 @@ func errParsingSignature(nodeName, logLine string) monitorapi.Intervals {
}

containerRef := errImagePullToContainerReference(logLine)
failureTime := systemdJournalLogTime(logLine)
failureTime := utility.SystemdJournalLogTime(logLine)
return monitorapi.Intervals{
monitorapi.NewInterval(monitorapi.SourceKubeletLog, monitorapi.Info).
Locator(containerRef).
Expand Down Expand Up @@ -318,7 +319,7 @@ func startupProbeError(nodeName, logLine string) monitorapi.Intervals {
}

containerRef := probeProblemToContainerReference(logLine)
failureTime := systemdJournalLogTime(logLine)
failureTime := utility.SystemdJournalLogTime(logLine)
return monitorapi.Intervals{
monitorapi.NewInterval(monitorapi.SourceKubeletLog, monitorapi.Info).
Locator(containerRef).
Expand Down Expand Up @@ -423,7 +424,7 @@ func failedToDeleteCGroupsPath(nodeLocator monitorapi.Locator, logLine string) m
return nil
}

failureTime := systemdJournalLogTime(logLine)
failureTime := utility.SystemdJournalLogTime(logLine)

return monitorapi.Intervals{
monitorapi.NewInterval(monitorapi.SourceKubeletLog, monitorapi.Error).
Expand All @@ -439,7 +440,7 @@ func anonymousCertConnectionError(nodeLocator monitorapi.Locator, logLine string
return nil
}

failureTime := systemdJournalLogTime(logLine)
failureTime := utility.SystemdJournalLogTime(logLine)

return monitorapi.Intervals{
monitorapi.NewInterval(monitorapi.SourceKubeletLog, monitorapi.Error).
Expand All @@ -466,7 +467,7 @@ func leaseUpdateError(nodeLocator monitorapi.Locator, logLine string) monitorapi
return nil
}

failureTime := systemdJournalLogTime(logLine)
failureTime := utility.SystemdJournalLogTime(logLine)
url := ""
msg := ""

Expand Down Expand Up @@ -508,7 +509,7 @@ func leaseUpdateError(nodeLocator monitorapi.Locator, logLine string) monitorapi

func leaseFailBackOff(nodeLocator monitorapi.Locator, logLine string) monitorapi.Intervals {

failureTime := systemdJournalLogTime(logLine)
failureTime := utility.SystemdJournalLogTime(logLine)

subMatches := failedLeaseFiveTimes.FindStringSubmatch(logLine)

Expand Down Expand Up @@ -616,7 +617,7 @@ func commonErrorInterval(nodeName, logLine string, messageExp *regexp.Regexp, re
message = unquotedMessage
}

failureTime := systemdJournalLogTime(logLine)
failureTime := utility.SystemdJournalLogTime(logLine)
return monitorapi.Intervals{
monitorapi.NewInterval(monitorapi.SourceKubeletLog, monitorapi.Info).
Locator(locator()).
Expand All @@ -628,43 +629,6 @@ func commonErrorInterval(nodeName, logLine string, messageExp *regexp.Regexp, re
}
}

var kubeletTimeRegex = regexp.MustCompile(`^(?P<MONTH>\S+)\s(?P<DAY>\S+)\s(?P<TIME>\S+)`)

// systemdJournalLogTime returns Now if there is trouble reading the time. This will stack the event intervals without
// parsable times at the end of the run, which will be more clearly visible as a problem than not reporting them.
func systemdJournalLogTime(logLine string) time.Time {
kubeletTimeRegex.MatchString(logLine)
if !kubeletTimeRegex.MatchString(logLine) {
return time.Now()
}

month := ""
day := ""
year := fmt.Sprintf("%d", time.Now().Year())
timeOfDay := ""
subMatches := kubeletTimeRegex.FindStringSubmatch(logLine)
subNames := kubeletTimeRegex.SubexpNames()
for i, name := range subNames {
switch name {
case "MONTH":
month = subMatches[i]
case "DAY":
day = subMatches[i]
case "TIME":
timeOfDay = subMatches[i]
}
}

timeString := fmt.Sprintf("%s %s %s %s UTC", day, month, year, timeOfDay)
ret, err := time.Parse("02 Jan 2006 15:04:05.999999999 MST", timeString)
if err != nil {
fmt.Fprintf(os.Stderr, "Failure parsing time format: %v for %q\n", err, timeString)
return time.Now()
}

return ret
}

// getNodeLog returns logs for a particular systemd service on a given node.
// We're count on these logs to fit into some reasonable memory size.
func getNodeLog(ctx context.Context, client kubernetes.Interface, nodeName, systemdServiceName string) ([]byte, error) {
Expand Down
Loading

0 comments on commit 5e59368

Please sign in to comment.