Skip to content

Commit

Permalink
Suggested changes:
Browse files Browse the repository at this point in the history
1. Check validity per container's recommendation
2. Kruize ERROR logging, all levels
  • Loading branch information
saltgen committed Dec 4, 2023
1 parent 97430fa commit 172b403
Show file tree
Hide file tree
Showing 5 changed files with 105 additions and 98 deletions.
4 changes: 0 additions & 4 deletions internal/services/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,6 @@ import (
)

var (
invalidRecommendation = promauto.NewCounter(prometheus.CounterOpts{
Name: "rosocp_invalid_recommendation_total",
Help: "The total number of invalid recommendation send by Kruize",
})
invalidCSV = promauto.NewCounter(prometheus.CounterOpts{
Name: "rosocp_invalid_csv_total",
Help: "The total number of invalid csv send by cost-mgmt",
Expand Down
8 changes: 3 additions & 5 deletions internal/services/report_processor.go
Original file line number Diff line number Diff line change
Expand Up @@ -198,9 +198,9 @@ func ProcessReport(msg *kafka.Message) {
continue
}

if kruize.Is_valid_recommendation(recommendation, experiment_name, maxEndTime) {
containers := recommendation[0].Kubernetes_objects[0].Containers
for _, container := range containers {
containers := recommendation[0].Kubernetes_objects[0].Containers
for _, container := range containers {
if kruize.Is_valid_recommendation(container.Recommendations, experiment_name, maxEndTime) {
for _, v := range container.Recommendations.Data {
marshalData, err := json.Marshal(v)
if err != nil {
Expand Down Expand Up @@ -238,8 +238,6 @@ func ProcessReport(msg *kafka.Message) {
}
}
}
} else {
invalidRecommendation.Inc()
}
}
}
Expand Down
33 changes: 16 additions & 17 deletions internal/types/kruizePayload/common.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ type container struct {
Container_image_name string `json:"container_image_name,omitempty"`
Container_name string `json:"container_name,omitempty"`
Metrics []metric `json:"metrics,omitempty"`
Recommendations recommendation `json:"recommendations,omitempty"`
Recommendations Recommendation `json:"recommendations,omitempty"`
}

type metric struct {
Expand All @@ -37,45 +37,44 @@ type aggregation_info struct {
Format string `json:"format,omitempty"`
}

type recommendation struct {
Version string `json:"version,omitempty"`
type Recommendation struct {
Version string `json:"version,omitempty"`
Data map[string]RecommendationData `json:"data,omitempty"`
Notifications map[string]Notification `json:"notifications,omitempty"`
}


// Notification is one entry of the notifications maps
// (map[string]Notification) that appear at several levels of the Kruize
// recommendation payload. The map keys are notification codes written as
// strings, e.g. "111000".
type Notification struct {
// NotifyType is read from the JSON "type" field.
// NOTE(review): presumably a severity such as "INFO" or "ERROR" - confirm against the Kruize notification codes document.
NotifyType string `json:"type,omitempty"`
// Message is read from the JSON "message" field.
Message string `json:"message,omitempty"`
// Code is read from the JSON "code" field.
// NOTE(review): presumably the numeric form of the map key this entry is stored under - confirm.
Code int `json:"code,omitempty"`
}

type RecommendationEngineObject struct {
PodsCount int `json:"pods_count,omitempty"`
ConfidenceLevel float64 `json:"confidence_level,omitempty"`
Config ConfigObject `json:"config,omitempty"`
Variation ConfigObject `json:"variation,omitempty"`
Notifications map[string]Notification `json:"notifications,omitempty"`
PodsCount int `json:"pods_count,omitempty"`
ConfidenceLevel float64 `json:"confidence_level,omitempty"`
Config ConfigObject `json:"config,omitempty"`
Variation ConfigObject `json:"variation,omitempty"`
Notifications map[string]Notification `json:"notifications,omitempty"`
}

type RecommendationData struct {
Notifications map[string]Notification `json:"notifications,omitempty"`
MonitoringEndTime time.Time `json:"monitoring_end_time,omitempty"`
Current ConfigObject `json:"current,omitempty"`
RecommendationTerms TermBased `json:"recommendation_terms,omitempty"`
MonitoringEndTime time.Time `json:"monitoring_end_time,omitempty"`
Current ConfigObject `json:"current,omitempty"`
RecommendationTerms Term `json:"recommendation_terms,omitempty"`
}

type RecommendationTerm struct {
DurationInHours float64 `json:"duration_in_hours,omitempty"`
Notifications map[string]Notification `json:"notifications,omitempty"`
MonitoringStartTime time.Time `json:"monitoring_start_time,omitempty"`
RecommendationEngines *struct {
DurationInHours float64 `json:"duration_in_hours,omitempty"`
Notifications map[string]Notification `json:"notifications,omitempty"`
MonitoringStartTime time.Time `json:"monitoring_start_time,omitempty"`
RecommendationEngines *struct {
Cost RecommendationEngineObject `json:"cost,omitempty"`
Performance RecommendationEngineObject `json:"performance,omitempty"`
} `json:"recommendation_engines,omitempty"`
}

type TermBased struct {
type Term struct {
Short_term RecommendationTerm `json:"short_term"`
Medium_term RecommendationTerm `json:"medium_term"`
Long_term RecommendationTerm `json:"long_term,omitempty"`
Expand Down
136 changes: 75 additions & 61 deletions internal/utils/kruize/kruize_api.go
Original file line number Diff line number Diff line change
Expand Up @@ -178,82 +178,96 @@ func Update_recommendations(experiment_name string, interval_end_time time.Time)

}

func Is_valid_recommendation(d []kruizePayload.ListRecommendations, experiment_name string, maxEndTime time.Time) bool {
if len(d) > 0 {

func Is_valid_recommendation(recommendation kruizePayload.Recommendation, experiment_name string, maxEndTime time.Time) bool {

validRecommendationCode := "111000"
_, recommendationIsValid := recommendation.Notifications[validRecommendationCode]
if recommendationIsValid {
// Convert the time object to the expected format
formattedMaxEndTime := maxEndTime.UTC().Format("2006-01-02T15:04:05.000Z")
_, timeStampisValid := d[0].Kubernetes_objects[0].Containers[0].Recommendations.Data[formattedMaxEndTime]

// Allowed notification codes; "111000" means a valid and actionable recommendation
// https://github.com/kruize/autotune/blob/master/design/NotificationCodes.md#detailed-codes
notificationCodes := map[string]string{
"111000": "INFO",
"120001": "INFO",
"111101": "INFO",
"111102": "INFO",
"111103": "INFO",
"112101": "INFO",
"112102": "INFO",
"221001": "ERROR",
"221002": "ERROR",
"221003": "ERROR",
"221004": "ERROR",
"223001": "ERROR",
"223002": "ERROR",
"223003": "ERROR",
"223004": "ERROR",
"224001": "ERROR",
"224002": "ERROR",
"224003": "ERROR",
"224004": "ERROR",
recommendationData, timeStampisValid := recommendation.Data[formattedMaxEndTime]
if !timeStampisValid {
log.Error("recommendation not found for endtime: ", formattedMaxEndTime)
invalidRecommendation.Inc()
return false
}
LogKruizeErrors(recommendationData, formattedMaxEndTime, experiment_name)
return true
} else {
return false
}
}

// Recommendation level
notificationsTopLevel := d[0].Kubernetes_objects[0].Containers[0].Recommendations.Notifications
func LogKruizeErrors(recommendationData kruizePayload.RecommendationData, formattedMaxEndTime string, experiment_name string) {

for key := range notificationsTopLevel {
// https://github.com/kruize/autotune/blob/master/design/NotificationCodes.md#detailed-codes
errorNotificationCodes := map[string]string{
"221001": "ERROR",
"221002": "ERROR",
"221003": "ERROR",
"221004": "ERROR",
"223001": "ERROR",
"223002": "ERROR",
"223003": "ERROR",
"223004": "ERROR",
"224001": "ERROR",
"224002": "ERROR",
"224003": "ERROR",
"224004": "ERROR",
}
notificationSections := []map[string]kruizePayload.Notification{}

if (key == "111000" && !timeStampisValid) {
log.Error("recommendation endtime does not match with requested endtime:", formattedMaxEndTime)
return false
}
// Timestamp level
notificationsLevelTwo := recommendationData.Notifications
if notificationsLevelTwo != nil {
notificationSections = append(notificationSections, notificationsLevelTwo)
// Term Level
notificationsLevelThreeShortTerm := recommendationData.RecommendationTerms.Short_term.Notifications
if notificationsLevelThreeShortTerm != nil {
notificationSections = append(notificationSections, notificationsLevelThreeShortTerm)
// Engine Level
if recommendationData.RecommendationTerms.Short_term.RecommendationEngines != nil {
shortTermCostNotification := recommendationData.RecommendationTerms.Short_term.RecommendationEngines.Cost.Notifications
notificationSections = append(notificationSections, shortTermCostNotification)

dataExists := d[0].Kubernetes_objects[0].Containers[0].Recommendations.Data
if (key == "111000" && len(dataExists) == 0) {
log.Error("recommendation does not contain data for endtime:", formattedMaxEndTime)
// Setting the gauge to 1
// Expecting a single metric for a combination of notification_code, experiment_name
kruizeInvalidRecommendationDetail.WithLabelValues(key, experiment_name).Set(1)
return false
shortTermPerformanceNotification := recommendationData.RecommendationTerms.Short_term.RecommendationEngines.Performance.Notifications
notificationSections = append(notificationSections, shortTermPerformanceNotification)
}
}
notificationsLevelThreeMediumTerm := recommendationData.RecommendationTerms.Medium_term.Notifications
if notificationsLevelThreeMediumTerm != nil {
notificationSections = append(notificationSections, notificationsLevelThreeMediumTerm)
if recommendationData.RecommendationTerms.Medium_term.RecommendationEngines != nil {
mediumTermCostNotification := recommendationData.RecommendationTerms.Medium_term.RecommendationEngines.Cost.Notifications
notificationSections = append(notificationSections, mediumTermCostNotification)

// Timestamp level
notificationsLevelTwo := d[0].Kubernetes_objects[0].Containers[0].Recommendations.Data[formattedMaxEndTime].Notifications
// Term Level
notificationsLevelThreeShortTerm := d[0].Kubernetes_objects[0].Containers[0].Recommendations.Data[formattedMaxEndTime].RecommendationTerms.Short_term.Notifications
notificationsLevelThreeMediumTerm := d[0].Kubernetes_objects[0].Containers[0].Recommendations.Data[formattedMaxEndTime].RecommendationTerms.Medium_term.Notifications
notificationsLevelThreeLongTerm := d[0].Kubernetes_objects[0].Containers[0].Recommendations.Data[formattedMaxEndTime].RecommendationTerms.Long_term.Notifications
mediumTermPerformanceNotification := recommendationData.RecommendationTerms.Medium_term.RecommendationEngines.Performance.Notifications
notificationSections = append(notificationSections, mediumTermPerformanceNotification)
}
}
notificationsLevelThreeLongTerm := recommendationData.RecommendationTerms.Long_term.Notifications
if notificationsLevelThreeLongTerm != nil {
notificationSections = append(notificationSections, notificationsLevelThreeLongTerm)
if recommendationData.RecommendationTerms.Long_term.RecommendationEngines != nil {
longTermCostNotification := recommendationData.RecommendationTerms.Long_term.RecommendationEngines.Cost.Notifications
notificationSections = append(notificationSections, longTermCostNotification)

notificationSections := []map[string]kruizePayload.Notification{
notificationsLevelTwo,
notificationsLevelThreeShortTerm,
notificationsLevelThreeMediumTerm,
notificationsLevelThreeLongTerm,
longTermPerformanceNotification := recommendationData.RecommendationTerms.Long_term.RecommendationEngines.Performance.Notifications
notificationSections = append(notificationSections, longTermPerformanceNotification)
}
}

for _, notificationBody := range notificationSections {
for key := range notificationBody {
_, keyExists := notificationCodes[key]
if !keyExists {
kruizeInvalidRecommendationDetail.WithLabelValues(key, experiment_name).Set(1)
}
}

}
for _, notificationBody := range notificationSections {
for key := range notificationBody {
_, keyExists := errorNotificationCodes[key]
if keyExists {
log.Error("kruize recommendation error; experiment_name: ", experiment_name, ", notification_code: ", key)
kruizeRecommendationError.WithLabelValues(key).Inc()
}
return true

}
}
return false

}
22 changes: 11 additions & 11 deletions internal/utils/kruize/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,15 +12,15 @@ var (
},
[]string{"path"},
)
invalidRecommendation = promauto.NewCounter(prometheus.CounterOpts{
Name: "rosocp_invalid_recommendation_total",
Help: "The total number of invalid recommendation send by Kruize",
})
kruizeRecommendationError = promauto.NewCounterVec(
prometheus.CounterOpts{
Name: "rosocp_kruize_error_recommendations_count",
Help: "Count of ERROR type recommendations from Kruize",
},
[]string{"notification_code"},
)
)


var (
kruizeInvalidRecommendationDetail = promauto.NewGaugeVec(
prometheus.GaugeOpts{
Name: "rosocp_kruize_invalid_recommendation_detail",
Help: "List of INFO/ERROR type recommendations from Kruize",
},
[]string{"notification_code", "experiment_name"},
)
)

0 comments on commit 172b403

Please sign in to comment.