From bfe3514e5be75ae18a987673f37b7ca8025887ef Mon Sep 17 00:00:00 2001 From: Sagnik Dutta Date: Tue, 21 Nov 2023 18:25:26 +0530 Subject: [PATCH 1/5] RHIROS-1326 Kruize 0.0.20.1_rm Integration --- internal/api/utils.go | 116 ++- internal/services/report_processor.go | 3 +- internal/types/kruizePayload/common.go | 49 +- .../types/kruizePayload/createExperiment.go | 4 +- internal/utils/kruize/kruize_api.go | 78 +- internal/utils/kruize/metrics.go | 11 + kruize-clowdapp.yaml | 6 +- openapi.json | 957 ++++++++++++++---- scripts/docker-compose.yml | 6 +- 9 files changed, 956 insertions(+), 274 deletions(-) diff --git a/internal/api/utils.go b/internal/api/utils.go index d6381e03..9e4dae0b 100644 --- a/internal/api/utils.go +++ b/internal/api/utils.go @@ -223,11 +223,6 @@ func TransformComponentUnits(jsonData datatypes.JSON) map[string]interface{} { return nil } - durationBased, ok := data["duration_based"].(map[string]interface{}) - if !ok { - fmt.Printf("duration_based not found in JSON") - } - convertMemory := func(memory map[string]interface{}) error { amount, ok := memory["amount"].(float64) if ok { @@ -235,10 +230,10 @@ func TransformComponentUnits(jsonData datatypes.JSON) map[string]interface{} { if math.Abs(memoryInMiB) >= 1024 { memoryInGiB := memoryInMiB / 1024 memory["amount"] = math.Trunc(memoryInGiB*100) / 100 - memory["format"] = "GiB" + memory["format"] = "Gi" } else { memory["amount"] = math.Trunc(memoryInMiB*100) / 100 - memory["format"] = "MiB" + memory["format"] = "Mi" } } return nil @@ -265,56 +260,109 @@ func TransformComponentUnits(jsonData datatypes.JSON) map[string]interface{} { if math.Abs(cpuInCores) < 1 { cpuInMillicores := cpuInCores * 1000 cpu["amount"] = math.Round(cpuInMillicores) // millicore values are rounded & don't require decimal precision - cpu["format"] = "millicores" + cpu["format"] = "m" } else { cpu["amount"] = truncateToThreeDecimalPlaces(cpuInCores) - cpu["format"] = "cores" + cpu["format"] = nil } } return nil } + // Current + current_config, ok := data["current"].(map[string]interface{}) + if !ok { + log.Error("current not found in JSON") + } + + for _, section := range []string{"limits", "requests"} { + + sectionObject, ok := current_config[section].(map[string]interface{}) + if ok { + memory, ok := sectionObject["memory"].(map[string]interface{}) + if ok { + err := convertMemory(memory) + if err != nil { + fmt.Printf("error converting memory in %s: %v\n", sectionObject, err) + continue + } + } + cpu, ok := sectionObject["cpu"].(map[string]interface{}) + if ok { + err := convertCPU(cpu) + if err != nil { + fmt.Printf("error converting cpu in %s: %v\n", sectionObject, err) + continue + } + } + } + } + /* Recommendation data is available for three periods + under cost and performance keys For each of these actual values will be present in below mentioned dataBlocks > request and limits */ - for _, period := range []string{"long_term", "medium_term", "short_term"} { - intervalData, ok := durationBased[period].(map[string]interface{}) + // Recommendations + recommendation_terms, ok := data["recommendation_terms"].(map[string]interface{}) + if !ok { + log.Error("recommendation_terms not found in JSON") + } + + for _, period := range []string{"short_term", "medium_term", "long_term"} { + intervalData, ok := recommendation_terms[period].(map[string]interface{}) if !ok { continue } - for _, dataBlock := range []string{"current", "config", "variation"} { - recommendationSection, ok := intervalData[dataBlock].(map[string]interface{}) - if !ok { - continue - } + // Hack + // remove nil equivalent monitoring_start_time in API response + monitoring_start_time := intervalData["monitoring_start_time"] + if monitoring_start_time == "0001-01-01T00:00:00Z" { + delete(intervalData, "monitoring_start_time") + } + + if intervalData["recommendation_engines"] != nil { - for _, section := range []string{"limits", "requests"} { + for _, recommendationType := range []string{"cost", "performance"} { + engineData, ok := intervalData["recommendation_engines"].(map[string]interface{})[recommendationType].(map[string]interface{}) + if !ok { + continue + } - sectionObject, ok := recommendationSection[section].(map[string]interface{}) - if ok { - memory, ok := sectionObject["memory"].(map[string]interface{}) - if ok { - err := convertMemory(memory) - if err != nil { - fmt.Printf("error converting memory in %s: %v\n", period, err) - continue - } + for _, dataBlock := range []string{"config", "variation"} { + recommendationSection, ok := engineData[dataBlock].(map[string]interface{}) + if !ok { + continue } - cpu, ok := sectionObject["cpu"].(map[string]interface{}) - if ok { - err := convertCPU(cpu) - if err != nil { - fmt.Printf("error converting cpu in %s: %v\n", period, err) - continue + + for _, section := range []string{"limits", "requests"} { + + sectionObject, ok := recommendationSection[section].(map[string]interface{}) + if ok { + memory, ok := sectionObject["memory"].(map[string]interface{}) + if ok { + err := convertMemory(memory) + if err != nil { + fmt.Printf("error converting memory in %s: %v\n", period, err) + continue + } + } + cpu, ok := sectionObject["cpu"].(map[string]interface{}) + if ok { + err := convertCPU(cpu) + if err != nil { + fmt.Printf("error converting cpu in %s: %v\n", period, err) + continue + } + } } } + } - } - + } } } diff --git a/internal/services/report_processor.go b/internal/services/report_processor.go index 7bce07f2..0c54013c 100644 --- a/internal/services/report_processor.go +++ b/internal/services/report_processor.go @@ -100,7 +100,8 @@ func ProcessReport(msg *kafka.Message, _ *kafka.Consumer) { k8s_object_name, ) - container_names, err := kruize.Create_kruize_experiments(experiment_name, k8s_object) + cluster_identifier := kafkaMsg.Metadata.Org_id + ";" + kafkaMsg.Metadata.Cluster_uuid + container_names, err := kruize.Create_kruize_experiments(experiment_name, cluster_identifier, k8s_object) if err != nil { log.Error(err) continue diff --git a/internal/types/kruizePayload/common.go b/internal/types/kruizePayload/common.go index 48c35745..83714704 100644 --- a/internal/types/kruizePayload/common.go +++ b/internal/types/kruizePayload/common.go @@ -38,36 +38,47 @@ type aggregation_info struct { } type recommendation struct { - Data map[string]recommendationType `json:"data,omitempty"` - Notifications map[string]notification `json:"notifications,omitempty"` + Version string `json:"version,omitempty"` + Data map[string]RecommendationData `json:"data,omitempty"` + Notifications map[string]Notification `json:"notifications,omitempty"` } -type notification struct { + +type Notification struct { NotifyType string `json:"type,omitempty"` Message string `json:"message,omitempty"` Code int `json:"code,omitempty"` } -type recommendationType struct { - Duration_based termbased `json:"duration_based,omitempty"` +type RecommendationEngineObject struct { + PodsCount int `json:"pods_count,omitempty"` + ConfidenceLevel float64 `json:"confidence_level,omitempty"` + Config ConfigObject `json:"config,omitempty"` + Variation ConfigObject `json:"variation,omitempty"` + Notifications map[string]Notification `json:"notifications,omitempty"` +} + +type RecommendationData struct { + Notifications map[string]Notification `json:"notifications,omitempty"` + MonitoringEndTime time.Time `json:"monitoring_end_time,omitempty"` + Current ConfigObject `json:"current,omitempty"` + RecommendationTerms TermBased `json:"recommendation_terms,omitempty"` } -type termbased struct { - Short_term recommendationObject `json:"short_term,omitempty"` - Medium_term recommendationObject `json:"medium_term,omitempty"` - Long_term recommendationObject `json:"long_term,omitempty"` +type RecommendationTerm struct { + DurationInHours float64 `json:"duration_in_hours,omitempty"` + Notifications map[string]Notification `json:"notifications,omitempty"` + MonitoringStartTime time.Time `json:"monitoring_start_time,omitempty"` + RecommendationEngines *struct { + Cost RecommendationEngineObject `json:"cost,omitempty"` + Performance RecommendationEngineObject `json:"performance,omitempty"` + } `json:"recommendation_engines,omitempty"` } -type recommendationObject struct { - Monitoring_start_time time.Time `json:"monitoring_start_time,omitempty"` - Monitoring_end_time time.Time `json:"monitoring_end_time,omitempty"` - Duration_in_hours float64 `json:"duration_in_hours,omitempty"` - Pods_count int `json:"pods_count,omitempty"` - Confidence_level float64 `json:"confidence_level,omitempty"` - Current ConfigObject `json:"current,omitempty"` - Config ConfigObject `json:"config,omitempty"` - Variation ConfigObject `json:"variation,omitempty"` - Notifications map[string]notification `json:"notifications,omitempty"` +type TermBased struct { + Short_term RecommendationTerm `json:"short_term"` + Medium_term RecommendationTerm `json:"medium_term"` + Long_term RecommendationTerm `json:"long_term,omitempty"` } type ConfigObject struct { diff --git a/internal/types/kruizePayload/createExperiment.go b/internal/types/kruizePayload/createExperiment.go index 2153b36a..037eacef 100644 --- a/internal/types/kruizePayload/createExperiment.go +++ b/internal/types/kruizePayload/createExperiment.go @@ -7,6 +7,7 @@ import ( type createExperiment struct { Version string `json:"version"` Experiment_name string `json:"experiment_name"` + Cluster_name string `json:"cluster_name"` Performance_profile string `json:"performance_profile"` Mode string `json:"mode"` Target_cluster string `json:"target_cluster"` @@ -23,7 +24,7 @@ type recommendation_settings struct { Threshold string `json:"threshold"` } -func GetCreateExperimentPayload(experiment_name string, containers []map[string]string, data map[string]string) ([]byte, error) { +func GetCreateExperimentPayload(experiment_name string, cluster_identifier string, containers []map[string]string, data map[string]string) ([]byte, error) { container_array := []container{} for _, c := range containers { container_array = append(container_array, container{ @@ -35,6 +36,7 @@ func GetCreateExperimentPayload(experiment_name string, containers []map[string] { Version: "1.0", Experiment_name: experiment_name, + Cluster_name: cluster_identifier, Performance_profile: "resource-optimization-openshift", Mode: "monitor", Target_cluster: "remote", diff --git a/internal/utils/kruize/kruize_api.go b/internal/utils/kruize/kruize_api.go index d07eefbc..b263fee6 100644 --- a/internal/utils/kruize/kruize_api.go +++ b/internal/utils/kruize/kruize_api.go @@ -20,7 +20,7 @@ var log *logrus.Entry = logging.GetLogger() var cfg *config.Config = config.GetConfig() var experimentCreateAttempt bool = true -func Create_kruize_experiments(experiment_name string, k8s_object []map[string]interface{}) ([]string, error) { +func Create_kruize_experiments(experiment_name string, cluster_identifier string, k8s_object []map[string]interface{}) ([]string, error) { // k8s_object (can) contain multiple containers of same k8s object type. data := map[string]string{ "namespace": kruizePayload.AssertAndConvertToString(k8s_object[0]["namespace"]), @@ -39,7 +39,7 @@ func Create_kruize_experiments(experiment_name string, k8s_object []map[string]i }) } } - payload, err := kruizePayload.GetCreateExperimentPayload(experiment_name, containers, data) + payload, err := kruizePayload.GetCreateExperimentPayload(experiment_name, cluster_identifier, containers, data) if err != nil { return nil, fmt.Errorf("unable to create payload: %v", err) } @@ -69,7 +69,7 @@ func Create_kruize_experiments(experiment_name string, k8s_object []map[string]i log.Info("Tring to create resource_optimization_openshift performance profile") utils.Setup_kruize_performance_profile() experimentCreateAttempt = false // Attempting only once - container_names, err := Create_kruize_experiments(experiment_name, k8s_object) + container_names, err := Create_kruize_experiments(experiment_name, cluster_identifier, k8s_object) experimentCreateAttempt = true if err != nil { return nil, err @@ -178,14 +178,74 @@ func Update_recommendations(experiment_name string, interval_end_time time.Time) } -func Is_valid_recommendation(d []kruizePayload.ListRecommendations) bool { +func Is_valid_recommendation(d []kruizePayload.ListRecommendations, experiment_name string, maxEndTime time.Time) bool { if len(d) > 0 { - notifications := d[0].Kubernetes_objects[0].Containers[0].Recommendations.Notifications - // 112101 is notification code for "Duration Based Recommendations Available". - if _, ok := notifications["112101"]; ok { + + // Convert the time object to the expected format + formattedMaxEndTime := maxEndTime.UTC().Format("2006-01-02T15:04:05.000Z") + + // Allowed object of notifications + // https://github.com/kruize/autotune/blob/master/design/NotificationCodes.md#detailed-codes + notificationCodes := map[string]string{ + "111000": "INFO", + "120001": "INFO", + "111101": "INFO", + "111102": "INFO", + "111103": "INFO", + "112101": "INFO", + "112102": "INFO", + "221001": "ERROR", + "221002": "ERROR", + "221003": "ERROR", + "221004": "ERROR", + "223001": "ERROR", + "223002": "ERROR", + "223003": "ERROR", + "223004": "ERROR", + "224001": "ERROR", + "224002": "ERROR", + "224003": "ERROR", + "224004": "ERROR", + } + + // Recommendation level + notificationsTopLevel := d[0].Kubernetes_objects[0].Containers[0].Recommendations.Notifications + + // At the top level 111000 and 120001 are considered valid notifications + for key := range notificationsTopLevel { + _, notificationExists := notificationCodes[key] + dataExists := d[0].Kubernetes_objects[0].Containers[0].Recommendations.Data + if (key != "120001" && len(dataExists) == 0) || !notificationExists{ + // Setting the metric counter to 1 + // Expecting a single metric for a combination of notification_code, experiment_name + kruizeInvalidRecommendationDetail.WithLabelValues(key, experiment_name).Set(1) + return false + } + + // Timestamp level + notificationsLevelTwo := d[0].Kubernetes_objects[0].Containers[0].Recommendations.Data[formattedMaxEndTime].Notifications + // Term Level + notificationsLevelThreeShortTerm := d[0].Kubernetes_objects[0].Containers[0].Recommendations.Data[formattedMaxEndTime].RecommendationTerms.Short_term.Notifications + notificationsLevelThreeMediumTerm := d[0].Kubernetes_objects[0].Containers[0].Recommendations.Data[formattedMaxEndTime].RecommendationTerms.Medium_term.Notifications + notificationsLevelThreeLongTerm := d[0].Kubernetes_objects[0].Containers[0].Recommendations.Data[formattedMaxEndTime].RecommendationTerms.Long_term.Notifications + + notificationSections := []map[string]kruizePayload.Notification{ + notificationsLevelTwo, + notificationsLevelThreeShortTerm, + notificationsLevelThreeMediumTerm, + notificationsLevelThreeLongTerm, + } + + for _, notificationBody := range notificationSections { + for key := range notificationBody { + _, keyExists := notificationCodes[key] + if !keyExists { + kruizeInvalidRecommendationDetail.WithLabelValues(key, experiment_name).Set(1) + } + + } + } return true - } else { - return false } } return false diff --git a/internal/utils/kruize/metrics.go b/internal/utils/kruize/metrics.go index a6980148..eb3d17a3 100644 --- a/internal/utils/kruize/metrics.go +++ b/internal/utils/kruize/metrics.go @@ -13,3 +13,14 @@ var ( []string{"path"}, ) ) + + +var ( + kruizeInvalidRecommendationDetail = promauto.NewGaugeVec( + prometheus.GaugeOpts{ + Name: "rosocp_kruize_invalid_recommendation_detail", + Help: "List of INFO/ERROR type recommendations from Kruize", + }, + []string{"notification_code", "experiment_name"}, + ) +) \ No newline at end of file diff --git a/kruize-clowdapp.yaml b/kruize-clowdapp.yaml index 7c06cc56..ddf3fe3c 100644 --- a/kruize-clowdapp.yaml +++ b/kruize-clowdapp.yaml @@ -159,7 +159,11 @@ parameters: - description: Kruize image tag name: KRUIZE_IMAGE_TAG required: true +<<<<<<< HEAD value: "6c23945" +======= + value: "c84bd63" +>>>>>>> 752adf2 (RHIROS-1326 Kruize 0.0.20.1_rm Integration) - description: Kruize server port name: KRUIZE_PORT required: true @@ -192,4 +196,4 @@ parameters: - name: SSL_CERT_DIR value: '/etc/ssl/certs:/etc/pki/tls/certs:/system/etc/security/cacerts:/cdapp/certs' - name: KRUIZE_LOGGING_LEVEL - value: "info" \ No newline at end of file + value: "info" diff --git a/openapi.json b/openapi.json index 534b095f..a4dd4aee 100644 --- a/openapi.json +++ b/openapi.json @@ -33,7 +33,7 @@ { "name": "workload_type", "in": "query", - "description": "Workload type", + "description": "Options are daemonset, deployment, deploymentconfig, replicaset, replicationcontroller, statefulset", "required": false, "schema": { "type": "string" @@ -73,7 +73,8 @@ "required": false, "schema": { "type": "string" - } + }, + "example": "YYYY-MM-DD" }, { "name": "end_date", @@ -82,7 +83,8 @@ "required": false, "schema": { "type": "string" - } + }, + "example": "YYYY-MM-DD" }, { "name": "offset", @@ -178,7 +180,7 @@ "content": { "application/json; charset=UTF-8": { "schema": { - "$ref": "#/components/schemas/Recommendation" + "$ref": "#/components/schemas/Recommendations" } } } @@ -193,82 +195,630 @@ } } } - } - } - } - } - }, - "components": { - "schemas": { - "RecommendationList": { - "type": "object", - "properties": { - "data": { - "type": "array", - "items": { - "$ref": "#/components/schemas/Recommendation" - } + } + } + } + } + }, + "components": { + "schemas": { + "RecommendationList": { + "type": "object", + "properties": { + "data": { + "type": "array", + "items": { + "$ref": "#/components/schemas/Recommendations" + } + }, + "meta": { + "type": "object", + "properties": { + "count": { + "type": "integer", + "minimum": 1 + }, + "limit": { + "type": "integer", + "minimum": 1, + "maximum": 10 + }, + "offset": { + "type": "integer", + "minimum": 0 + } + } + }, + "links": { + "type": "object", + "properties": { + "first": { + "type": "string" + }, + "previous": { + "type": "string" + }, + "next": { + "type": "string" + }, + "last": { + "type": "string" + } + } + } + } + }, + "Recommendations": { + "type": "object", + "properties": { + "cluster_alias": { + "type": "string", + "example": "test_cost_ocp_ros_3c462dbe" + }, + "cluster_uuid": { + "type": "string", + "example": "d29c4b8b-f1a8-471c-ab95-b64e36bb51a9" + }, + "container": { + "type": "string", + "example": "pod-ros-A11" + }, + "id": { + "type": "string", + "example": "721eb376-13a9-43ab-868e-755aa1ce7f2a" + }, + "last_reported": { + "type": "string", + "format": "date-time", + "example": "2023-04-18T15:48:54.000Z" + }, + "project": { + "type": "string", + "example": "project-ros-A1" + }, + "recommendations": { + "type": "object", + "properties": { + "current": { + "type": "object", + "properties": { + "limits": { + "type": "object", + "properties": { + "cpu": { + "type": "object", + "properties": { + "amount": { + "type": "number", + "example": 2 + }, + "format": { + "type": "string", + "example": null + } + } + }, + "memory": { + "type": "object", + "properties": { + "amount": { + "type": "number", + "example": 30.715 + }, + "format": { + "type": "string", + "example": "Mi" + } + } + } + } + }, + "requests": { + "type": "object", + "properties": { + "cpu": { + "type": "object", + "properties": { + "amount": { + "type": "number", + "example": 2 + }, + "format": { + "type": "string", + "example": null + } + } + }, + "memory": { + "type": "object", + "properties": { + "amount": { + "type": "number", + "example": 20.391 + }, + "format": { + "type": "string", + "example": "Mi" + } + } + } + } + } + } + }, + "monitoring_end_time": { + "type": "string", + "format": "date-time" + }, + "notifications": { + "type": "object", + "properties": { + "type": { + "type": "string", + "example": "info" + }, + "code": { + "type": "string", + "example": 112101 + }, + "message": { + "type": "string", + "example": "Short Term Recommendations Available" + } + } + }, + "recommendation_terms": { + "type": "object", + "properties": { + "long_term": { + "$ref": "#/components/schemas/LongTermRecommendation" + }, + "medium_term": { + "$ref": "#/components/schemas/MediumTermRecommendation" + }, + "short_term": { + "$ref": "#/components/schemas/ShortTermRecommendation" + } + } + } + } + }, + "source_id": { + "type": "string", + "example": "0920ff0d-f1d6-4fe2-8bf3-18e6074bd27b" + }, + "workload": { + "type": "string", + "example": "pod-ros-A11" + }, + "workload_type": { + "type": "string", + "example": "deploymentconfig" + } + } + }, + "LongTermRecommendation": { + "type": "object", + "properties": { + "duration_in_hours": { + "type": "number", + "example": 24.7 + }, + "monitoring_start_time": { + "type": "string", + "format": "date-time", + "example": "0001-01-01T00:00:00Z" + }, + "notifications": { + "type": "object", + "properties": { + "120001": { + "type": "object", + "properties": { + "code": { + "type": "integer", + "example": 120001 + }, + "message": { + "type": "string", + "example": "There is not enough data available to generate a recommendation." + }, + "type": { + "type": "string", + "example": "info" + } + } + } + } + } + } + }, + "MediumTermRecommendation": { + "type": "object", + "properties": { + "duration_in_hours": { + "type": "number", + "example": 24.7 + }, + "monitoring_start_time": { + "type": "string", + "format": "date-time", + "example": "0001-01-01T00:00:00Z" + }, + "notifications": { + "type": "object", + "properties": { + "120001": { + "type": "object", + "properties": { + "code": { + "type": "integer", + "example": 120001 + }, + "message": { + "type": "string", + "example": "There is not enough data available to generate a recommendation." + }, + "type": { + "type": "string", + "example": "info" + } + } + } + } + } + } + }, + "ShortTermRecommendation": { + "type": "object", + "properties": { + "duration_in_hours": { + "type": "number", + "example": 24.7 + }, + "monitoring_start_time": { + "type": "string", + "format": "date-time", + "example": "0001-01-01T00:00:00Z" + }, + "notifications": { + "type": "object", + "properties": { + "112101": { + "type": "object", + "properties": { + "code": { + "type": "integer", + "example": 112101 + }, + "message": { + "type": "string", + "example": "Cost Recommendations Available" + }, + "type": { + "type": "string", + "example": "info" + } + } + }, + "112102": { + "type": "object", + "properties": { + "code": { + "type": "integer", + "example": 112102 + }, + "message": { + "type": "string", + "example": "Performance Recommendations Available" + }, + "type": { + "type": "string", + "example": "info" + } + } + } + } + }, + "recommendation_engines": { + "type": "object", + "properties": { + "cost": { + "$ref": "#/components/schemas/CostRecommendation" + }, + "performance": { + "$ref": "#/components/schemas/PerformanceRecommendation" + } + } + } + } + }, + "CostRecommendation": { + "type": "object", + "properties": { + "config": { + "type": "object", + "properties": { + "limits": { + "type": "object", + "properties": { + "cpu": { + "type": "object", + "properties": { + "amount": { + "type": "number", + "example": 5 + }, + "format": { + "type": "string", + "example": null + } + } + }, + "memory": { + "type": "object", + "properties": { + "amount": { + "type": "number", + "example": 6.7 + }, + "format": { + "type": "string", + "example": "Gi" + } + } + } + } + }, + "requests": { + "type": "object", + "properties": { + "cpu": { + "type": "object", + "properties": { + "amount": { + "type": "number", + "example": 3 + }, + "format": { + "type": "string", + "example": null + } + } + }, + "memory": { + "type": "object", + "properties": { + "amount": { + "type": "number", + "example": 700 + }, + "format": { + "type": "string", + "example": "Mi" + } + } + } + } + } + } + }, + "pods_count": { + "type": "integer", + "example": 1 + }, + "variation": { + "type": "object", + "properties": { + "limits": { + "type": "object", + "properties": { + "cpu": { + "type": "object", + "properties": { + "amount": { + "type": "number", + "example": 1.24 + }, + "format": { + "type": "string", + "example": null + } + } + }, + "memory": { + "type": "object", + "properties": { + "amount": { + "type": "number", + "example": 1.7 + }, + "format": { + "type": "string", + "example": "Gi" + } + } + } + } + }, + "requests": { + "type": "object", + "properties": { + "cpu": { + "type": "object", + "properties": { + "amount": { + "type": "number", + "example": 1.08 + }, + "format": { + "type": "string", + "example": null + } + } + }, + "memory": { + "type": "object", + "properties": { + "amount": { + "type": "number", + "example": 300 + }, + "format": { + "type": "string", + "example": "Mi" + } + } + } + } + } + } + } + } + }, + "PerformanceRecommendation": { + "type": "object", + "properties": { + "config": { + "type": "object", + "properties": { + "limits": { + "type": "object", + "properties": { + "cpu": { + "type": "object", + "properties": { + "amount": { + "type": "number", + "example": 622 + }, + "format": { + "type": "string", + "example": "millicores" + } + } + }, + "memory": { + "type": "object", + "properties": { + "amount": { + "type": "number", + "example": 500 + }, + "format": { + "type": "string", + "example": "Mi" + } + } + } + } + }, + "requests": { + "type": "object", + "properties": { + "cpu": { + "type": "object", + "properties": { + "amount": { + "type": "number", + "example": 3.92 + }, + "format": { + "type": "string", + "example": null + } + } + }, + "memory": { + "type": "object", + "properties": { + "amount": { + "type": "number", + "example": 6 + }, + "format": { + "type": "string", + "example": "Gi" + } + } + } + } + } + } }, - "meta": { - "type": "object", - "properties": { - "count": { - "type": "integer", - "minimum": 0 - }, - "limit": { - "type": "integer", - "minimum": 1, - "maximum": 100 - }, - "offset": { - "type": "integer", - "minimum": 0 - } - } + "pods_count": { + "type": "integer", + "example": 1 }, - "links": { + "variation": { "type": "object", "properties": { - "first": { - "type": "string" - }, - "previous": { - "type": "string" - }, - "next": { - "type": "string" + "limits": { + "type": "object", + "properties": { + "cpu": { + "type": "object", + "properties": { + "amount": { + "type": "number", + "example": -1.468 + }, + "format": { + "type": "string", + "example": null + } + } + }, + "memory": { + "type": "object", + "properties": { + "amount": { + "type": "number", + "example": 200 + }, + "format": { + "type": "string", + "example": "Mi" + } + } + } + } }, - "last": { - "type": "string" + "requests": { + "type": "object", + "properties": { + "cpu": { + "type": "object", + "properties": { + "amount": { + "type": "number", + "example": 2 + }, + "format": { + "type": "string", + "example": null + } + } + }, + "memory": { + "type": "object", + "properties": { + "amount": { + "type": "number", + "example": 1 + }, + "format": { + "type": "string", + "example": "Gi" + } + } + } + } } } } } }, - "Notifications": { - "type": "object", - "additionalProperties": { - "type": "object", - "properties": { - "type": { - "type": "string", - "enum": [ - "NOTICE", - "WARNING", - "CRITICAL" - ] - }, - "message": { - "type": "string", - "example": "There is not enough data available to generate a recommendation." - }, - "code": { - "type": "number" - } - } - } - }, "Recommendation": { "type": "object", "properties": { @@ -321,7 +871,7 @@ }, "format": { "type": "string", - "example": "cores" + "example": null } } }, @@ -334,7 +884,7 @@ }, "format": { "type": "string", - "example": "MiB" + "example": "Mi" } } } @@ -348,11 +898,11 @@ "properties": { "amount": { "type": "number", - "example": 1.91 + "example": 2 }, "format": { "type": "string", - "example": "cores" + "example": null } } }, @@ -361,11 +911,11 @@ "properties": { "amount": { "type": "number", - "example": 16.391 + "example": 20.391 }, "format": { "type": "string", - "example": "MiB" + "example": "Mi" } } } @@ -384,11 +934,11 @@ "properties": { "amount": { "type": "number", - "example": 2.11 + "example": 3.11 }, "format": { "type": "string", - "example": "cores" + "example": null } } }, @@ -401,7 +951,7 @@ }, "format": { "type": "string", - "example": "MiB" + "example": "Mi" } } } @@ -415,11 +965,11 @@ "properties": { "amount": { "type": "number", - "example": 1.92 + "example": 3 }, "format": { "type": "string", - "example": "cores" + "example": null } } }, @@ -432,7 +982,7 @@ }, "format": { "type": "string", - "example": "MiB" + "example": "Mi" } } } @@ -440,31 +990,6 @@ } } }, - "notifications": { - "$ref": "#/components/schemas/Notifications" - }, - "pods_count": { - "type": "integer", - "example": 1 - }, - "confidence_level": { - "type": "number", - "example": 0.5 - }, - "duration_in_hours": { - "type": "number", - "example": 361 - }, - "monitoring_end_time": { - "type": "string", - "format": "date-time", - "example": "2023-04-18T15:00:00.000Z" - }, - "monitoring_start_time": { - "type": "string", - "format": "date-time", - "example": "2023-04-03T15:00:00.000Z" - }, "variation": { "type": "object", "properties": { @@ -476,11 +1001,11 @@ "properties": { "amount": { "type": "number", - "example": 3 + "example": 1 }, "format": { "type": "string", - "example": "cores" + "example": null } } }, @@ -489,11 +1014,11 @@ "properties": { "amount": { "type": "number", - "example": 344 + "example": 0.959 }, "format": { "type": "string", - "example": "MiB" + "example": "Mi" } } } @@ -507,11 +1032,11 @@ "properties": { "amount": { "type": "number", - "example": 4 + "example": 1 }, "format": { "type": "string", - "example": "cores" + "example": null } } }, @@ -520,17 +1045,39 @@ "properties": { "amount": { "type": "number", - "example": 500 + "example": 3.995 }, "format": { "type": "string", - "example": "MiB" + "example": "Mi" } } } } } } + }, + "pods_count": { + "type": "integer", + "example": 1 + }, + "confidence_level": { + "type": "number", + "example": 0.5 + }, + "duration_in_hours": { + "type": "number", + "example": 361 + }, + "monitoring_end_time": { + "type": "string", + "format": "date-time", + "example": "2023-04-18T15:00:00.000Z" + }, + "monitoring_start_time": { + "type": "string", + "format": "date-time", + "example": "2023-04-03T15:00:00.000Z" } } }, @@ -552,7 +1099,7 @@ }, "format": { "type": "string", - "example": "cores" + "example": null } } }, @@ -561,11 +1108,11 @@ "properties": { "amount": { "type": "number", - "example": 30.715 + "example": 300 }, "format": { "type": "string", - "example": "MiB" + "example": "Mi" } } } @@ -583,7 +1130,7 @@ }, "format": { "type": "string", - "example": "cores" + "example": null } } }, @@ -592,11 +1139,11 @@ "properties": { "amount": { "type": "number", - "example": 16.391 + "example": 5 }, "format": { "type": "string", - "example": "MiB" + "example": "Gi" } } } @@ -628,11 +1175,11 @@ "properties": { "amount": { "type": "number", - "example": 31.674 + "example": 500 }, "format": { "type": "string", - "example": "GiB" + "example": "Mi" } } } @@ -646,11 +1193,11 @@ "properties": { "amount": { "type": "number", - "example": 1.92 + "example": 3.92 }, "format": { "type": "string", - "example": "cores" + "example": null } } }, @@ -659,11 +1206,11 @@ "properties": { "amount": { "type": "number", - "example": 16.396 + "example": 6 }, "format": { "type": "string", - "example": "MiB" + "example": "Gi" } } } @@ -671,31 +1218,6 @@ } } }, - "notifications": { - "$ref": "#/components/schemas/Notifications" - }, - "pods_count": { - "type": "integer", - "example": 1 - }, - "confidence_level": { - "type": "number", - "example": 0.5 - }, - "duration_in_hours": { - "type": "number", - "example": 169 - }, - "monitoring_end_time": { - "type": "string", - "format": "date-time", - "example": "2023-04-18T15:00:00.000Z" - }, - "monitoring_start_time": { - "type": "string", - "format": "date-time", - "example": "2023-04-11T15:00:00.000Z" - }, "variation": { "type": "object", "properties": { @@ -707,11 +1229,11 @@ "properties": { "amount": { "type": "number", - "example": 2 + "example": -1.468 }, "format": { "type": "string", - "example": "cores" + "example": null } } }, @@ -720,11 +1242,11 @@ "properties": { "amount": { "type": "number", - "example": 959.4 + "example": 200 }, "format": { "type": "string", - "example": "MiB" + "example": "Mi" } } } @@ -738,11 +1260,11 @@ "properties": { "amount": { "type": "number", - "example": 5 + "example": 2 }, "format": { "type": "string", - "example": "cores" + "example": null } } }, @@ -751,17 +1273,39 @@ "properties": { "amount": { "type": "number", - "example": 200.333 + "example": 1 }, "format": { "type": "string", - "example": "MiB" + "example": "Gi" } } } } } } + }, + "pods_count": { + "type": "integer", + "example": 1 + }, + "confidence_level": { + "type": "number", + "example": 0.5 + }, + "duration_in_hours": { + "type": "number", + "example": 169 + }, + "monitoring_end_time": { + "type": "string", + "format": "date-time", + "example": "2023-04-18T15:00:00.000Z" + }, + "monitoring_start_time": { + "type": "string", + "format": "date-time", + "example": "2023-04-11T15:00:00.000Z" } } }, @@ -779,11 +1323,11 @@ "properties": { "amount": { "type": "number", - "example": 2.09 + "example": 3.76 }, "format": { "type": "string", - "example": "cores" + "example": null } } }, @@ -792,11 +1336,11 @@ "properties": { "amount": { "type": "number", - "example": 30.715 + "example": 5 }, "format": { "type": "string", - "example": "MiB" + "example": "Gi" } } } @@ -814,7 +1358,7 @@ }, "format": { "type": "string", - "example": "cores" + "example": null } } }, @@ -823,11 +1367,11 @@ "properties": { "amount": { "type": "number", - "example": 16.391 + "example": 400 }, "format": { "type": "string", - "example": "MiB" + "example": "Mi" } } } @@ -846,11 +1390,11 @@ "properties": { "amount": { "type": "number", - "example": 2.11 + "example": 5 }, "format": { "type": "string", - "example": "cores" + "example": null } } }, @@ -859,11 +1403,11 @@ "properties": { "amount": { "type": "number", - "example": 31.674 + "example": 6.7 }, "format": { "type": "string", - "example": "MiB" + "example": "Gi" } } } @@ -877,11 +1421,11 @@ "properties": { "amount": { "type": "number", - "example": 1.92 + "example": 3 }, "format": { "type": "string", - "example": "cores" + "example": null } } }, @@ -890,11 +1434,11 @@ "properties": { "amount": { "type": "number", - "example": 16.396 + "example": 700 }, "format": { "type": "string", - "example": "MiB" + "example": "Mi" } } } @@ -902,31 +1446,6 @@ } } }, - "notifications": { - "$ref": "#/components/schemas/Notifications" - }, - "pods_count": { - "type": "integer", - "example": 1 - }, - "confidence_level": { - "type": "number", - "example": 0.5 - }, - "duration_in_hours": { - "type": "number", - "example": 25 - }, - "monitoring_end_time": { - "type": "string", - "format": "date-time", - "example": "2023-04-18T15:00:00.000Z" - }, - "monitoring_start_time": { - "type": "string", - "format": "date-time", - "example": "2023-04-17T15:00:00.000Z" - }, "variation": { "type": "object", "properties": { @@ -938,11 +1457,11 @@ "properties": { "amount": { "type": "number", - "example": 5 + "example": 1.24 }, "format": { "type": "string", - "example": "cores" + "example": null } } }, @@ -951,11 +1470,11 @@ "properties": { "amount": { "type": "number", - "example": 929.111 + "example": 1.7 }, "format": { "type": "string", - "example": "MiB" + "example": "Gi" } } } @@ -969,11 +1488,11 @@ "properties": { "amount": { "type": "number", - "example": 3 + "example": 1.08 }, "format": { "type": "string", - "example": "cores" + "example": null } } }, @@ -982,17 +1501,39 @@ "properties": { "amount": { "type": "number", - "example": 500 + "example": 300 }, "format": { "type": "string", - "example": "MiB" + "example": "Mi" } } } } } } + }, + "pods_count": { + "type": "integer", + "example": 1 + }, + "confidence_level": { + "type": "number", + "example": 0.5 + }, + "duration_in_hours": { + "type": "number", + "example": 25 + }, + "monitoring_end_time": { + "type": "string", + "format": "date-time", + "example": "2023-04-18T15:00:00.000Z" + }, + "monitoring_start_time": { + "type": "string", + "format": "date-time", + "example": "2023-04-17T15:00:00.000Z" } } } diff --git a/scripts/docker-compose.yml b/scripts/docker-compose.yml index 38157c36..fbcc89f6 100644 --- a/scripts/docker-compose.yml +++ b/scripts/docker-compose.yml @@ -54,7 +54,11 @@ services: depends_on: - kafka kruize-autotune: +<<<<<<< HEAD image: quay.io/cloudservices/autotune:6c23945 +======= + image: quay.io/cloudservices/autotune:c84bd63 +>>>>>>> 752adf2 (RHIROS-1326 Kruize 0.0.20.1_rm Integration) volumes: - ./cdappconfig.json:/tmp/cdappconfig.json:Z ports: @@ -145,4 +149,4 @@ services: depends_on: - db-sources - kafka - - redis \ No newline at end of file + - redis From d2c91ffd1c4b1b54d771324518c7da4c33caf62d Mon Sep 17 00:00:00 2001 From: Sagnik Dutta Date: Wed, 22 Nov 2023 14:21:25 +0530 Subject: [PATCH 2/5] Updates to Is_valid_recommendation --- internal/utils/kruize/kruize_api.go | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/internal/utils/kruize/kruize_api.go b/internal/utils/kruize/kruize_api.go index b263fee6..910fdac5 100644 --- a/internal/utils/kruize/kruize_api.go +++ b/internal/utils/kruize/kruize_api.go @@ -181,10 +181,12 @@ func Update_recommendations(experiment_name string, interval_end_time time.Time) func Is_valid_recommendation(d []kruizePayload.ListRecommendations, experiment_name string, maxEndTime time.Time) bool { if len(d) > 0 { + // Convert the time object to the expected format formattedMaxEndTime := maxEndTime.UTC().Format("2006-01-02T15:04:05.000Z") + _, timeStampisValid := d[0].Kubernetes_objects[0].Containers[0].Recommendations.Data[formattedMaxEndTime] - // Allowed object of notifications + // Allowed object of notifications; "111000" means valid and actionable recommendation // https://github.com/kruize/autotune/blob/master/design/NotificationCodes.md#detailed-codes notificationCodes := map[string]string{ "111000": "INFO", @@ -211,11 +213,16 @@ func Is_valid_recommendation(d []kruizePayload.ListRecommendations, experiment_n // Recommendation level notificationsTopLevel := d[0].Kubernetes_objects[0].Containers[0].Recommendations.Notifications - // At the top level 111000 and 120001 are considered valid notifications for key := range notificationsTopLevel { - _, notificationExists := notificationCodes[key] + + if (key == "111000" && !timeStampisValid) { + log.Error("recommendation endtime does not match with requested endtime:", formattedMaxEndTime) + return false + } + dataExists := d[0].Kubernetes_objects[0].Containers[0].Recommendations.Data - if (key != "120001" && len(dataExists) == 0) || !notificationExists{ + if (key == "111000" && len(dataExists) == 0) { + log.Error("recommendation does not contain data for endtime:", formattedMaxEndTime) // Setting the metric counter to 1 // Expecting a single metric for a combination of notification_code, experiment_name kruizeInvalidRecommendationDetail.WithLabelValues(key, experiment_name).Set(1) From bb0e49efeb540848b6789707a1dfadf0b052fec6 Mon Sep 17 00:00:00 2001 From: Sagnik Dutta Date: Mon, 4 Dec 2023 23:30:24 +0530 Subject: [PATCH 3/5] Suggested changes: 1. Check validity per container's recommendation 2. Kruize ERROR logging, all levels --- internal/services/metrics.go | 4 - internal/types/kruizePayload/common.go | 33 +++--- internal/utils/kruize/kruize_api.go | 136 ++++++++++++++----------- internal/utils/kruize/metrics.go | 22 ++-- kruize-clowdapp.yaml | 4 - scripts/docker-compose.yml | 4 - 6 files changed, 102 insertions(+), 101 deletions(-) diff --git a/internal/services/metrics.go b/internal/services/metrics.go index 435356d9..ad4c7c6e 100644 --- a/internal/services/metrics.go +++ b/internal/services/metrics.go @@ -6,10 +6,6 @@ import ( ) var ( - invalidRecommendation = promauto.NewCounter(prometheus.CounterOpts{ - Name: "rosocp_invalid_recommendation_total", - Help: "The total number of invalid recommendation send by Kruize", - }) invalidCSV = promauto.NewCounter(prometheus.CounterOpts{ Name: "rosocp_invalid_csv_total", Help: "The total number of invalid csv send by cost-mgmt", diff --git a/internal/types/kruizePayload/common.go b/internal/types/kruizePayload/common.go index 83714704..ada5af11 100644 --- a/internal/types/kruizePayload/common.go +++ b/internal/types/kruizePayload/common.go @@ -17,7 +17,7 @@ type container struct { Container_image_name string `json:"container_image_name,omitempty"` Container_name string `json:"container_name,omitempty"` Metrics []metric `json:"metrics,omitempty"` - Recommendations recommendation `json:"recommendations,omitempty"` + Recommendations Recommendation `json:"recommendations,omitempty"` } type metric struct { @@ -37,13 +37,12 @@ type aggregation_info struct { Format string `json:"format,omitempty"` } -type recommendation struct { - Version string `json:"version,omitempty"` +type Recommendation struct { + Version string `json:"version,omitempty"` Data map[string]RecommendationData `json:"data,omitempty"` Notifications map[string]Notification `json:"notifications,omitempty"` } - type Notification struct { NotifyType string `json:"type,omitempty"` Message string `json:"message,omitempty"` @@ -51,31 +50,31 @@ type Notification struct { } type RecommendationEngineObject struct { - PodsCount int `json:"pods_count,omitempty"` - ConfidenceLevel float64 `json:"confidence_level,omitempty"` - Config ConfigObject `json:"config,omitempty"` - Variation ConfigObject `json:"variation,omitempty"` - Notifications map[string]Notification `json:"notifications,omitempty"` + PodsCount int `json:"pods_count,omitempty"` + ConfidenceLevel float64 `json:"confidence_level,omitempty"` + Config ConfigObject `json:"config,omitempty"` + Variation ConfigObject `json:"variation,omitempty"` + Notifications map[string]Notification `json:"notifications,omitempty"` } type RecommendationData struct { Notifications map[string]Notification `json:"notifications,omitempty"` - MonitoringEndTime time.Time `json:"monitoring_end_time,omitempty"` - Current ConfigObject `json:"current,omitempty"` - RecommendationTerms TermBased `json:"recommendation_terms,omitempty"` + MonitoringEndTime time.Time `json:"monitoring_end_time,omitempty"` + Current ConfigObject `json:"current,omitempty"` + RecommendationTerms Term `json:"recommendation_terms,omitempty"` } type RecommendationTerm struct { - DurationInHours float64 `json:"duration_in_hours,omitempty"` - Notifications map[string]Notification `json:"notifications,omitempty"` - MonitoringStartTime time.Time `json:"monitoring_start_time,omitempty"` - RecommendationEngines *struct { + DurationInHours float64 `json:"duration_in_hours,omitempty"` + Notifications map[string]Notification `json:"notifications,omitempty"` + MonitoringStartTime time.Time `json:"monitoring_start_time,omitempty"` + RecommendationEngines *struct { Cost RecommendationEngineObject `json:"cost,omitempty"` Performance RecommendationEngineObject `json:"performance,omitempty"` } `json:"recommendation_engines,omitempty"` } -type TermBased struct { +type Term struct { Short_term RecommendationTerm `json:"short_term"` Medium_term RecommendationTerm `json:"medium_term"` Long_term RecommendationTerm `json:"long_term,omitempty"` diff --git a/internal/utils/kruize/kruize_api.go b/internal/utils/kruize/kruize_api.go index 910fdac5..dd52be87 100644 --- a/internal/utils/kruize/kruize_api.go +++ b/internal/utils/kruize/kruize_api.go @@ -178,82 +178,96 @@ func Update_recommendations(experiment_name string, interval_end_time time.Time) } -func Is_valid_recommendation(d []kruizePayload.ListRecommendations, experiment_name string, maxEndTime time.Time) bool { - if len(d) > 0 { - +func Is_valid_recommendation(recommendation kruizePayload.Recommendation, experiment_name string, maxEndTime time.Time) bool { + validRecommendationCode := "111000" + _, recommendationIsValid := recommendation.Notifications[validRecommendationCode] + if recommendationIsValid { // Convert the time object to the expected format formattedMaxEndTime := maxEndTime.UTC().Format("2006-01-02T15:04:05.000Z") - _, timeStampisValid := d[0].Kubernetes_objects[0].Containers[0].Recommendations.Data[formattedMaxEndTime] - - // Allowed object of notifications; "111000" means valid and actionable recommendation - // https://github.com/kruize/autotune/blob/master/design/NotificationCodes.md#detailed-codes - notificationCodes := map[string]string{ - "111000": "INFO", - "120001": "INFO", - "111101": "INFO", - "111102": "INFO", - "111103": "INFO", - "112101": "INFO", - "112102": "INFO", - "221001": "ERROR", - "221002": "ERROR", - "221003": "ERROR", - "221004": "ERROR", - "223001": "ERROR", - "223002": "ERROR", - "223003": "ERROR", - "223004": "ERROR", - "224001": "ERROR", - "224002": "ERROR", - "224003": "ERROR", - "224004": "ERROR", + recommendationData, timeStampisValid := recommendation.Data[formattedMaxEndTime] + if !timeStampisValid { + log.Error("recommendation not found for endtime: ", formattedMaxEndTime) + invalidRecommendation.Inc() + return false } + LogKruizeErrors(recommendationData, formattedMaxEndTime, experiment_name) + return true + } else { + return false + } +} - // Recommendation level - notificationsTopLevel := d[0].Kubernetes_objects[0].Containers[0].Recommendations.Notifications +func LogKruizeErrors(recommendationData kruizePayload.RecommendationData, formattedMaxEndTime string, experiment_name string) { - for key := range notificationsTopLevel { + // https://github.com/kruize/autotune/blob/master/design/NotificationCodes.md#detailed-codes + errorNotificationCodes := map[string]string{ + "221001": "ERROR", + "221002": "ERROR", + "221003": "ERROR", + "221004": "ERROR", + "223001": "ERROR", + "223002": "ERROR", + "223003": "ERROR", + "223004": "ERROR", + "224001": "ERROR", + "224002": "ERROR", + "224003": "ERROR", + "224004": "ERROR", + } + notificationSections := []map[string]kruizePayload.Notification{} - if (key == "111000" && !timeStampisValid) { - log.Error("recommendation endtime does not match with requested endtime:", formattedMaxEndTime) - return false - } + // Timestamp level + notificationsLevelTwo := recommendationData.Notifications + if notificationsLevelTwo != nil { + notificationSections = append(notificationSections, notificationsLevelTwo) + // Term Level + notificationsLevelThreeShortTerm := recommendationData.RecommendationTerms.Short_term.Notifications + if notificationsLevelThreeShortTerm != nil { + notificationSections = append(notificationSections, notificationsLevelThreeShortTerm) + // Engine Level + if recommendationData.RecommendationTerms.Short_term.RecommendationEngines != nil { + shortTermCostNotification := recommendationData.RecommendationTerms.Short_term.RecommendationEngines.Cost.Notifications + notificationSections = append(notificationSections, shortTermCostNotification) - dataExists := d[0].Kubernetes_objects[0].Containers[0].Recommendations.Data - if (key == "111000" && len(dataExists) == 0) { - log.Error("recommendation does not contain data for endtime:", formattedMaxEndTime) - // Setting the metric counter to 1 - // Expecting a single metric for a combination of notification_code, experiment_name - kruizeInvalidRecommendationDetail.WithLabelValues(key, experiment_name).Set(1) - return false + shortTermPerformanceNotification := recommendationData.RecommendationTerms.Short_term.RecommendationEngines.Performance.Notifications + notificationSections = append(notificationSections, shortTermPerformanceNotification) } + } + notificationsLevelThreeMediumTerm := recommendationData.RecommendationTerms.Medium_term.Notifications + if notificationsLevelThreeMediumTerm != nil { + notificationSections = append(notificationSections, notificationsLevelThreeMediumTerm) + if recommendationData.RecommendationTerms.Medium_term.RecommendationEngines != nil { + mediumTermCostNotification := recommendationData.RecommendationTerms.Medium_term.RecommendationEngines.Cost.Notifications + notificationSections = append(notificationSections, mediumTermCostNotification) - // Timestamp level - notificationsLevelTwo := d[0].Kubernetes_objects[0].Containers[0].Recommendations.Data[formattedMaxEndTime].Notifications - // Term Level - notificationsLevelThreeShortTerm := d[0].Kubernetes_objects[0].Containers[0].Recommendations.Data[formattedMaxEndTime].RecommendationTerms.Short_term.Notifications - notificationsLevelThreeMediumTerm := d[0].Kubernetes_objects[0].Containers[0].Recommendations.Data[formattedMaxEndTime].RecommendationTerms.Medium_term.Notifications - notificationsLevelThreeLongTerm := d[0].Kubernetes_objects[0].Containers[0].Recommendations.Data[formattedMaxEndTime].RecommendationTerms.Long_term.Notifications + mediumTermPerformanceNotification := recommendationData.RecommendationTerms.Medium_term.RecommendationEngines.Performance.Notifications + notificationSections = append(notificationSections, mediumTermPerformanceNotification) + } + } + notificationsLevelThreeLongTerm := recommendationData.RecommendationTerms.Long_term.Notifications + if notificationsLevelThreeLongTerm != nil { + notificationSections = append(notificationSections, notificationsLevelThreeLongTerm) + if recommendationData.RecommendationTerms.Long_term.RecommendationEngines != nil { + longTermCostNotification := recommendationData.RecommendationTerms.Long_term.RecommendationEngines.Cost.Notifications + notificationSections = append(notificationSections, longTermCostNotification) - notificationSections := []map[string]kruizePayload.Notification{ - notificationsLevelTwo, - notificationsLevelThreeShortTerm, - notificationsLevelThreeMediumTerm, - notificationsLevelThreeLongTerm, + longTermPerformanceNotification := recommendationData.RecommendationTerms.Long_term.RecommendationEngines.Performance.Notifications + notificationSections = append(notificationSections, longTermPerformanceNotification) } + } - for _, notificationBody := range notificationSections { - for key := range notificationBody { - _, keyExists := notificationCodes[key] - if !keyExists { - kruizeInvalidRecommendationDetail.WithLabelValues(key, experiment_name).Set(1) - } + } - } + for _, notificationBody := range notificationSections { + for key := range notificationBody { + _, keyExists := errorNotificationCodes[key] + if keyExists { + log.Error("kruize recommendation error; experiment_name: ", experiment_name, ", notification_code: ", key) + kruizeRecommendationError.WithLabelValues(key).Inc() } - return true + } } - return false + } diff --git a/internal/utils/kruize/metrics.go b/internal/utils/kruize/metrics.go index eb3d17a3..736d5489 100644 --- a/internal/utils/kruize/metrics.go +++ b/internal/utils/kruize/metrics.go @@ -12,15 +12,15 @@ var ( }, []string{"path"}, ) + invalidRecommendation = promauto.NewCounter(prometheus.CounterOpts{ + Name: "rosocp_invalid_recommendation_total", + Help: "The total number of invalid recommendation send by Kruize", + }) + kruizeRecommendationError = promauto.NewCounterVec( + prometheus.CounterOpts{ + Name: "rosocp_kruize_error_recommendations_count", + Help: "Count of ERROR type recommendations from Kruize", + }, + []string{"notification_code"}, + ) ) - - -var ( - kruizeInvalidRecommendationDetail = promauto.NewGaugeVec( - prometheus.GaugeOpts{ - Name: "rosocp_kruize_invalid_recommendation_detail", - Help: "List of INFO/ERROR type recommendations from Kruize", - }, - []string{"notification_code", "experiment_name"}, - ) -) \ No newline at end of file diff --git a/kruize-clowdapp.yaml b/kruize-clowdapp.yaml index ddf3fe3c..84a272aa 100644 --- a/kruize-clowdapp.yaml +++ b/kruize-clowdapp.yaml @@ -159,11 +159,7 @@ parameters: - description: Kruize image tag name: KRUIZE_IMAGE_TAG required: true -<<<<<<< HEAD - value: "6c23945" -======= value: "c84bd63" ->>>>>>> 752adf2 (RHIROS-1326 Kruize 0.0.20.1_rm Integration) - description: Kruize server port name: KRUIZE_PORT required: true diff --git a/scripts/docker-compose.yml b/scripts/docker-compose.yml index fbcc89f6..96826ab2 100644 --- a/scripts/docker-compose.yml +++ b/scripts/docker-compose.yml @@ -54,11 +54,7 @@ services: depends_on: - kafka kruize-autotune: -<<<<<<< HEAD - image: quay.io/cloudservices/autotune:6c23945 -======= image: quay.io/cloudservices/autotune:c84bd63 ->>>>>>> 752adf2 (RHIROS-1326 Kruize 0.0.20.1_rm Integration) volumes: - ./cdappconfig.json:/tmp/cdappconfig.json:Z ports: From 7532cc770515d072d6662af9b3852b29442a144b Mon Sep 17 00:00:00 2001 From: Sagnik Dutta Date: Tue, 19 Dec 2023 18:07:35 +0530 Subject: [PATCH 4/5] Remove Kruize Error logging --- internal/api/utils.go | 20 +++++--- internal/utils/kruize/kruize_api.go | 77 +---------------------------- internal/utils/kruize/metrics.go | 7 --- 3 files changed, 13 insertions(+), 91 deletions(-) diff --git a/internal/api/utils.go b/internal/api/utils.go index 9e4dae0b..3aa2c55d 100644 --- a/internal/api/utils.go +++ b/internal/api/utils.go @@ -1,14 +1,14 @@ package api import ( + "encoding/json" "fmt" + "math" "net/http" "net/url" "strconv" "strings" "time" - "encoding/json" - "math" "gorm.io/datatypes" @@ -269,7 +269,7 @@ func TransformComponentUnits(jsonData datatypes.JSON) map[string]interface{} { return nil } - // Current + // Current section of recommendation current_config, ok := data["current"].(map[string]interface{}) if !ok { log.Error("current not found in JSON") @@ -300,15 +300,16 @@ func TransformComponentUnits(jsonData datatypes.JSON) map[string]interface{} { /* Recommendation data is available for three periods - under cost and performance keys + under cost and performance keys(engines) For each of these actual values will be present in below mentioned dataBlocks > request and limits */ - // Recommendations + // Recommendation section recommendation_terms, ok := data["recommendation_terms"].(map[string]interface{}) if !ok { - log.Error("recommendation_terms not found in JSON") + log.Error("recommendation data not found in JSON") + return data } for _, period := range []string{"short_term", "medium_term", "long_term"} { @@ -317,13 +318,16 @@ func TransformComponentUnits(jsonData datatypes.JSON) map[string]interface{} { continue } - // Hack + /* Hack + // monitoring_start_time is currently not nullable on DB + // Hence cannot be set to null while saving response from Kruize + */ // remove nil equivalent monitoring_start_time in API response monitoring_start_time := intervalData["monitoring_start_time"] if monitoring_start_time == "0001-01-01T00:00:00Z" { delete(intervalData, "monitoring_start_time") } - + if intervalData["recommendation_engines"] != nil { for _, recommendationType := range []string{"cost", "performance"} { diff --git a/internal/utils/kruize/kruize_api.go b/internal/utils/kruize/kruize_api.go index dd52be87..0e069fef 100644 --- a/internal/utils/kruize/kruize_api.go +++ b/internal/utils/kruize/kruize_api.go @@ -185,89 +185,14 @@ func Is_valid_recommendation(recommendation kruizePayload.Recommendation, experi if recommendationIsValid { // Convert the time object to the expected format formattedMaxEndTime := maxEndTime.UTC().Format("2006-01-02T15:04:05.000Z") - recommendationData, timeStampisValid := recommendation.Data[formattedMaxEndTime] + _, timeStampisValid := recommendation.Data[formattedMaxEndTime] if !timeStampisValid { log.Error("recommendation not found for endtime: ", formattedMaxEndTime) invalidRecommendation.Inc() return false } - LogKruizeErrors(recommendationData, formattedMaxEndTime, experiment_name) return true } else { return false } } - -func LogKruizeErrors(recommendationData kruizePayload.RecommendationData, formattedMaxEndTime string, experiment_name string) { - - // https://github.com/kruize/autotune/blob/master/design/NotificationCodes.md#detailed-codes - errorNotificationCodes := map[string]string{ - "221001": "ERROR", - "221002": "ERROR", - "221003": "ERROR", - "221004": "ERROR", - "223001": "ERROR", - "223002": "ERROR", - "223003": "ERROR", - "223004": "ERROR", - "224001": "ERROR", - "224002": "ERROR", - "224003": "ERROR", - "224004": "ERROR", - } - notificationSections := []map[string]kruizePayload.Notification{} - - // Timestamp level - notificationsLevelTwo := recommendationData.Notifications - if notificationsLevelTwo != nil { - notificationSections = append(notificationSections, notificationsLevelTwo) - // Term Level - notificationsLevelThreeShortTerm := recommendationData.RecommendationTerms.Short_term.Notifications - if notificationsLevelThreeShortTerm != nil { - notificationSections = append(notificationSections, notificationsLevelThreeShortTerm) - // Engine Level - if recommendationData.RecommendationTerms.Short_term.RecommendationEngines != nil { - shortTermCostNotification := recommendationData.RecommendationTerms.Short_term.RecommendationEngines.Cost.Notifications - notificationSections = append(notificationSections, shortTermCostNotification) - - shortTermPerformanceNotification := recommendationData.RecommendationTerms.Short_term.RecommendationEngines.Performance.Notifications - notificationSections = append(notificationSections, shortTermPerformanceNotification) - } - } - notificationsLevelThreeMediumTerm := recommendationData.RecommendationTerms.Medium_term.Notifications - if notificationsLevelThreeMediumTerm != nil { - notificationSections = append(notificationSections, notificationsLevelThreeMediumTerm) - if recommendationData.RecommendationTerms.Medium_term.RecommendationEngines != nil { - mediumTermCostNotification := recommendationData.RecommendationTerms.Medium_term.RecommendationEngines.Cost.Notifications - notificationSections = append(notificationSections, mediumTermCostNotification) - - mediumTermPerformanceNotification := recommendationData.RecommendationTerms.Medium_term.RecommendationEngines.Performance.Notifications - notificationSections = append(notificationSections, mediumTermPerformanceNotification) - } - } - notificationsLevelThreeLongTerm := recommendationData.RecommendationTerms.Long_term.Notifications - if notificationsLevelThreeLongTerm != nil { - notificationSections = append(notificationSections, notificationsLevelThreeLongTerm) - if recommendationData.RecommendationTerms.Long_term.RecommendationEngines != nil { - longTermCostNotification := recommendationData.RecommendationTerms.Long_term.RecommendationEngines.Cost.Notifications - notificationSections = append(notificationSections, longTermCostNotification) - - longTermPerformanceNotification := recommendationData.RecommendationTerms.Long_term.RecommendationEngines.Performance.Notifications - notificationSections = append(notificationSections, longTermPerformanceNotification) - } - } - - } - - for _, notificationBody := range notificationSections { - for key := range notificationBody { - _, keyExists := errorNotificationCodes[key] - if keyExists { - log.Error("kruize recommendation error; experiment_name: ", experiment_name, ", notification_code: ", key) - kruizeRecommendationError.WithLabelValues(key).Inc() - } - - } - } - -} diff --git a/internal/utils/kruize/metrics.go b/internal/utils/kruize/metrics.go index 736d5489..50aadec0 100644 --- a/internal/utils/kruize/metrics.go +++ b/internal/utils/kruize/metrics.go @@ -16,11 +16,4 @@ var ( Name: "rosocp_invalid_recommendation_total", Help: "The total number of invalid recommendation send by Kruize", }) - kruizeRecommendationError = promauto.NewCounterVec( - prometheus.CounterOpts{ - Name: "rosocp_kruize_error_recommendations_count", - Help: "Count of ERROR type recommendations from Kruize", - }, - []string{"notification_code"}, - ) ) From 1bee2e5afd7a864614fd986ccfad0dd4434c63c1 Mon Sep 17 00:00:00 2001 From: Sagnik Dutta Date: Tue, 23 Jan 2024 18:46:58 +0530 Subject: [PATCH 5/5] validate recommendation/container on poller --- .gitignore | 1 + internal/services/recommendation_poller.go | 34 ++++++++++++---------- scripts/samples/ros-ocp-usage-24Hrs.csv | 3 +- 3 files changed, 20 insertions(+), 18 deletions(-) diff --git a/.gitignore b/.gitignore index b53418c7..13999523 100644 --- a/.gitignore +++ b/.gitignore @@ -7,3 +7,4 @@ coverage.txt output.csv .idea/ .DS_Store +.go/ \ No newline at end of file diff --git a/internal/services/recommendation_poller.go b/internal/services/recommendation_poller.go index 7c595da7..1748afae 100644 --- a/internal/services/recommendation_poller.go +++ b/internal/services/recommendation_poller.go @@ -73,13 +73,12 @@ func requestAndSaveRecommendation(kafkaMsg types.RecommendationKafkaMsg, recomme return poll_cycle_complete } - // TODO: Is_valid_recommendation to be called on every container record v20.1 upgrade on wards - if kruize.Is_valid_recommendation(recommendation) { - containers := recommendation[0].Kubernetes_objects[0].Containers - recommendationSetList := []model.RecommendationSet{} - histRecommendationSetList := []model.HistoricalRecommendationSet{} + containers := recommendation[0].Kubernetes_objects[0].Containers + recommendationSetList := []model.RecommendationSet{} + histRecommendationSetList := []model.HistoricalRecommendationSet{} - for _, container := range containers { + for _, container := range containers { + if kruize.Is_valid_recommendation(container.Recommendations, experiment_name, maxEndTimeFromReport) { for _, v := range container.Recommendations.Data { marshalData, err := json.Marshal(v) if err != nil { @@ -89,8 +88,8 @@ func requestAndSaveRecommendation(kafkaMsg types.RecommendationKafkaMsg, recomme recommendationSet := model.RecommendationSet{ WorkloadID: kafkaMsg.Metadata.Workload_id, ContainerName: container.Container_name, - MonitoringStartTime: v.Duration_based.Short_term.Monitoring_start_time, - MonitoringEndTime: v.Duration_based.Short_term.Monitoring_end_time, + MonitoringStartTime: v.RecommendationTerms.Short_term.MonitoringStartTime, + MonitoringEndTime: v.MonitoringEndTime, Recommendations: marshalData, } recommendationSetList = append(recommendationSetList, recommendationSet) @@ -100,20 +99,24 @@ func requestAndSaveRecommendation(kafkaMsg types.RecommendationKafkaMsg, recomme OrgId: kafkaMsg.Metadata.Org_id, WorkloadID: kafkaMsg.Metadata.Workload_id, ContainerName: container.Container_name, - MonitoringStartTime: v.Duration_based.Short_term.Monitoring_start_time, - MonitoringEndTime: v.Duration_based.Short_term.Monitoring_end_time, + MonitoringStartTime: v.RecommendationTerms.Short_term.MonitoringStartTime, + MonitoringEndTime: v.MonitoringEndTime, Recommendations: marshalData, } histRecommendationSetList = append(histRecommendationSetList, historicalRecommendationSet) } + } else { + poll_cycle_complete = true + continue } + } + if len(recommendationSetList) > 0 { txError := transactionForRecommendation(recommendationSetList, histRecommendationSetList, experiment_name, recommendationType) if txError == nil { poll_cycle_complete = true + } else { + poll_cycle_complete = false } - } else { - poll_cycle_complete = true - invalidRecommendation.Inc() } return poll_cycle_complete } @@ -157,10 +160,9 @@ func PollForRecommendations(msg *kafka.Message, consumer_object *kafka.Consumer) commitKafkaMsg(msg, consumer_object) } // To consume upcoming Kafka msg, explicitly - // Especially in case of un-committed msgs - return + return case true: - // MonitoringEndTime.UTC() defaults to 0001-01-01 00:00:00 +0000 UTC if not found + // MonitoringEndTime.UTC() defaults to 0001-01-01 00:00:00 +0000 UTC if not set if !recommendation_stored_in_db.MonitoringEndTime.UTC().IsZero() { duration := maxEndTimeFromReport.Sub(recommendation_stored_in_db.MonitoringEndTime.UTC()) if int(duration.Hours()) >= cfg.RecommendationPollIntervalHours { diff --git a/scripts/samples/ros-ocp-usage-24Hrs.csv b/scripts/samples/ros-ocp-usage-24Hrs.csv index a1de8cd2..0ef0104a 100644 --- a/scripts/samples/ros-ocp-usage-24Hrs.csv +++ b/scripts/samples/ros-ocp-usage-24Hrs.csv @@ -2,5 +2,4 @@ report_period_start,report_period_end,interval_start,interval_end,container_name 2023-02-01 00:00:00 +0000 UTC,2023-03-01 00:00:00 +0000 UTC,2023-06-03 23:45:01 +0000 UTC,2023-06-04 00:00:00 +0000 UTC,Yuptoo-service,Yuptoo-app-standalone-1,Yuptoo-app,ReplicaSet,,deployment,Yuptoo-prod,quay.io/cloudservices/yuptoo,ip-10-0-176-227.us-east-2.compute.internal,i-0dfbb3fa4d0e8fc94,1,1,1,1,0.047932,0.031571,0.064131,0.047932,0,0,0,1073741824,1073741824,1073741824,1073741824,513587266.064516,510009344,513900544,513587266.064516,493311537.548387,493293568,493371392,493311537.548387 2023-02-01 00:00:00 +0000 UTC,2023-03-01 00:00:00 +0000 UTC,2023-06-04 00:00:01 +0000 UTC,2023-06-04 00:15:00 +0000 UTC,Yuptoo-service,Yuptoo-app-standalone-1,Yuptoo-app,ReplicaSet,,deployment,Yuptoo-prod,quay.io/cloudservices/yuptoo,ip-10-0-176-227.us-east-2.compute.internal,i-0dfbb3fa4d0e8fc94,1,1,1,1,0.047932,0.031571,0.064131,0.047932,0,0,0,1073741824,1073741824,1073741824,1073741824,513587266.064516,510009344,513900544,513587266.064516,493311537.548387,493293568,493371392,493311537.548387 2023-02-01 00:00:00 +0000 UTC,2023-03-01 00:00:00 +0000 UTC,2023-06-04 00:15:01 +0000 UTC,2023-06-04 00:30:00 +0000 UTC,Yuptoo-service,Yuptoo-app-standalone-1,Yuptoo-app,ReplicaSet,,deployment,Yuptoo-prod,quay.io/cloudservices/yuptoo,ip-10-0-176-227.us-east-2.compute.internal,i-0dfbb3fa4d0e8fc94,1,1,1,1,0.047932,0.031571,0.064131,0.047932,0,0,0,1073741824,1073741824,1073741824,1073741824,513587266.064516,510009344,513900544,513587266.064516,493311537.548387,493293568,493371392,493311537.548387 -2023-02-01 00:00:00 +0000 UTC,2023-03-01 00:00:00 +0000 UTC,2023-06-04 00:30:01 +0000 UTC,2023-06-04 00:45:00 +0000 UTC,Yuptoo-service,Yuptoo-app-standalone-1,Yuptoo-app,ReplicaSet,,deployment,Yuptoo-prod,quay.io/cloudservices/yuptoo,ip-10-0-176-227.us-east-2.compute.internal,i-0dfbb3fa4d0e8fc94,1,1,1,1,0.047932,0.031571,0.064131,0.047932,0,0,0,1073741824,1073741824,1073741824,1073741824,513587266.064516,510009344,513900544,513587266.064516,493311537.548387,493293568,493371392,493311537.548387 -2023-02-01 00:00:00 +0000 UTC,2023-03-01 00:00:00 +0000 UTC,2023-06-04 00:30:01 +0000 UTC,2023-06-04 00:45:00 +0000 UTC,Yuptoo-service,Yuptoo-app-standalone-1,Yuptoo-app,ReplicaSet,,deployment,Yuptoo-prod,quay.io/cloudservices/yuptoo,ip-10-0-176-227.us-east-2.compute.internal,i-0dfbb3fa4d0e8fc94,1,1,1,1,0.047932,0.031571,0.064131,0.047932,0,0,0,1073741824,1073741824,1073741824,1073741824,513587266.064516,510009344,513900544,513587266.064516,493311537.548387,493293568,493371392,493311537.548387 +2023-02-01 00:00:00 +0000 UTC,2023-03-01 00:00:00 +0000 UTC,2023-06-04 00:30:01 +0000 UTC,2023-06-04 00:45:00 +0000 UTC,Yuptoo-service,Yuptoo-app-standalone-1,Yuptoo-app,ReplicaSet,,deployment,Yuptoo-prod,quay.io/cloudservices/yuptoo,ip-10-0-176-227.us-east-2.compute.internal,i-0dfbb3fa4d0e8fc94,1,1,1,1,0.047932,0.031571,0.064131,0.047932,0,0,0,1073741824,1073741824,1073741824,1073741824,513587266.064516,510009344,513900544,513587266.064516,493311537.548387,493293568,493371392,493311537.548387 \ No newline at end of file