From 85ebb48029acbdbba3564e7dee9b44e4d1de391b Mon Sep 17 00:00:00 2001
From: Nahshon Unna-Tsameret
Date: Mon, 2 Sep 2024 15:20:25 +0300
Subject: [PATCH] Read metrics directly from HCO, instead of from prometheus

Signed-off-by: Nahshon Unna-Tsameret
---
 tests/func-tests/hco_prometheus_route.go | 144 +++++++++++++++--------
 tests/func-tests/monitoring_test.go      |  35 +++---
 2 files changed, 113 insertions(+), 66 deletions(-)

diff --git a/tests/func-tests/hco_prometheus_route.go b/tests/func-tests/hco_prometheus_route.go
index d9ee6cea5..430f7789a 100644
--- a/tests/func-tests/hco_prometheus_route.go
+++ b/tests/func-tests/hco_prometheus_route.go
@@ -8,6 +8,7 @@ import (
 	"strconv"
 	"strings"
 	"sync"
+	"time"
 
 	openshiftroutev1 "github.com/openshift/api/route/v1"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
@@ -19,6 +20,102 @@ import (
 
 const tempRouteName = "prom-route"
 
+// HCOPrometheusClient reads metrics directly from the HCO metrics endpoint,
+// which is exposed through the temporary route.
+type HCOPrometheusClient struct {
+	url string
+	cli *http.Client
+}
+
+var (
+	hcoClient     *HCOPrometheusClient
+	hcoClientOnce sync.Once
+)
+
+// GetHCOPrometheusClient returns the shared HCOPrometheusClient, creating it on
+// the first call. The creation is attempted only once; if it fails, later calls
+// return an error as well.
+func GetHCOPrometheusClient(ctx context.Context, cli client.Client) (*HCOPrometheusClient, error) {
+	var err error
+	hcoClientOnce.Do(func() {
+		hcoClient, err = newHCOPrometheusClient(ctx, cli)
+	})
+
+	if err != nil {
+		return nil, err
+	}
+
+	if hcoClient == nil {
+		return nil, fmt.Errorf("HCO client wasn't initiated")
+	}
+
+	return hcoClient, nil
+}
+
+// newHCOPrometheusClient polls for the temporary route host every 5 seconds,
+// for up to 60 seconds, and builds a client for its /metrics endpoint.
+func newHCOPrometheusClient(ctx context.Context, cli client.Client) (*HCOPrometheusClient, error) {
+	ticker := time.NewTicker(5 * time.Second)
+	defer ticker.Stop() // release the ticker on every return path
+	ctx, cancel := context.WithTimeout(ctx, 60*time.Second)
+	defer cancel()
+
+	for {
+		select {
+		case <-ticker.C:
+			tempRouteHost, err := getTempRouteHost(ctx, cli)
+			if err != nil {
+				continue // any error is treated as "not available yet"; retry on the next tick
+			}
+
+			httpClient, err := rest.HTTPClientFor(GetClientConfig())
+			if err != nil {
+				return nil, fmt.Errorf("can't create HTTP client; %w", err)
+			}
+
+			return &HCOPrometheusClient{
+				url: fmt.Sprintf("https://%s/metrics", tempRouteHost),
+				cli: httpClient,
+			}, nil
+
+		case <-ctx.Done():
+			return nil, fmt.Errorf("timed out waiting for HCO Prometheus metrics route to be available")
+		}
+	}
+}
+
+// GetHCOMetric fetches the HCO metrics page and returns the value of the first
+// line that starts with query. It returns 0 with no error if no line matches.
+func (hcoCli HCOPrometheusClient) GetHCOMetric(ctx context.Context, query string) (float64, error) {
+	req, err := http.NewRequestWithContext(ctx, http.MethodGet, hcoCli.url, nil)
+	if err != nil {
+		return 0, fmt.Errorf("failed to create request: %w", err)
+	}
+
+	resp, err := hcoCli.cli.Do(req)
+	if err != nil {
+		return 0, err
+	}
+	defer resp.Body.Close()
+	if resp.StatusCode != http.StatusOK {
+		return 0, fmt.Errorf("failed to read the temp route status: %s", resp.Status)
+	}
+
+	scanner := bufio.NewScanner(resp.Body)
+	for scanner.Scan() {
+		line := scanner.Text()
+		if strings.HasPrefix(line, query) {
+			res, err := strconv.ParseFloat(strings.TrimSpace(strings.TrimPrefix(line, query)), 64)
+			if err != nil {
+				return 0, fmt.Errorf("error converting %s to float: %w", line, err)
+			}
+			return res, nil
+		}
+	}
+	// Surface a read failure instead of reporting the metric as missing.
+	return 0, scanner.Err()
+}
+
 // CreateTempRoute creates a route to the HCO prometheus endpoint, to allow reading the metrics.
 func CreateTempRoute(ctx context.Context, cli client.Client) error {
 	err := openshiftroutev1.AddToScheme(cli.Scheme())
@@ -61,7 +158,7 @@ func DeleteTempRoute(ctx context.Context, cli client.Client) error {
 	return cli.Delete(ctx, route)
 }
 
-func GetTempRouteHost(ctx context.Context, cli client.Client) (string, error) {
+func getTempRouteHost(ctx context.Context, cli client.Client) (string, error) {
 	route := &openshiftroutev1.Route{
 		ObjectMeta: metav1.ObjectMeta{
 			Name: tempRouteName,
@@ -79,48 +176,3 @@ func GetTempRouteHost(ctx context.Context, cli client.Client) (string, error) {
 
 	return route.Status.Ingress[0].Host, nil
 }
-
-func GetHCOMetric(ctx context.Context, url, query string) (float64, error) {
-	req, err := http.NewRequest(http.MethodGet, url, nil)
-	if err != nil {
-		return 0, fmt.Errorf("failed to create request: %w", err)
-	}
-
-	resp, err := GetHTTPClient().Do(req.WithContext(ctx))
-	if err != nil {
-		return 0, err
-	}
-	defer resp.Body.Close()
-	if resp.StatusCode != http.StatusOK {
-		return 0, fmt.Errorf("failed to read the temp route status: %s", resp.Status)
-	}
-
-	scanner := bufio.NewScanner(resp.Body)
-	for scanner.Scan() {
-		line := scanner.Text()
-		if strings.HasPrefix(line, query) {
-			res, err := strconv.ParseFloat(strings.TrimSpace(strings.TrimPrefix(line, query)), 64)
-			if err != nil {
-				return 0, fmt.Errorf("error converting %s to int: %v\n", line, err)
-			}
-			return res, nil
-		}
-	}
-	return 0, nil
-}
-
-// makes http calls to http endpoints in the cluster
-var httpClient *http.Client
-
-func GetHTTPClient() *http.Client {
-	once := &sync.Once{}
-	once.Do(func() {
-		var err error
-		httpClient, err = rest.HTTPClientFor(GetClientConfig())
-		if err != nil {
-			panic("can't create HTTP client;" + err.Error())
-		}
-	})
-
-	return httpClient
-}
diff --git a/tests/func-tests/monitoring_test.go b/tests/func-tests/monitoring_test.go
index f2238e428..04f08b1dd 100644
--- a/tests/func-tests/monitoring_test.go
+++ b/tests/func-tests/monitoring_test.go
@@ 
-49,7 +49,7 @@ var _ = Describe("[crit:high][vendor:cnv-qe@redhat.com][level:system]Monitoring" promClient promApiv1.API prometheusRule monitoringv1.PrometheusRule initialOperatorHealthMetricValue float64 - tempRouteURL string + hcoClient *tests.HCOPrometheusClient ) runbookClient.Timeout = time.Second * 3 @@ -63,20 +63,12 @@ var _ = Describe("[crit:high][vendor:cnv-qe@redhat.com][level:system]Monitoring" promClient = initializePromClient(getPrometheusURL(ctx, restClient), getAuthorizationTokenForPrometheus(ctx, cliSet)) prometheusRule = getPrometheusRule(ctx, restClient) - initialOperatorHealthMetricValue = getMetricValue(ctx, promClient, "kubevirt_hyperconverged_operator_health_status") - - Eventually(ctx, func(g Gomega) { - tempRouteHost, err := tests.GetTempRouteHost(ctx, cli) - g.Expect(err).NotTo(HaveOccurred()) - g.Expect(tempRouteHost).NotTo(BeEmpty()) - tempRouteURL = fmt.Sprintf("https://%s/metrics", tempRouteHost) - }).WithTimeout(time.Second * 60). - WithPolling(time.Second). - WithContext(ctx). 
- Should(Succeed()) - - GinkgoWriter.Println("temporary URL to read HCO metrics: ", tempRouteURL) + var err error + hcoClient, err = tests.GetHCOPrometheusClient(ctx, cli) + Expect(err).NotTo(HaveOccurred()) + initialOperatorHealthMetricValue = getMetricValue(ctx, promClient, "kubevirt_hyperconverged_operator_health_status") + Expect(err).NotTo(HaveOccurred()) }) It("Alert rules should have all the requried annotations", func() { @@ -123,7 +115,7 @@ var _ = Describe("[crit:high][vendor:cnv-qe@redhat.com][level:system]Monitoring" var valueBefore float64 Eventually(func(g Gomega, ctx context.Context) { var err error - valueBefore, err = tests.GetHCOMetric(ctx, tempRouteURL, query) + valueBefore, err = hcoClient.GetHCOMetric(ctx, query) g.Expect(err).NotTo(HaveOccurred()) }).WithTimeout(10 * time.Second).WithPolling(500 * time.Millisecond).WithContext(ctx).Should(Succeed()) GinkgoWriter.Printf("The metric value before the test is: %0.2f\n", valueBefore) @@ -142,7 +134,7 @@ var _ = Describe("[crit:high][vendor:cnv-qe@redhat.com][level:system]Monitoring" By("checking that the HCO metric was increased by 1") Eventually(func(g Gomega, ctx context.Context) float64 { - valueAfter, err := tests.GetHCOMetric(ctx, tempRouteURL, query) + valueAfter, err := hcoClient.GetHCOMetric(ctx, query) g.Expect(err).NotTo(HaveOccurred()) return valueAfter }). 
@@ -174,7 +166,7 @@ var _ = Describe("[crit:high][vendor:cnv-qe@redhat.com][level:system]Monitoring" return alert }).WithTimeout(60 * time.Second).WithPolling(time.Second).WithContext(ctx).ShouldNot(BeNil()) - verifyOperatorHealthMetricValue(ctx, promClient, initialOperatorHealthMetricValue, warningImpact) + verifyOperatorHealthMetricValue(ctx, promClient, hcoClient, initialOperatorHealthMetricValue, warningImpact) }) It("UnsupportedHCOModification alert should fired when there is an jsonpatch annotation to modify an operand CRs", func(ctx context.Context) { @@ -192,7 +184,7 @@ var _ = Describe("[crit:high][vendor:cnv-qe@redhat.com][level:system]Monitoring" alert := getAlertByName(alerts, "UnsupportedHCOModification") return alert }).WithTimeout(60 * time.Second).WithPolling(time.Second).WithContext(ctx).ShouldNot(BeNil()) - verifyOperatorHealthMetricValue(ctx, promClient, initialOperatorHealthMetricValue, warningImpact) + verifyOperatorHealthMetricValue(ctx, promClient, hcoClient, initialOperatorHealthMetricValue, warningImpact) }) }) @@ -205,11 +197,14 @@ func getAlertByName(alerts promApiv1.AlertsResult, alertName string) *promApiv1. 
return nil } -func verifyOperatorHealthMetricValue(ctx context.Context, promClient promApiv1.API, initialOperatorHealthMetricValue, alertImpact float64) { +func verifyOperatorHealthMetricValue(ctx context.Context, promClient promApiv1.API, hcoClient *tests.HCOPrometheusClient, initialOperatorHealthMetricValue, alertImpact float64) { Eventually(func(g Gomega, ctx context.Context) { if alertImpact >= initialOperatorHealthMetricValue { - systemHealthMetricValue := getMetricValue(ctx, promClient, "kubevirt_hco_system_health_status") + systemHealthMetricValue, err := hcoClient.GetHCOMetric(ctx, "kubevirt_hco_system_health_status") + g.Expect(err).NotTo(HaveOccurred()) + operatorHealthMetricValue := getMetricValue(ctx, promClient, "kubevirt_hyperconverged_operator_health_status") + expectedOperatorHealthMetricValue := math.Max(alertImpact, systemHealthMetricValue) g.Expect(operatorHealthMetricValue).To(Equal(expectedOperatorHealthMetricValue),