Skip to content

Commit

Permalink
Read metrics directly from HCO, instead of from prometheus (#3088)
Browse files Browse the repository at this point in the history
Signed-off-by: Nahshon Unna-Tsameret <nunnatsa@redhat.com>
  • Loading branch information
nunnatsa authored Sep 4, 2024
1 parent 81eb8ba commit 74847c4
Show file tree
Hide file tree
Showing 2 changed files with 103 additions and 66 deletions.
134 changes: 88 additions & 46 deletions tests/func-tests/hco_prometheus_route.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import (
"strconv"
"strings"
"sync"
"time"

openshiftroutev1 "github.com/openshift/api/route/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
Expand All @@ -19,6 +20,92 @@ import (

// tempRouteName is the name of the temporary OpenShift Route object that
// getTempRouteHost looks up in order to reach the HCO metrics endpoint.
const tempRouteName = "prom-route"

// HCOPrometheusClient reads metric values directly from the HCO metrics
// endpoint over HTTP, rather than querying them through Prometheus.
type HCOPrometheusClient struct {
// url is the full address of the metrics endpoint; newHCOPrometheusClient
// builds it as "https://<temp route host>/metrics".
url string
// cli is the HTTP client used to issue the GET requests against url.
cli *http.Client
}

// Package-level singleton state backing GetHCOPrometheusClient.
var (
// hcoClient is the shared client instance. It is assigned inside
// hcoClientOnce and remains nil if that single initialization failed.
hcoClient *HCOPrometheusClient
// hcoClientOnce ensures newHCOPrometheusClient is invoked at most once.
hcoClientOnce sync.Once
)

// GetHCOPrometheusClient returns the process-wide HCOPrometheusClient,
// building it on the first call only.
//
// The construction is attempted exactly once. If that attempt fails, the
// failing call receives the construction error, and every later call receives
// a generic "wasn't initiated" error because no client was ever stored.
func GetHCOPrometheusClient(ctx context.Context, cli client.Client) (*HCOPrometheusClient, error) {
	var initErr error

	hcoClientOnce.Do(func() {
		hcoClient, initErr = newHCOPrometheusClient(ctx, cli)
	})

	switch {
	case initErr != nil:
		// Only reachable on the call that actually ran the initializer.
		return nil, initErr
	case hcoClient == nil:
		// A previous call already consumed the Once and failed.
		return nil, fmt.Errorf("HCO client wasn't initiated")
	}

	return hcoClient, nil
}

// newHCOPrometheusClient waits for the temporary metrics route to expose a
// host and then builds an HCOPrometheusClient pointing at
// "https://<host>/metrics".
//
// The route is polled every 5 seconds, for at most 60 seconds (or until ctx is
// done, whichever comes first). On timeout the returned error wraps the last
// lookup failure, if any, so the caller can see why the route was unavailable.
// A failure to build the HTTP client is returned immediately and is not
// retried.
func newHCOPrometheusClient(ctx context.Context, cli client.Client) (*HCOPrometheusClient, error) {
	ctx, cancel := context.WithTimeout(ctx, 60*time.Second)
	defer cancel()

	ticker := time.NewTicker(5 * time.Second)
	// Release the ticker's resources on every return path.
	defer ticker.Stop()

	// lastErr remembers the most recent lookup failure for the timeout message.
	var lastErr error

	for {
		select {
		case <-ticker.C:
			tempRouteHost, err := getTempRouteHost(ctx, cli)
			if err != nil {
				lastErr = err
				continue
			}

			// An empty host would yield the unusable URL "https:///metrics";
			// keep polling until the route reports a real host.
			if tempRouteHost == "" {
				lastErr = fmt.Errorf("the temp route host is empty")
				continue
			}

			httpClient, err := rest.HTTPClientFor(GetClientConfig())
			if err != nil {
				return nil, fmt.Errorf("can't create HTTP client; %w", err)
			}

			return &HCOPrometheusClient{
				url: fmt.Sprintf("https://%s/metrics", tempRouteHost),
				cli: httpClient,
			}, nil

		case <-ctx.Done():
			if lastErr != nil {
				return nil, fmt.Errorf("timed out waiting for HCO Prometheus metrics route to be available: %w", lastErr)
			}
			return nil, fmt.Errorf("timed out waiting for HCO Prometheus metrics route to be available")
		}
	}
}

// GetHCOMetric fetches the metrics page from the client's URL and returns the
// value of the first line that starts with query.
//
// query is matched as a plain string prefix of the line, so it must contain
// the metric name and, when the metric has labels, the exact label set as it
// is printed by the endpoint. The rest of the line is parsed as a float64.
//
// It returns (0, nil) when no line starts with query. It returns an error when
// the request cannot be built or sent, when the response status is not 200,
// when the matched value is not a valid float, or when reading the response
// body fails (including a line that exceeds the scanner's token limit).
func (hcoCli HCOPrometheusClient) GetHCOMetric(ctx context.Context, query string) (float64, error) {
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, hcoCli.url, nil)
	if err != nil {
		return 0, fmt.Errorf("failed to create request: %w", err)
	}

	resp, err := hcoCli.cli.Do(req)
	if err != nil {
		return 0, err
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return 0, fmt.Errorf("failed to read the temp route status: %s", resp.Status)
	}

	scanner := bufio.NewScanner(resp.Body)
	for scanner.Scan() {
		line := scanner.Text()
		if !strings.HasPrefix(line, query) {
			continue
		}

		res, err := strconv.ParseFloat(strings.TrimSpace(strings.TrimPrefix(line, query)), 64)
		if err != nil {
			return 0, fmt.Errorf("error converting %q to float: %w", line, err)
		}
		return res, nil
	}

	// Scan also stops on read errors; without this check a truncated or
	// failed read would be indistinguishable from "metric not present".
	if err := scanner.Err(); err != nil {
		return 0, fmt.Errorf("failed to read the metrics response: %w", err)
	}

	return 0, nil
}

// CreateTempRoute creates a route to the HCO prometheus endpoint, to allow reading the metrics.
func CreateTempRoute(ctx context.Context, cli client.Client) error {
err := openshiftroutev1.AddToScheme(cli.Scheme())
Expand Down Expand Up @@ -61,7 +148,7 @@ func DeleteTempRoute(ctx context.Context, cli client.Client) error {
return cli.Delete(ctx, route)
}

func GetTempRouteHost(ctx context.Context, cli client.Client) (string, error) {
func getTempRouteHost(ctx context.Context, cli client.Client) (string, error) {
route := &openshiftroutev1.Route{
ObjectMeta: metav1.ObjectMeta{
Name: tempRouteName,
Expand All @@ -79,48 +166,3 @@ func GetTempRouteHost(ctx context.Context, cli client.Client) (string, error) {

return route.Status.Ingress[0].Host, nil
}

// GetHCOMetric fetches the metrics page at url, using the shared client
// returned by GetHTTPClient, and returns the value of the first line that
// starts with query.
//
// query is matched as a plain string prefix of the line, so it must contain
// the metric name and, when the metric has labels, the exact label set as it
// is printed by the endpoint. The rest of the line is parsed as a float64.
//
// It returns (0, nil) when no line starts with query. It returns an error when
// the request cannot be built or sent, when the response status is not 200,
// when the matched value is not a valid float, or when reading the response
// body fails (including a line that exceeds the scanner's token limit).
func GetHCOMetric(ctx context.Context, url, query string) (float64, error) {
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
	if err != nil {
		return 0, fmt.Errorf("failed to create request: %w", err)
	}

	resp, err := GetHTTPClient().Do(req)
	if err != nil {
		return 0, err
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return 0, fmt.Errorf("failed to read the temp route status: %s", resp.Status)
	}

	scanner := bufio.NewScanner(resp.Body)
	for scanner.Scan() {
		line := scanner.Text()
		if !strings.HasPrefix(line, query) {
			continue
		}

		res, err := strconv.ParseFloat(strings.TrimSpace(strings.TrimPrefix(line, query)), 64)
		if err != nil {
			return 0, fmt.Errorf("error converting %q to float: %w", line, err)
		}
		return res, nil
	}

	// Scan also stops on read errors; without this check a truncated or
	// failed read would be indistinguishable from "metric not present".
	if err := scanner.Err(); err != nil {
		return 0, fmt.Errorf("failed to read the metrics response: %w", err)
	}

	return 0, nil
}

// Shared HTTP client used to call HTTP endpoints in the cluster.
var (
	// httpClient is built lazily by GetHTTPClient and reused afterwards.
	httpClient *http.Client
	// httpClientOnce guards the one-time construction of httpClient. It must
	// live at package level: a Once allocated inside the function is new on
	// every call and therefore never prevents re-initialization.
	httpClientOnce sync.Once
)

// GetHTTPClient returns the shared HTTP client, creating it from the cluster
// client configuration on the first call.
//
// It panics if the client cannot be created. Because sync.Once treats a
// panicking call as completed, later calls after such a panic return nil.
func GetHTTPClient() *http.Client {
	httpClientOnce.Do(func() {
		var err error
		httpClient, err = rest.HTTPClientFor(GetClientConfig())
		if err != nil {
			panic("can't create HTTP client;" + err.Error())
		}
	})

	return httpClient
}
35 changes: 15 additions & 20 deletions tests/func-tests/monitoring_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ var _ = Describe("[crit:high][vendor:cnv-qe@redhat.com][level:system]Monitoring"
promClient promApiv1.API
prometheusRule monitoringv1.PrometheusRule
initialOperatorHealthMetricValue float64
tempRouteURL string
hcoClient *tests.HCOPrometheusClient
)

runbookClient.Timeout = time.Second * 3
Expand All @@ -63,20 +63,12 @@ var _ = Describe("[crit:high][vendor:cnv-qe@redhat.com][level:system]Monitoring"
promClient = initializePromClient(getPrometheusURL(ctx, restClient), getAuthorizationTokenForPrometheus(ctx, cliSet))
prometheusRule = getPrometheusRule(ctx, restClient)

initialOperatorHealthMetricValue = getMetricValue(ctx, promClient, "kubevirt_hyperconverged_operator_health_status")

Eventually(ctx, func(g Gomega) {
tempRouteHost, err := tests.GetTempRouteHost(ctx, cli)
g.Expect(err).NotTo(HaveOccurred())
g.Expect(tempRouteHost).NotTo(BeEmpty())
tempRouteURL = fmt.Sprintf("https://%s/metrics", tempRouteHost)
}).WithTimeout(time.Second * 60).
WithPolling(time.Second).
WithContext(ctx).
Should(Succeed())

GinkgoWriter.Println("temporary URL to read HCO metrics: ", tempRouteURL)
var err error
hcoClient, err = tests.GetHCOPrometheusClient(ctx, cli)
Expect(err).NotTo(HaveOccurred())

initialOperatorHealthMetricValue = getMetricValue(ctx, promClient, "kubevirt_hyperconverged_operator_health_status")
Expect(err).NotTo(HaveOccurred())
})

It("Alert rules should have all the requried annotations", func() {
Expand Down Expand Up @@ -123,7 +115,7 @@ var _ = Describe("[crit:high][vendor:cnv-qe@redhat.com][level:system]Monitoring"
var valueBefore float64
Eventually(func(g Gomega, ctx context.Context) {
var err error
valueBefore, err = tests.GetHCOMetric(ctx, tempRouteURL, query)
valueBefore, err = hcoClient.GetHCOMetric(ctx, query)
g.Expect(err).NotTo(HaveOccurred())
}).WithTimeout(10 * time.Second).WithPolling(500 * time.Millisecond).WithContext(ctx).Should(Succeed())
GinkgoWriter.Printf("The metric value before the test is: %0.2f\n", valueBefore)
Expand All @@ -142,7 +134,7 @@ var _ = Describe("[crit:high][vendor:cnv-qe@redhat.com][level:system]Monitoring"

By("checking that the HCO metric was increased by 1")
Eventually(func(g Gomega, ctx context.Context) float64 {
valueAfter, err := tests.GetHCOMetric(ctx, tempRouteURL, query)
valueAfter, err := hcoClient.GetHCOMetric(ctx, query)
g.Expect(err).NotTo(HaveOccurred())
return valueAfter
}).
Expand Down Expand Up @@ -174,7 +166,7 @@ var _ = Describe("[crit:high][vendor:cnv-qe@redhat.com][level:system]Monitoring"
return alert
}).WithTimeout(60 * time.Second).WithPolling(time.Second).WithContext(ctx).ShouldNot(BeNil())

verifyOperatorHealthMetricValue(ctx, promClient, initialOperatorHealthMetricValue, warningImpact)
verifyOperatorHealthMetricValue(ctx, promClient, hcoClient, initialOperatorHealthMetricValue, warningImpact)
})

It("UnsupportedHCOModification alert should fired when there is an jsonpatch annotation to modify an operand CRs", func(ctx context.Context) {
Expand All @@ -192,7 +184,7 @@ var _ = Describe("[crit:high][vendor:cnv-qe@redhat.com][level:system]Monitoring"
alert := getAlertByName(alerts, "UnsupportedHCOModification")
return alert
}).WithTimeout(60 * time.Second).WithPolling(time.Second).WithContext(ctx).ShouldNot(BeNil())
verifyOperatorHealthMetricValue(ctx, promClient, initialOperatorHealthMetricValue, warningImpact)
verifyOperatorHealthMetricValue(ctx, promClient, hcoClient, initialOperatorHealthMetricValue, warningImpact)
})
})

Expand All @@ -205,11 +197,14 @@ func getAlertByName(alerts promApiv1.AlertsResult, alertName string) *promApiv1.
return nil
}

func verifyOperatorHealthMetricValue(ctx context.Context, promClient promApiv1.API, initialOperatorHealthMetricValue, alertImpact float64) {
func verifyOperatorHealthMetricValue(ctx context.Context, promClient promApiv1.API, hcoClient *tests.HCOPrometheusClient, initialOperatorHealthMetricValue, alertImpact float64) {
Eventually(func(g Gomega, ctx context.Context) {
if alertImpact >= initialOperatorHealthMetricValue {
systemHealthMetricValue := getMetricValue(ctx, promClient, "kubevirt_hco_system_health_status")
systemHealthMetricValue, err := hcoClient.GetHCOMetric(ctx, "kubevirt_hco_system_health_status")
g.Expect(err).NotTo(HaveOccurred())

operatorHealthMetricValue := getMetricValue(ctx, promClient, "kubevirt_hyperconverged_operator_health_status")

expectedOperatorHealthMetricValue := math.Max(alertImpact, systemHealthMetricValue)

g.Expect(operatorHealthMetricValue).To(Equal(expectedOperatorHealthMetricValue),
Expand Down

0 comments on commit 74847c4

Please sign in to comment.