Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

func test: Read metrics directly from HCO, instead of from Prometheus #3088

Merged
merged 1 commit into from
Sep 4, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
134 changes: 88 additions & 46 deletions tests/func-tests/hco_prometheus_route.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import (
"strconv"
"strings"
"sync"
"time"

openshiftroutev1 "github.com/openshift/api/route/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
Expand All @@ -19,6 +20,92 @@ import (

const tempRouteName = "prom-route"

// HCOPrometheusClient reads metrics from the HCO metrics endpoint over HTTP.
type HCOPrometheusClient struct {
// url is the address GetHCOMetric sends its GET request to.
url string
// cli is the HTTP client used to perform that request.
cli *http.Client
}

// Package-level singleton state used by GetHCOPrometheusClient.
var (
// hcoClient is the shared client instance; it is assigned inside hcoClientOnce.Do.
hcoClient *HCOPrometheusClient
// hcoClientOnce ensures newHCOPrometheusClient is invoked at most once.
hcoClientOnce sync.Once
)

// GetHCOPrometheusClient returns the shared HCOPrometheusClient, building it
// with newHCOPrometheusClient on the first call only.
//
// If the one-time construction fails, that call returns the construction
// error. Any call that finds the shared client still unset afterwards gets an
// "HCO client wasn't initiated" error instead.
func GetHCOPrometheusClient(ctx context.Context, cli client.Client) (*HCOPrometheusClient, error) {
	var initErr error
	initialize := func() {
		hcoClient, initErr = newHCOPrometheusClient(ctx, cli)
	}
	hcoClientOnce.Do(initialize)

	switch {
	case initErr != nil:
		return nil, initErr
	case hcoClient == nil:
		return nil, fmt.Errorf("HCO client wasn't initiated")
	default:
		return hcoClient, nil
	}
}

// newHCOPrometheusClient polls for the temporary route host every 5 seconds,
// for up to 60 seconds, and builds an HCOPrometheusClient pointing at
// https://<host>/metrics as soon as the host can be read.
//
// It returns an error if the HTTP client can't be created, or if the route
// host couldn't be read before the timeout expires (or ctx is done). In the
// latter case the last lookup error, if any, is wrapped in the returned error.
func newHCOPrometheusClient(ctx context.Context, cli client.Client) (*HCOPrometheusClient, error) {
	ticker := time.NewTicker(5 * time.Second)
	// Release the ticker on every return path.
	defer ticker.Stop()

	ctx, cancel := context.WithTimeout(ctx, 60*time.Second)
	defer cancel()

	// lastErr keeps the most recent lookup failure so the timeout error can explain itself.
	var lastErr error

	for {
		select {
		case <-ticker.C:
			tempRouteHost, err := getTempRouteHost(ctx, cli)
			if err != nil {
				// The route may not be ready yet; retry on the next tick.
				lastErr = err
				continue
			}

			httpClient, err := rest.HTTPClientFor(GetClientConfig())
			if err != nil {
				return nil, fmt.Errorf("can't create HTTP client; %w", err)
			}

			return &HCOPrometheusClient{
				url: fmt.Sprintf("https://%s/metrics", tempRouteHost),
				cli: httpClient,
			}, nil

		case <-ctx.Done():
			if lastErr != nil {
				return nil, fmt.Errorf("timed out waiting for HCO Prometheus metrics route to be available; last error: %w", lastErr)
			}
			return nil, fmt.Errorf("timed out waiting for HCO Prometheus metrics route to be available")
		}
	}
}

// GetHCOMetric fetches the metrics page from the client's URL and returns the
// value of the first line that starts with query.
//
// The text following the query prefix is trimmed and parsed as a float64. If
// no line starts with query, it returns 0 with a nil error. An error is
// returned when the request can't be created or performed, the response status
// is not 200 OK, the value can't be parsed, or reading the response body fails.
func (hcoCli HCOPrometheusClient) GetHCOMetric(ctx context.Context, query string) (float64, error) {
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, hcoCli.url, nil)
	if err != nil {
		return 0, fmt.Errorf("failed to create request: %w", err)
	}

	resp, err := hcoCli.cli.Do(req)
	if err != nil {
		return 0, err
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return 0, fmt.Errorf("failed to read the temp route status: %s", resp.Status)
	}

	scanner := bufio.NewScanner(resp.Body)
	for scanner.Scan() {
		line := scanner.Text()
		if !strings.HasPrefix(line, query) {
			continue
		}

		res, err := strconv.ParseFloat(strings.TrimSpace(strings.TrimPrefix(line, query)), 64)
		if err != nil {
			return 0, fmt.Errorf("error converting %s to float: %w", line, err)
		}
		return res, nil
	}

	// Scan returns false on both EOF and failure; without this check a broken
	// read would be indistinguishable from "metric not present".
	if err := scanner.Err(); err != nil {
		return 0, fmt.Errorf("failed to read the metrics response: %w", err)
	}

	return 0, nil
}

// CreateTempRoute creates a route to the HCO prometheus endpoint, to allow reading the metrics.
func CreateTempRoute(ctx context.Context, cli client.Client) error {
err := openshiftroutev1.AddToScheme(cli.Scheme())
Expand Down Expand Up @@ -61,7 +148,7 @@ func DeleteTempRoute(ctx context.Context, cli client.Client) error {
return cli.Delete(ctx, route)
}

func GetTempRouteHost(ctx context.Context, cli client.Client) (string, error) {
func getTempRouteHost(ctx context.Context, cli client.Client) (string, error) {
route := &openshiftroutev1.Route{
ObjectMeta: metav1.ObjectMeta{
Name: tempRouteName,
Expand All @@ -79,48 +166,3 @@ func GetTempRouteHost(ctx context.Context, cli client.Client) (string, error) {

return route.Status.Ingress[0].Host, nil
}

// GetHCOMetric fetches the metrics page from url, using the client returned by
// GetHTTPClient, and returns the value of the first line that starts with query.
//
// The text following the query prefix is trimmed and parsed as a float64. If
// no line starts with query, it returns 0 with a nil error. An error is
// returned when the request can't be created or performed, the response status
// is not 200 OK, the value can't be parsed, or reading the response body fails.
func GetHCOMetric(ctx context.Context, url, query string) (float64, error) {
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
	if err != nil {
		return 0, fmt.Errorf("failed to create request: %w", err)
	}

	resp, err := GetHTTPClient().Do(req)
	if err != nil {
		return 0, err
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return 0, fmt.Errorf("failed to read the temp route status: %s", resp.Status)
	}

	scanner := bufio.NewScanner(resp.Body)
	for scanner.Scan() {
		line := scanner.Text()
		if !strings.HasPrefix(line, query) {
			continue
		}

		res, err := strconv.ParseFloat(strings.TrimSpace(strings.TrimPrefix(line, query)), 64)
		if err != nil {
			return 0, fmt.Errorf("error converting %s to float: %w", line, err)
		}
		return res, nil
	}

	// Scan returns false on both EOF and failure; without this check a broken
	// read would be indistinguishable from "metric not present".
	if err := scanner.Err(); err != nil {
		return 0, fmt.Errorf("failed to read the metrics response: %w", err)
	}

	return 0, nil
}

// Lazily-initialized state for the shared HTTP client that makes calls to
// HTTP endpoints in the cluster.
var (
	// httpClient is the shared client returned by GetHTTPClient.
	httpClient *http.Client
	// httpClientErr holds the error from the one-time client construction, if any.
	httpClientErr error
	// httpClientOnce lives at package level so that the client is built only
	// once; a Once allocated inside the function would be fresh on every call
	// and guard nothing.
	httpClientOnce sync.Once
)

// GetHTTPClient returns the shared HTTP client, creating it from
// GetClientConfig on the first call.
//
// It panics if the client could not be created; the construction error is
// remembered, so every later call panics with the same message.
func GetHTTPClient() *http.Client {
	httpClientOnce.Do(func() {
		httpClient, httpClientErr = rest.HTTPClientFor(GetClientConfig())
	})

	if httpClientErr != nil {
		panic("can't create HTTP client;" + httpClientErr.Error())
	}

	return httpClient
}
35 changes: 15 additions & 20 deletions tests/func-tests/monitoring_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ var _ = Describe("[crit:high][vendor:cnv-qe@redhat.com][level:system]Monitoring"
promClient promApiv1.API
prometheusRule monitoringv1.PrometheusRule
initialOperatorHealthMetricValue float64
tempRouteURL string
hcoClient *tests.HCOPrometheusClient
)

runbookClient.Timeout = time.Second * 3
Expand All @@ -63,20 +63,12 @@ var _ = Describe("[crit:high][vendor:cnv-qe@redhat.com][level:system]Monitoring"
promClient = initializePromClient(getPrometheusURL(ctx, restClient), getAuthorizationTokenForPrometheus(ctx, cliSet))
prometheusRule = getPrometheusRule(ctx, restClient)

initialOperatorHealthMetricValue = getMetricValue(ctx, promClient, "kubevirt_hyperconverged_operator_health_status")

Eventually(ctx, func(g Gomega) {
tempRouteHost, err := tests.GetTempRouteHost(ctx, cli)
g.Expect(err).NotTo(HaveOccurred())
g.Expect(tempRouteHost).NotTo(BeEmpty())
tempRouteURL = fmt.Sprintf("https://%s/metrics", tempRouteHost)
}).WithTimeout(time.Second * 60).
WithPolling(time.Second).
WithContext(ctx).
Should(Succeed())

GinkgoWriter.Println("temporary URL to read HCO metrics: ", tempRouteURL)
var err error
hcoClient, err = tests.GetHCOPrometheusClient(ctx, cli)
Expect(err).NotTo(HaveOccurred())

initialOperatorHealthMetricValue = getMetricValue(ctx, promClient, "kubevirt_hyperconverged_operator_health_status")
Expect(err).NotTo(HaveOccurred())
})

It("Alert rules should have all the requried annotations", func() {
Expand Down Expand Up @@ -123,7 +115,7 @@ var _ = Describe("[crit:high][vendor:cnv-qe@redhat.com][level:system]Monitoring"
var valueBefore float64
Eventually(func(g Gomega, ctx context.Context) {
var err error
valueBefore, err = tests.GetHCOMetric(ctx, tempRouteURL, query)
valueBefore, err = hcoClient.GetHCOMetric(ctx, query)
g.Expect(err).NotTo(HaveOccurred())
}).WithTimeout(10 * time.Second).WithPolling(500 * time.Millisecond).WithContext(ctx).Should(Succeed())
GinkgoWriter.Printf("The metric value before the test is: %0.2f\n", valueBefore)
Expand All @@ -142,7 +134,7 @@ var _ = Describe("[crit:high][vendor:cnv-qe@redhat.com][level:system]Monitoring"

By("checking that the HCO metric was increased by 1")
Eventually(func(g Gomega, ctx context.Context) float64 {
valueAfter, err := tests.GetHCOMetric(ctx, tempRouteURL, query)
valueAfter, err := hcoClient.GetHCOMetric(ctx, query)
g.Expect(err).NotTo(HaveOccurred())
return valueAfter
}).
Expand Down Expand Up @@ -174,7 +166,7 @@ var _ = Describe("[crit:high][vendor:cnv-qe@redhat.com][level:system]Monitoring"
return alert
}).WithTimeout(60 * time.Second).WithPolling(time.Second).WithContext(ctx).ShouldNot(BeNil())

verifyOperatorHealthMetricValue(ctx, promClient, initialOperatorHealthMetricValue, warningImpact)
verifyOperatorHealthMetricValue(ctx, promClient, hcoClient, initialOperatorHealthMetricValue, warningImpact)
})

It("UnsupportedHCOModification alert should fired when there is an jsonpatch annotation to modify an operand CRs", func(ctx context.Context) {
Expand All @@ -192,7 +184,7 @@ var _ = Describe("[crit:high][vendor:cnv-qe@redhat.com][level:system]Monitoring"
alert := getAlertByName(alerts, "UnsupportedHCOModification")
return alert
}).WithTimeout(60 * time.Second).WithPolling(time.Second).WithContext(ctx).ShouldNot(BeNil())
verifyOperatorHealthMetricValue(ctx, promClient, initialOperatorHealthMetricValue, warningImpact)
verifyOperatorHealthMetricValue(ctx, promClient, hcoClient, initialOperatorHealthMetricValue, warningImpact)
})
})

Expand All @@ -205,11 +197,14 @@ func getAlertByName(alerts promApiv1.AlertsResult, alertName string) *promApiv1.
return nil
}

func verifyOperatorHealthMetricValue(ctx context.Context, promClient promApiv1.API, initialOperatorHealthMetricValue, alertImpact float64) {
func verifyOperatorHealthMetricValue(ctx context.Context, promClient promApiv1.API, hcoClient *tests.HCOPrometheusClient, initialOperatorHealthMetricValue, alertImpact float64) {
Eventually(func(g Gomega, ctx context.Context) {
if alertImpact >= initialOperatorHealthMetricValue {
systemHealthMetricValue := getMetricValue(ctx, promClient, "kubevirt_hco_system_health_status")
systemHealthMetricValue, err := hcoClient.GetHCOMetric(ctx, "kubevirt_hco_system_health_status")
g.Expect(err).NotTo(HaveOccurred())

operatorHealthMetricValue := getMetricValue(ctx, promClient, "kubevirt_hyperconverged_operator_health_status")

expectedOperatorHealthMetricValue := math.Max(alertImpact, systemHealthMetricValue)

g.Expect(operatorHealthMetricValue).To(Equal(expectedOperatorHealthMetricValue),
Expand Down
Loading