Skip to content

Commit

Permalink
Merge pull request #786 from rancher/updated-metrics
Browse files Browse the repository at this point in the history
Add new agent status metrics; rename opni_monitoring_cluster_info to opni_cluster_info
  • Loading branch information
dbason authored Nov 9, 2022
2 parents 077216f + 81eb77c commit 6991dfa
Show file tree
Hide file tree
Showing 9 changed files with 133 additions and 35 deletions.
12 changes: 12 additions & 0 deletions pkg/gateway/sync.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,12 @@ package gateway

import (
"context"
"fmt"
"math/rand"
"sync"
"time"

"github.com/prometheus/client_golang/prometheus"
agentv1 "github.com/rancher/opni/pkg/agent"
capabilityv1 "github.com/rancher/opni/pkg/apis/capability/v1"
corev1 "github.com/rancher/opni/pkg/apis/core/v1"
Expand All @@ -17,6 +19,14 @@ import (
"google.golang.org/protobuf/types/known/emptypb"
)

// mSyncRequests is a counter vector for sync requests sent to agents. Each
// series is identified by the cluster_id, code, and code_text labels.
var mSyncRequests = prometheus.NewCounterVec(
	prometheus.CounterOpts{
		Namespace: "opni",
		Name:      "server_sync_requests_total",
		Help:      "Total number of sync requests sent to agents",
	},
	[]string{"cluster_id", "code", "code_text"},
)

type SyncRequester struct {
capabilityv1.UnsafeNodeManagerServer
mu sync.RWMutex
Expand Down Expand Up @@ -75,6 +85,8 @@ func (f *SyncRequester) RequestSync(ctx context.Context, req *capabilityv1.SyncR
"capabilities", req.GetFilter().GetCapabilityNames(),
).Debug("sending sync request to agent")
_, err := clientSet.SyncNow(ctx, req.GetFilter())
code := status.Code(err)
mSyncRequests.WithLabelValues(req.GetCluster().GetId(), fmt.Sprint(code), code.String()).Inc()
if err != nil {
f.logger.With(
zap.Error(err),
Expand Down
54 changes: 53 additions & 1 deletion pkg/management/collector.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,24 +2,47 @@ package management

import (
"context"
"strings"
"time"

"github.com/prometheus/client_golang/prometheus"
corev1 "github.com/rancher/opni/pkg/apis/core/v1"
managementv1 "github.com/rancher/opni/pkg/apis/management/v1"
"github.com/samber/lo"
)

// Metric descriptors sent by (*Server).Describe. None of them carries
// constant labels; all labels are variable.
var (
// clusterInfo describes the cluster information metric, labeled by
// cluster_id and friendly_name.
// NOTE(review): two metric names are listed below (the old
// opni_monitoring_cluster_info and the new opni_cluster_info); this looks
// like a rename rendered as a diff. Confirm that only one name remains.
clusterInfo = prometheus.NewDesc(
"opni_monitoring_cluster_info",
"opni_cluster_info",
"Cluster information",
[]string{"cluster_id", "friendly_name"},
prometheus.Labels{},
)
// agentUp describes the agent connection status metric, labeled by
// cluster_id.
agentUp = prometheus.NewDesc(
"opni_agent_up",
"Agent connection status",
[]string{"cluster_id"},
prometheus.Labels{},
)
// agentReady describes the agent readiness status metric, labeled by
// cluster_id and conditions.
agentReady = prometheus.NewDesc(
"opni_agent_ready",
"Agent readiness status",
[]string{"cluster_id", "conditions"},
prometheus.Labels{},
)
// agentSummary describes the agent status summary metric, labeled by
// cluster_id and summary.
agentSummary = prometheus.NewDesc(
"opni_agent_status_summary",
"Agent status summary",
[]string{"cluster_id", "summary"},
prometheus.Labels{},
)
)

// Describe sends the clusterInfo, agentUp, agentReady, and agentSummary
// descriptors on c, in that order.
func (s *Server) Describe(c chan<- *prometheus.Desc) {
	descs := [...]*prometheus.Desc{
		clusterInfo,
		agentUp,
		agentReady,
		agentSummary,
	}
	for _, desc := range descs {
		c <- desc
	}
}

func (s *Server) Collect(c chan<- prometheus.Metric) {
Expand All @@ -44,5 +67,34 @@ func (s *Server) Collect(c chan<- prometheus.Metric) {
cluster.Id,
friendlyName,
)

var connected, ready float64
var conditions, summary string
if hs, err := s.GetClusterHealthStatus(ctx, &corev1.Reference{Id: cluster.Id}); err == nil {
connected = lo.Ternary[float64](hs.Status.Connected, 1, 0)
ready = lo.Ternary[float64](hs.Health.Ready, 1, 0)
conditions = strings.Join(hs.Health.Conditions, ",")
summary = hs.Summary()
}
c <- prometheus.MustNewConstMetric(
agentUp,
prometheus.GaugeValue,
connected,
cluster.Id,
)
c <- prometheus.MustNewConstMetric(
agentReady,
prometheus.GaugeValue,
ready,
cluster.Id,
conditions,
)
c <- prometheus.MustNewConstMetric(
agentSummary,
prometheus.GaugeValue,
1,
cluster.Id,
summary,
)
}
}
46 changes: 31 additions & 15 deletions pkg/management/collector_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,18 +30,11 @@ var _ = Describe("Collector", Ordered, Label("slow"), func() {
It("should collect descriptors but no metrics", func() {
descs := make(chan *prometheus.Desc, 100)
tv.ifaces.collector.Describe(descs)
Eventually(descs).Should(Receive(WithTransform(fmt.Stringer.String, Equal(
descriptorString(
"opni_monitoring_cluster_info",
"Cluster information",
[]string{},
[]string{"cluster_id", "friendly_name"},
),
))))
Consistently(descs).ShouldNot(Receive())
Eventually(descs).Should(HaveLen(4))
Consistently(descs).Should(HaveLen(4))
metrics := make(chan prometheus.Metric, 100)
tv.ifaces.collector.Collect(metrics)
Consistently(metrics).ShouldNot(Receive())
Consistently(metrics).Should(BeEmpty())
})
})
When("clusters are present", func() {
Expand All @@ -61,16 +54,39 @@ var _ = Describe("Collector", Ordered, Label("slow"), func() {
},
})

descs := make(chan *prometheus.Desc, 100)
tv.ifaces.collector.Describe(descs)
Expect(descs).To(Receive(WithTransform(fmt.Stringer.String, Equal(
c := make(chan *prometheus.Desc, 100)
tv.ifaces.collector.Describe(c)
Expect(c).To(HaveLen(4))
descs := make([]string, 0, 4)
for i := 0; i < 4; i++ {
descs = append(descs, (<-c).String())
}
Expect(descs).To(ConsistOf(
descriptorString(
"opni_monitoring_cluster_info",
"opni_cluster_info",
"Cluster information",
[]string{},
[]string{"cluster_id", "friendly_name"},
),
))))
descriptorString(
"opni_agent_up",
"Agent connection status",
[]string{},
[]string{"cluster_id"},
),
descriptorString(
"opni_agent_ready",
"Agent readiness status",
[]string{},
[]string{"cluster_id", "conditions"},
),
descriptorString(
"opni_agent_status_summary",
"Agent status summary",
[]string{},
[]string{"cluster_id", "summary"},
),
))

metrics := make(chan prometheus.Metric, 100)
tv.ifaces.collector.Collect(metrics)
Expand Down
34 changes: 17 additions & 17 deletions pkg/resources/monitoring/dashboards/dashboards.json
Original file line number Diff line number Diff line change
Expand Up @@ -1496,7 +1496,7 @@
"multi": false,
"name": "cluster",
"options": [ ],
"query": "query_result(opni_monitoring_cluster_info)",
"query": "query_result(opni_cluster_info)",
"refresh": 2,
"regex": "/cluster_id=\"(?<value>[^\"]+)|friendly_name=\"(?<text>[^\"]+)/g",
"sort": 1,
Expand Down Expand Up @@ -3201,7 +3201,7 @@
"multi": false,
"name": "cluster",
"options": [ ],
"query": "query_result(opni_monitoring_cluster_info)",
"query": "query_result(opni_cluster_info)",
"refresh": 2,
"regex": "/cluster_id=\"(?<value>[^\"]+)|friendly_name=\"(?<text>[^\"]+)/g",
"sort": 0,
Expand Down Expand Up @@ -4214,7 +4214,7 @@
"multi": false,
"name": "cluster",
"options": [ ],
"query": "query_result(opni_monitoring_cluster_info)",
"query": "query_result(opni_cluster_info)",
"refresh": 2,
"regex": "/cluster_id=\"(?<value>[^\"]+)|friendly_name=\"(?<text>[^\"]+)/g",
"sort": 1,
Expand Down Expand Up @@ -6945,7 +6945,7 @@
"multi": false,
"name": "cluster",
"options": [ ],
"query": "query_result(opni_monitoring_cluster_info)",
"query": "query_result(opni_cluster_info)",
"refresh": 2,
"regex": "/cluster_id=\"(?<value>[^\"]+)|friendly_name=\"(?<text>[^\"]+)/g",
"sort": 1,
Expand Down Expand Up @@ -10595,7 +10595,7 @@
"multi": false,
"name": "cluster",
"options": [ ],
"query": "query_result(opni_monitoring_cluster_info)",
"query": "query_result(opni_cluster_info)",
"refresh": 2,
"regex": "/cluster_id=\"(?<value>[^\"]+)|friendly_name=\"(?<text>[^\"]+)/g",
"sort": 1,
Expand Down Expand Up @@ -11480,7 +11480,7 @@
"multi": false,
"name": "cluster",
"options": [ ],
"query": "query_result(opni_monitoring_cluster_info)",
"query": "query_result(opni_cluster_info)",
"refresh": 2,
"regex": "/cluster_id=\"(?<value>[^\"]+)|friendly_name=\"(?<text>[^\"]+)/g",
"sort": 1,
Expand Down Expand Up @@ -13631,7 +13631,7 @@
"multi": false,
"name": "cluster",
"options": [ ],
"query": "query_result(opni_monitoring_cluster_info)",
"query": "query_result(opni_cluster_info)",
"refresh": 2,
"regex": "/cluster_id=\"(?<value>[^\"]+)|friendly_name=\"(?<text>[^\"]+)/g",
"sort": 1,
Expand Down Expand Up @@ -15369,7 +15369,7 @@
"multi": false,
"name": "cluster",
"options": [ ],
"query": "query_result(opni_monitoring_cluster_info)",
"query": "query_result(opni_cluster_info)",
"refresh": 2,
"regex": "/cluster_id=\"(?<value>[^\"]+)|friendly_name=\"(?<text>[^\"]+)/g",
"sort": 1,
Expand Down Expand Up @@ -17301,7 +17301,7 @@
"multi": false,
"name": "cluster",
"options": [ ],
"query": "query_result(opni_monitoring_cluster_info)",
"query": "query_result(opni_cluster_info)",
"refresh": 2,
"regex": "/cluster_id=\"(?<value>[^\"]+)|friendly_name=\"(?<text>[^\"]+)/g",
"sort": 1,
Expand Down Expand Up @@ -19311,7 +19311,7 @@
"multi": false,
"name": "cluster",
"options": [ ],
"query": "query_result(opni_monitoring_cluster_info)",
"query": "query_result(opni_cluster_info)",
"refresh": 2,
"regex": "/cluster_id=\"(?<value>[^\"]+)|friendly_name=\"(?<text>[^\"]+)/g",
"sort": 1,
Expand Down Expand Up @@ -20534,7 +20534,7 @@
"multi": false,
"name": "cluster",
"options": [ ],
"query": "query_result(opni_monitoring_cluster_info)",
"query": "query_result(opni_cluster_info)",
"refresh": 2,
"regex": "/cluster_id=\"(?<value>[^\"]+)|friendly_name=\"(?<text>[^\"]+)/g",
"sort": 0,
Expand Down Expand Up @@ -22053,7 +22053,7 @@
"multi": false,
"name": "cluster",
"options": [ ],
"query": "query_result(opni_monitoring_cluster_info)",
"query": "query_result(opni_cluster_info)",
"refresh": 2,
"regex": "/cluster_id=\"(?<value>[^\"]+)|friendly_name=\"(?<text>[^\"]+)/g",
"sort": 0,
Expand Down Expand Up @@ -22649,7 +22649,7 @@
"multi": false,
"name": "cluster",
"options": [ ],
"query": "query_result(opni_monitoring_cluster_info)",
"query": "query_result(opni_cluster_info)",
"refresh": 2,
"regex": "/cluster_id=\"(?<value>[^\"]+)|friendly_name=\"(?<text>[^\"]+)/g",
"sort": 1,
Expand Down Expand Up @@ -23658,7 +23658,7 @@
"multi": false,
"name": "cluster",
"options": [ ],
"query": "query_result(opni_monitoring_cluster_info)",
"query": "query_result(opni_cluster_info)",
"refresh": 2,
"regex": "/cluster_id=\"(?<value>[^\"]+)|friendly_name=\"(?<text>[^\"]+)/g",
"sort": 0,
Expand Down Expand Up @@ -24872,7 +24872,7 @@
"multi": false,
"name": "cluster",
"options": [ ],
"query": "query_result(opni_monitoring_cluster_info)",
"query": "query_result(opni_cluster_info)",
"refresh": 2,
"regex": "/cluster_id=\"(?<value>[^\"]+)|friendly_name=\"(?<text>[^\"]+)/g",
"sort": 1,
Expand Down Expand Up @@ -25839,7 +25839,7 @@
"multi": false,
"name": "cluster",
"options": [ ],
"query": "query_result(opni_monitoring_cluster_info)",
"query": "query_result(opni_cluster_info)",
"refresh": 2,
"regex": "/cluster_id=\"(?<value>[^\"]+)|friendly_name=\"(?<text>[^\"]+)/g",
"sort": 1,
Expand Down Expand Up @@ -26986,7 +26986,7 @@
"multi": false,
"name": "cluster",
"options": [ ],
"query": "query_result(opni_monitoring_cluster_info)",
"query": "query_result(opni_cluster_info)",
"refresh": 2,
"regex": "/cluster_id=\"(?<value>[^\"]+)|friendly_name=\"(?<text>[^\"]+)/g",
"sort": 0,
Expand Down
2 changes: 1 addition & 1 deletion pkg/resources/monitoring/dashboards/dashboards.jsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ local patchDashboard(d) = {
list: [
if x.name == 'cluster'
then x {
query: 'query_result(opni_monitoring_cluster_info)',
query: 'query_result(opni_cluster_info)',
regex: '/cluster_id="(?<value>[^"]+)|friendly_name="(?<text>[^"]+)/g',
}
else x
Expand Down
2 changes: 1 addition & 1 deletion pkg/resources/monitoring/dashboards/opni-gateway.json
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,7 @@
"uid": "P0BEE70D62F42845E"
},
"exemplar": false,
"expr": "sum by(cluster_id) (rate(opni_gateway_remote_write_cluster_ingest_bytes[5m])) * on(cluster_id) group_left(friendly_name) opni_monitoring_cluster_info",
"expr": "sum by(cluster_id) (rate(opni_gateway_remote_write_cluster_ingest_bytes[5m])) * on(cluster_id) group_left(friendly_name) opni_cluster_info",
"instant": false,
"interval": "",
"legendFormat": "{{friendly_name}}",
Expand Down
6 changes: 6 additions & 0 deletions pkg/resources/monitoring/grafana.go
Original file line number Diff line number Diff line change
Expand Up @@ -178,6 +178,12 @@ func (r *Reconciler) grafana() ([]resources.Resource, error) {
SecurityContext: &corev1.PodSecurityContext{
FSGroup: lo.ToPtr(int64(472)),
},
Env: []corev1.EnvVar{
{
Name: "GF_INSTALL_PLUGINS",
Value: "grafana-polystat-panel,marcusolsson-treemap-panel",
},
},
},
Secrets: []string{"grafana-datasource-cert"},
DataStorage: &grafanav1alpha1.GrafanaDataStorage{
Expand Down
7 changes: 7 additions & 0 deletions plugins/metrics/pkg/cortex/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,18 @@ var (
Name: "remote_write_cluster_ingest_bytes",
Help: "Total number of (compressed) bytes received from remote write requests by cluster ID",
}, []string{"cluster_id"})
mRemoteWriteRequests = prometheus.NewCounterVec(prometheus.CounterOpts{
Namespace: "opni",
Subsystem: "gateway",
Name: "remote_write_requests_total",
Help: "Total number of remote write requests forwarded to Cortex",
}, []string{"cluster_id", "code", "code_text"})
)

// Collectors returns mIngestBytesTotal, mIngestBytesByID, and
// mRemoteWriteRequests, in that order, as a slice of prometheus collectors.
func Collectors() []prometheus.Collector {
	collectors := make([]prometheus.Collector, 0, 3)
	collectors = append(collectors,
		mIngestBytesTotal,
		mIngestBytesByID,
		mRemoteWriteRequests,
	)
	return collectors
}
Loading

0 comments on commit 6991dfa

Please sign in to comment.