Skip to content

Commit

Permalink
Add a metric to get more information on upstreams that can be used …
Browse files Browse the repository at this point in the history
…to cross-reference information between ingress and Envoy

Each cluster is a single metric, so remove the trailing "s"

Co-authored-by: Laurent Marchaud <16262531+Aluxima@users.noreply.github.com>
  • Loading branch information
SoulKyu and Aluxima committed Jul 29, 2024
1 parent 1585f99 commit 6e381b9
Show file tree
Hide file tree
Showing 8 changed files with 232 additions and 107 deletions.
29 changes: 21 additions & 8 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,9 @@ Yggdrasil can be configured using a config file e.g:
{
"token": "xxxxxxxxxxxxxxxx",
"apiServer": "https://cluster1.api.com",
"ca": "pathto/cluster1/ca"
"ca": "pathto/cluster1/ca",
"maintenance": false,
"kubernetesClusterName": "cluster1"
},
{
"tokenPath": "/path/to/a/token",
Expand All @@ -167,18 +169,29 @@ The list of certificates will be loaded by Yggdrasil and served to the Envoy nod
The `ingressClasses` is a list of ingress classes that yggdrasil will watch for.
Each cluster represents a different Kubernetes cluster, with the token being a service account token for that cluster. `ca` is the path to the CA certificate for that cluster.

`maintenance` is a new option that allows putting a cluster into maintenance mode:
- Upstreams that exist only in one cluster are kept
- Upstreams that also exist in at least one cluster that is not in maintenance are removed from the cluster in maintenance mode
- Yggdrasil will exit with a fatal error if all clusters are in maintenance mode.

This is optional and equal to `false` by default.

`kubernetesClusterName` is the name of the cluster. It is informational only and is used for metrics. Optional; defaults to `""`.

## Metrics
Yggdrasil has a number of Go, gRPC, Prometheus, and Yggdrasil-specific metrics built in which can be reached by cURLing the `/metrics` path at the health API address/port (default: 8081). See [Flags](#Flags) for more information on configuring the health API address/port.

The Yggdrasil-specific metrics which are available from the API are:

| Name | Description | Type |
|-----------------------------|------------------------------------------------|----------|
| yggdrasil_cluster_updates | Number of times the clusters have been updated | counter |
| yggdrasil_clusters | Total number of clusters generated | gauge |
| yggdrasil_ingresses | Total number of matching ingress objects | gauge |
| yggdrasil_listener_updates | Number of times the listener has been updated | counter |
| yggdrasil_virtual_hosts | Total number of virtual hosts generated | gauge |
| Name | Description | Type |
|----------------------------------------------|------------------------------------------------|----------|
| yggdrasil_cluster_updates | Number of times the clusters have been updated | counter |
| yggdrasil_clusters | Total number of clusters generated | gauge |
| yggdrasil_ingresses | Total number of matching ingress objects | gauge |
| yggdrasil_listener_updates | Number of times the listener has been updated | counter |
| yggdrasil_virtual_hosts | Total number of virtual hosts generated | gauge |
| yggdrasil_kubernetes_cluster_in_maintenance | Returns 1 if the cluster is in maintenance, else 0 | gauge |
| yggdrasil_upstream_info | Provides information related to upstreams | gauge |

## Flags
```
Expand Down
35 changes: 28 additions & 7 deletions cmd/root.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,10 +23,12 @@ import (
)

type clusterConfig struct {
APIServer string `json:"apiServer"`
Ca string `json:"ca"`
Token string `json:"token"`
TokenPath string `json:"tokenPath"`
APIServer string `json:"apiServer"`
Ca string `json:"ca"`
Token string `json:"token"`
TokenPath string `json:"tokenPath"`
Maintenance bool `json:"maintenance"`
KubernetesClusterName string `json:"kubernetesClusterName"`
}

type config struct {
Expand Down Expand Up @@ -303,9 +305,25 @@ func createSources(clusters []clusterConfig) ([]*kubernetes.Clientset, error) {
}
clientSet, err := kubernetes.NewForConfig(config)
if err != nil {
return sources, err
return nil, err
}

kubernetesConfig := k8s.NewKubernetesConfig(cluster.Maintenance, clientSet, cluster.KubernetesClusterName)

envoy.KubernetesClusterInMaintenance.WithLabelValues(cluster.APIServer).Set(float64(0))

if cluster.Maintenance {
envoy.KubernetesClusterInMaintenance.WithLabelValues(cluster.APIServer).Set(float64(1))
log.Warnf("Cluster with API Endpoint %s is in maintenance mode", cluster.APIServer)
} else {
allInMaintenance = false
}
sources = append(sources, clientSet)

sources = append(sources, *kubernetesConfig)
}

if allInMaintenance {
log.Fatal("All clusters are in maintenance mode")
}

return sources, nil
Expand All @@ -323,7 +341,10 @@ func configFromKubeConfig(paths []string) ([]*kubernetes.Clientset, error) {
if err != nil {
return sources, err
}
sources = append(sources, clientSet)

kubernetesConfig := k8s.NewKubernetesConfig(false, clientSet, "")

sources = append(sources, *kubernetesConfig)
}

return sources, nil
Expand Down
161 changes: 101 additions & 60 deletions pkg/envoy/ingress_translator.go
Original file line number Diff line number Diff line change
Expand Up @@ -365,93 +365,133 @@ func validateSubdomain(ruleHost, host string) bool {
return strings.HasSuffix(host, ruleHost)
}

func translateIngresses(ingresses []*k8s.Ingress, syncSecrets bool, secrets []*v1.Secret, timeouts DefaultTimeouts) *envoyConfiguration {
func translateIngresses(ingresses []*k8s.Ingress, syncSecrets bool, secrets []*v1.Secret, timeouts DefaultTimeouts, accessLog string) *envoyConfiguration {
cfg := &envoyConfiguration{}
envoyIngresses := map[string]*envoyIngress{}
ruleHostToIngresses := map[string][]*k8s.Ingress{}

for _, i := range ingresses {
for _, j := range i.Upstreams {
for _, ruleHost := range i.RulesHosts {
for _, ruleHost := range i.RulesHosts {
ruleHostToIngresses[ruleHost] = append(ruleHostToIngresses[ruleHost], i)
}
}

isWildcard := isWildcard(ruleHost)
for ruleHost, ingressList := range ruleHostToIngresses {
isWildcard := isWildcard(ruleHost)

_, ok := envoyIngresses[ruleHost]
if !ok {
envoyIngresses[ruleHost] = newEnvoyIngress(ruleHost, timeouts)
}
if _, ok := envoyIngresses[ruleHost]; !ok {
envoyIngresses[ruleHost] = newEnvoyIngress(ruleHost, timeouts)
}

envoyIngress := envoyIngresses[ruleHost]
envoyIngress := envoyIngresses[ruleHost]

if weight64, err := strconv.ParseUint(i.Annotations["yggdrasil.uswitch.com/weight"], 10, 32); err == nil {
if weight64 != 0 {
envoyIngress.addUpstream(j, uint32(weight64))
// Determine if any ingress is not in maintenance mode
hasNonMaintenance := false
for _, ingress := range ingressList {
if !ingress.Maintenance {
hasNonMaintenance = true
break
}
}

// Add upstreams based on maintenance status
for _, ingress := range ingressList {
for _, j := range ingress.Upstreams {
// Skip this upstream if cluster is in maintenance but keep it if no other cluster can serve it
if !hasNonMaintenance || !ingress.Maintenance {
// Check if the upstream is already added
exists := false
for _, host := range envoyIngress.cluster.Hosts {
if host.Host == j {
exists = true
break
}
}
if exists {
continue // skip if the upstream already exists
}

class := "none"
if ingress.Class != nil {
class = *ingress.Class
}
} else {
envoyIngress.addUpstream(j, 1)
}

if isWildcard {
if i.Annotations["yggdrasil.uswitch.com/healthcheck-host"] != "" {
envoyIngress.addHealthCheckHost(i.Annotations["yggdrasil.uswitch.com/healthcheck-host"])
if !validateSubdomain(ruleHost, envoyIngress.cluster.HealthCheckHost) {
logrus.Warnf("Healthcheck %s is not on the same subdomain for %s, annotation will be skipped", envoyIngress.cluster.HealthCheckHost, ruleHost)
envoyIngress.cluster.HealthCheckHost = ruleHost
// Add upstream
if weight64, err := strconv.ParseUint(ingress.Annotations["yggdrasil.uswitch.com/weight"], 10, 32); err == nil {
if weight64 != 0 {
envoyIngress.addUpstream(j, uint32(weight64))
EnvoyUpstreamInfo.WithLabelValues(strings.ReplaceAll(ruleHost, ".", "_"), j, ingress.Namespace, class, ingress.KubernetesClusterName, ingress.Name).Set(float64(1))
}
} else {
logrus.Warnf("Be careful, healthcheck can't work for wildcard host : %s", envoyIngress.cluster.HealthCheckHost)
envoyIngress.addUpstream(j, 1)
EnvoyUpstreamInfo.WithLabelValues(strings.ReplaceAll(ruleHost, ".", "_"), j, ingress.Namespace, class, ingress.KubernetesClusterName, ingress.Name).Set(float64(1))
}
} else {
logrus.Warnf("Endpoint is in maintenance mode, upstream %s will not be added for host %s", j, ruleHost)
}
}

if i.Annotations["yggdrasil.uswitch.com/healthcheck-path"] != "" {
envoyIngress.addHealthCheckPath(i.Annotations["yggdrasil.uswitch.com/healthcheck-path"])
}

if i.Annotations["yggdrasil.uswitch.com/timeout"] != "" {
timeout, err := time.ParseDuration(i.Annotations["yggdrasil.uswitch.com/timeout"])
if err == nil {
envoyIngress.addTimeout(timeout)
if isWildcard {
if ingress.Annotations["yggdrasil.uswitch.com/healthcheck-host"] != "" {
envoyIngress.addHealthCheckHost(ingress.Annotations["yggdrasil.uswitch.com/healthcheck-host"])
if !validateSubdomain(ruleHost, envoyIngress.cluster.HealthCheckHost) {
logrus.Warnf("Healthcheck %s is not on the same subdomain for %s, annotation will be skipped", envoyIngress.cluster.HealthCheckHost, ruleHost)
envoyIngress.cluster.HealthCheckHost = ruleHost
}
} else {
logrus.Warnf("Be careful, healthcheck can't work for wildcard host : %s", envoyIngress.cluster.HealthCheckHost)
}
}

if i.Annotations["yggdrasil.uswitch.com/cluster-timeout"] != "" {
timeout, err := time.ParseDuration(i.Annotations["yggdrasil.uswitch.com/cluster-timeout"])
if err == nil {
envoyIngress.setClusterTimeout(timeout)
}
if ingress.Annotations["yggdrasil.uswitch.com/healthcheck-path"] != "" {
envoyIngress.addHealthCheckPath(ingress.Annotations["yggdrasil.uswitch.com/healthcheck-path"])
}

if ingress.Annotations["yggdrasil.uswitch.com/timeout"] != "" {
timeout, err := time.ParseDuration(ingress.Annotations["yggdrasil.uswitch.com/timeout"])
if err == nil {
envoyIngress.addTimeout(timeout)
}
}

if i.Annotations["yggdrasil.uswitch.com/route-timeout"] != "" {
timeout, err := time.ParseDuration(i.Annotations["yggdrasil.uswitch.com/route-timeout"])
if err == nil {
envoyIngress.setRouteTimeout(timeout)
}
if ingress.Annotations["yggdrasil.uswitch.com/cluster-timeout"] != "" {
timeout, err := time.ParseDuration(ingress.Annotations["yggdrasil.uswitch.com/cluster-timeout"])
if err == nil {
envoyIngress.setClusterTimeout(timeout)
}
}

if i.Annotations["yggdrasil.uswitch.com/per-try-timeout"] != "" {
timeout, err := time.ParseDuration(i.Annotations["yggdrasil.uswitch.com/per-try-timeout"])
if err == nil {
envoyIngress.setPerTryTimeout(timeout)
}
if ingress.Annotations["yggdrasil.uswitch.com/route-timeout"] != "" {
timeout, err := time.ParseDuration(ingress.Annotations["yggdrasil.uswitch.com/route-timeout"])
if err == nil {
envoyIngress.setRouteTimeout(timeout)
}
}

if i.Annotations["yggdrasil.uswitch.com/upstream-http-version"] != "" {
// TODO validate, add error path
envoyIngress.setUpstreamHttpVersion(i.Annotations["yggdrasil.uswitch.com/upstream-http-version"])
if ingress.Annotations["yggdrasil.uswitch.com/per-try-timeout"] != "" {
timeout, err := time.ParseDuration(ingress.Annotations["yggdrasil.uswitch.com/per-try-timeout"])
if err == nil {
envoyIngress.setPerTryTimeout(timeout)
}
}

if ingress.Annotations["yggdrasil.uswitch.com/upstream-http-version"] != "" {
// TODO validate, add error path
envoyIngress.setUpstreamHttpVersion(ingress.Annotations["yggdrasil.uswitch.com/upstream-http-version"])
}

envoyIngress.addRetryOn(i)
envoyIngress.addRetryOn(ingress)

if syncSecrets && envoyIngress.vhost.TlsKey == "" && envoyIngress.vhost.TlsCert == "" {
if hostTlsSecret, err := getHostTlsSecret(i, ruleHost, secrets); err != nil {
logrus.Infof(err.Error())
} else {
valid, err := validateTlsSecret(hostTlsSecret)
if err != nil {
logrus.Warnf("secret %s/%s is not valid: %s", hostTlsSecret.Namespace, hostTlsSecret.Name, err.Error())
} else if valid {
envoyIngress.vhost.TlsKey = string(hostTlsSecret.Data["tls.key"])
envoyIngress.vhost.TlsCert = string(hostTlsSecret.Data["tls.crt"])
}
if syncSecrets && envoyIngress.vhost.TlsKey == "" && envoyIngress.vhost.TlsCert == "" {
if hostTlsSecret, err := getHostTlsSecret(ingress, ruleHost, secrets); err != nil {
logrus.Infof(err.Error())
} else {
valid, err := validateTlsSecret(hostTlsSecret)
if err != nil {
logrus.Warnf("secret %s/%s is not valid: %s", hostTlsSecret.Namespace, hostTlsSecret.Name, err.Error())
} else if valid {
envoyIngress.vhost.TlsKey = string(hostTlsSecret.Data["tls.key"])
envoyIngress.vhost.TlsCert = string(hostTlsSecret.Data["tls.crt"])
}
}
}
Expand All @@ -461,6 +501,7 @@ func translateIngresses(ingresses []*k8s.Ingress, syncSecrets bool, secrets []*v
for _, ingress := range envoyIngresses {
cfg.Clusters = append(cfg.Clusters, ingress.cluster)
cfg.VirtualHosts = append(cfg.VirtualHosts, ingress.vhost)
cfg.AccessLog = accessLog
}

numVhosts.Set(float64(len(cfg.VirtualHosts)))
Expand Down
20 changes: 19 additions & 1 deletion pkg/envoy/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,26 @@ var (
Help: "Number of times the listener has been updated",
},
)

KubernetesClusterInMaintenance = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Namespace: "yggdrasil",
Name: "kubernetes_clusters_in_maintenance",
Help: "Is kubernetes cluster in maintenance mode ?",
},
[]string{"apiServer"},
)

EnvoyUpstreamInfo = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Namespace: "yggdrasil",
Name: "upstream_info",
Help: "Retrieve information about cluster",
},
[]string{"envoy_cluster_name", "upstream", "namespace", "ingressclass", "k8s_cluster", "ingress"},
)
)

func init() {
prometheus.MustRegister(matchingIngresses, numClusters, numVhosts, clusterUpdates, listenerUpdates)
prometheus.MustRegister(matchingIngresses, numClusters, numVhosts, clusterUpdates, listenerUpdates, KubernetesClusterInMaintenance, EnvoyUpstreamInfo)
}
Loading

0 comments on commit 6e381b9

Please sign in to comment.