Skip to content

Commit

Permalink
Add stackdriver metrics analysis
Browse files Browse the repository at this point in the history
Signed-off-by: Somtochi Onyekwere <somtochionyekwere@gmail.com>
  • Loading branch information
somtochiama authored and GregoireW committed Sep 16, 2021
1 parent 4871003 commit 912981b
Show file tree
Hide file tree
Showing 9 changed files with 599 additions and 16 deletions.
1 change: 1 addition & 0 deletions artifacts/flagger/crd.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1104,6 +1104,7 @@ spec:
- prometheus
- influxdb
- datadog
- stackdriver
- cloudwatch
- newrelic
- graphite
Expand Down
1 change: 1 addition & 0 deletions charts/flagger/crds/crd.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1104,6 +1104,7 @@ spec:
- prometheus
- influxdb
- datadog
- stackdriver
- cloudwatch
- newrelic
- graphite
Expand Down
50 changes: 50 additions & 0 deletions docs/gitbook/usage/metrics.md
Original file line number Diff line number Diff line change
Expand Up @@ -477,3 +477,53 @@ spec:
secretRef:
name: graphite-basic-auth
```

## Google CLoud Monitoring (Stackdriver)

Enable Workload Identity on your cluster, create a service account key that has read access to the
Cloud Monitoring API and then create an IAM policy binding between the GCP service account and the Flagger
service account on Kubernetes. You can take a look at this [guide](https://cloud.google.com/kubernetes-engine/docs/how-to/workload-identity)

Annotate the flagger service account
```shell script
kubectl annotate serviceaccount flagger \
--namespace <namespace> \
iam.gke.io/gcp-service-account=<gcp-serviceaccount-name>@<project-id>.iam.gserviceaccount.com
```

Alternatively, you can download the json keys and add it to your secret with the key `serviceAccountKey` (This method is not recommended).

Create a secret that contains your project-id (and, if workload identity is not enabled on your cluster,
your [service account json](https://cloud.google.com/docs/authentication/production#create_service_account)).

```
kubectl create secret generic gcloud-sa --from-literal=project=<project-id>
```
Then reference the secret in the metric template.
Note: The particular MQL query used here works if [Istio is installed on GKE](https://cloud.google.com/istio/docs/istio-on-gke/installing).
```yaml
apiVersion: flagger.app/v1beta1
kind: MetricTemplate
metadata:
name: bytes-sent
namespace: test
spec:
provider:
type: stackdriver
secretRef:
name: gcloud-sa
query: |
fetch k8s_container
| metric 'istio.io/service/server/response_latencies'
| filter
(metric.destination_service_name == '{{ service }}-canary'
&& metric.destination_service_namespace == '{{ namespace }}')
| align delta(1m)
| every 1m
| group_by [],
[value_response_latencies_percentile:
percentile(value.response_latencies, 99)]
```

The reference for the query language can be found [here](https://cloud.google.com/monitoring/mql/reference)
8 changes: 7 additions & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,20 @@ module github.com/fluxcd/flagger
go 1.16

require (
cloud.google.com/go/monitoring v0.1.0
github.com/Masterminds/semver/v3 v3.0.3
github.com/aws/aws-sdk-go v1.37.32
github.com/davecgh/go-spew v1.1.1
github.com/go-logr/zapr v0.3.0
github.com/google/go-cmp v0.5.5
github.com/google/go-cmp v0.5.6
github.com/googleapis/gax-go/v2 v2.0.5
github.com/prometheus/client_golang v1.11.0
github.com/stretchr/testify v1.7.0
go.uber.org/zap v1.14.1
google.golang.org/api v0.54.0
google.golang.org/genproto v0.0.0-20210813162853-db860fec028c
google.golang.org/grpc v1.39.1
google.golang.org/protobuf v1.27.1
gopkg.in/h2non/gock.v1 v1.0.15
k8s.io/api v0.21.1
k8s.io/apimachinery v0.21.1
Expand Down
249 changes: 234 additions & 15 deletions go.sum

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions kustomize/base/flagger/crd.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1104,6 +1104,7 @@ spec:
- prometheus
- influxdb
- datadog
- stackdriver
- cloudwatch
- newrelic
- graphite
Expand Down
2 changes: 2 additions & 0 deletions pkg/metrics/providers/factory.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@ func (factory Factory) Provider(
return NewNewRelicProvider(metricInterval, provider, credentials)
case "graphite":
return NewGraphiteProvider(provider, credentials)
case "stackdriver":
return NewStackDriverProvider(provider, credentials)
default:
return NewPrometheusProvider(provider, credentials)
}
Expand Down
129 changes: 129 additions & 0 deletions pkg/metrics/providers/stackdriver.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
package providers

import (
"context"
"fmt"

monitoring "cloud.google.com/go/monitoring/apiv3/v2"
"google.golang.org/api/iterator"
"google.golang.org/api/option"
monitoringpb "google.golang.org/genproto/googleapis/monitoring/v3"
"google.golang.org/grpc/codes"
"google.golang.org/grpc/status"

flaggerv1 "github.com/fluxcd/flagger/pkg/apis/flagger/v1beta1"
)

type StackDriverProvider struct {
client *monitoring.QueryClient
project string
}

// NewStackDriverProvider takes a provider spec and credential map and
// returns a StackDriverProvider ready to execute queries against the
// Cloud Monitoring API
func NewStackDriverProvider(provider flaggerv1.MetricTemplateProvider,
credentials map[string][]byte,
) (*StackDriverProvider, error) {
stackd := &StackDriverProvider{}
var saKey []byte
if provider.SecretRef != nil {
if project, ok := credentials["project"]; ok {
stackd.project = fmt.Sprintf("projects/%s", string(project))
} else {
return nil, fmt.Errorf("%s credentials does not contain a project id", provider.Type)
}

if cred, ok := credentials["serviceAccountKey"]; ok {
saKey = cred
}
}

var client *monitoring.QueryClient
var err error
ctx := context.Background()

if saKey != nil {
client, err = monitoring.NewQueryClient(ctx, option.WithCredentialsJSON(saKey))
} else {
client, err = monitoring.NewQueryClient(ctx)
}

if err != nil {
return nil, err
}

stackd.client = client
return stackd, nil
}

// RunQuery executes Monitoring Query Language(MQL) queries against the
// Cloud Monitoring API
func (s *StackDriverProvider) RunQuery(query string) (float64, error) {
ctx := context.Background()
req := &monitoringpb.QueryTimeSeriesRequest{
Name: s.project,
Query: query,
}

it := s.client.QueryTimeSeries(ctx, req)

resp, err := it.Next()
if err == iterator.Done {
return 0, fmt.Errorf("invalid response: %s: %w", resp, ErrNoValuesFound)
}

if err != nil {
if s, ok := status.FromError(err); ok {
errStr := s.Message()
for _, d := range s.Proto().Details {
errStr = errStr + " Error Detail: " + d.String()
}

return 0, fmt.Errorf("error requesting stackdriver: %s", err)
}
return 0, fmt.Errorf("error requesting stackdriver: %s", err)
}

pointData := resp.PointData
if len(pointData) < 1 {
return 0, fmt.Errorf("invalid response: %s: %w", resp.String(), ErrNoValuesFound)
}

values := resp.PointData[0].Values
if len(values) < 1 {
return 0, fmt.Errorf("invalid response: %s: %w", resp.String(), ErrNoValuesFound)
}

return values[0].GetDoubleValue(), nil
}

// IsOnline calls QueryTimeSeries method with the empty query
// and returns an error if the returned status code is NOT grpc.InvalidArgument.
// For example, if the flagger does not the authorization scope `https://www.googleapis.com/auth/monitoring.read`,
// the returned status code would be grpc.PermissionDenied
func (s *StackDriverProvider) IsOnline() (bool, error) {
ctx := context.Background()
req := &monitoringpb.QueryTimeSeriesRequest{
Name: s.project,
Query: "",
}

it := s.client.QueryTimeSeries(ctx, req)

_, err := it.Next()
if err == nil {
return true, nil
}

stat, ok := status.FromError(err)
if !ok {
return false, fmt.Errorf("unexpected error: %s", err)
}

if stat.Code() != codes.InvalidArgument {
return false, fmt.Errorf("unexpected status code: %s", stat.Code().String())
}

return true, nil
}
Loading

0 comments on commit 912981b

Please sign in to comment.