Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Record event as grafana annotation in stability test #414

Merged
merged 19 commits into from
Apr 24, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 31 additions & 0 deletions docs/stability-test-cookbook.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@

> Important note: this guide is under heavy development and has complicated environment prerequisites; things are likely to change in the future.

## Run stability test

The following commands assume you are in the `tidb-operator` working directory:
```shell
# image will be tagged as YOUR_DOCKER_REGISTRY/pingcap/tidb-operator-stability-test:latest
Expand All @@ -14,6 +16,35 @@ $ vi ./tests/manifests/stability/stability.yaml
$ kubectl apply -f ./tests/manifests/stability/stability.yaml
```

## Get test report

```shell
$ kubectl -n tidb-operator-stability logs tidb-operator-stability
```

## Inspect overall cluster stats under various operations

It is useful to inspect how the cluster performs under various kinds of operations or faults. You can access such information from the Grafana dashboard of each cluster:

```shell
$ kubectl port-forward -n ${CLUSTER_NAMESPACE} svc/${CLUSTER_GRAFANA_SERVICE} 3000:3000
```

Navigate to [localhost:3000](http://localhost:3000) to view the dashboards.

Optionally, you can view event annotations such as `scale cluster`, `upgrade cluster`, and `vm crash` by querying annotations in Grafana, which gives a better understanding of the system. Follow this step-by-step guide:

1. click "Dashboard Setting" in the navigation bar
2. click the big "Make Editable" button
3. click "Annotations" in the sidebar
4. click "Add Annotation Query"
5. enter a name you like
6. switch "Match Any" on
7. add "stability" tag
8. click "add"
9. go back to the dashboard and you will see the annotations trigger and the cluster events


## Alternative: run stability test in your local environment

The deploy & witness flow can be tedious when developing stability-test. This section describes how to run stability-test outside the cluster (usually on your local machine) while still operating the remote cluster.
Expand Down
53 changes: 53 additions & 0 deletions tests/actions.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ import (
"database/sql"
"encoding/json"
"fmt"
"github.com/pingcap/tidb-operator/tests/pkg/metrics"
"io/ioutil"
"net/http"
"net/url"
Expand Down Expand Up @@ -74,6 +75,8 @@ const (
operartorChartName = "tidb-operator"
tidbClusterChartName = "tidb-cluster"
backupChartName = "tidb-backup"
statbilityTestTag = "stability"
metricsPort = 8090
)

type OperatorActions interface {
Expand Down Expand Up @@ -170,6 +173,7 @@ type TidbClusterConfig struct {
BackupSecretName string

BlockWriteConfig blockwriter.Config
GrafanaClient *metrics.Client
}

func (tc *TidbClusterConfig) BackupHelmSetString(m map[string]string) string {
Expand Down Expand Up @@ -264,6 +268,7 @@ func (oa *operatorActions) DeployOperator(info *OperatorConfig) error {
info.Namespace,
info.OperatorHelmSetString(nil))
glog.Info(cmd)

res, err := exec.Command("/bin/sh", "-c", cmd).CombinedOutput()
if err != nil {
return fmt.Errorf("failed to deploy operator: %v, %s", err, string(res))
Expand Down Expand Up @@ -319,6 +324,7 @@ func (oa *operatorActions) UpgradeOperator(info *OperatorConfig) error {

func (oa *operatorActions) DeployTidbCluster(info *TidbClusterConfig) error {
glog.Infof("deploying tidb cluster [%s/%s]", info.Namespace, info.ClusterName)
oa.emitEvent(info, "DeployTidbCluster")

namespace := &corev1.Namespace{
ObjectMeta: metav1.ObjectMeta{
Expand Down Expand Up @@ -358,6 +364,7 @@ func (oa *operatorActions) DeployTidbClusterOrDie(info *TidbClusterConfig) {

func (oa *operatorActions) CleanTidbCluster(info *TidbClusterConfig) error {
glog.Infof("cleaning tidbcluster %s/%s", info.Namespace, info.ClusterName)
oa.emitEvent(info, "CleanTidbCluster")

charts := []string{
info.ClusterName,
Expand Down Expand Up @@ -500,6 +507,8 @@ func (oa *operatorActions) CheckTidbClusterStatusOrDie(info *TidbClusterConfig)
}

func (oa *operatorActions) BeginInsertDataTo(info *TidbClusterConfig) error {
oa.emitEvent(info, fmt.Sprintf("BeginInsertData: concurrency: %d", oa.cfg.BlockWriter.Concurrency))

dsn := getDSN(info.Namespace, info.ClusterName, "test", info.Password)
if info.blockWriter == nil {
return fmt.Errorf("block writer not initialized for cluster: %s", info.ClusterName)
Expand All @@ -522,6 +531,8 @@ func (oa *operatorActions) BeginInsertDataToOrDie(info *TidbClusterConfig) {
}

// StopInsertDataTo records a "StopInsertData" event for the cluster and then
// stops the cluster's block writer.
//
// BeginInsertDataTo treats a nil block writer as "not initialized"; the same
// state is tolerated here, so stopping a writer that was never set up is
// logged and skipped instead of dereferencing nil.
func (oa *operatorActions) StopInsertDataTo(info *TidbClusterConfig) {
	oa.emitEvent(info, "StopInsertData")

	if info.blockWriter == nil {
		glog.Warningf("block writer not initialized for cluster: %s, nothing to stop", info.ClusterName)
		return
	}
	info.blockWriter.Stop()
}

Expand All @@ -542,6 +553,8 @@ func (oa *operatorActions) backupChartPath(tag string) string {
}

func (oa *operatorActions) ScaleTidbCluster(info *TidbClusterConfig) error {
oa.emitEvent(info, fmt.Sprintf("ScaleTidbCluster"))

cmd := fmt.Sprintf("helm upgrade %s %s --set-string %s",
info.ClusterName, oa.tidbClusterChartPath(info.OperatorTag), info.TidbClusterHelmSetString(nil))
glog.Info("[SCALE] " + cmd)
Expand Down Expand Up @@ -616,6 +629,8 @@ func (oa *operatorActions) CheckScaledCorrectly(info *TidbClusterConfig, podUIDs
}

func (oa *operatorActions) UpgradeTidbCluster(info *TidbClusterConfig) error {
oa.emitEvent(info, "UpgradeTidbCluster")

cmd := fmt.Sprintf("helm upgrade %s %s --set-string %s",
info.ClusterName, oa.tidbClusterChartPath(info.OperatorTag), info.TidbClusterHelmSetString(nil))
glog.Info("[UPGRADE] " + cmd)
Expand Down Expand Up @@ -1349,6 +1364,16 @@ func (oa *operatorActions) checkGrafanaData(clusterInfo *TidbClusterConfig) erro
if data.Status != "success" || len(data.Data.Result) < 1 {
return fmt.Errorf("invalid response: status: %s, result: %v", data.Status, data.Data.Result)
}

// Grafana ready, init grafana client, no more sync logic because race condition is okay here
if clusterInfo.GrafanaClient == nil {
grafanaUrl := fmt.Sprintf("http://%s.%s:3000", svcName, ns)
client, err := metrics.NewClient(grafanaUrl, grafanaUsername, grafanaPassword, metricsPort)
if err != nil {
return err
}
clusterInfo.GrafanaClient = client
}
return nil
}

Expand Down Expand Up @@ -1388,6 +1413,7 @@ func (oa *operatorActions) checkoutTag(tagName string) error {
}

func (oa *operatorActions) DeployAdHocBackup(info *TidbClusterConfig) error {
oa.emitEvent(info, "DeployAdHocBackup")
glog.Infof("begin to deploy adhoc backup cluster[%s] namespace[%s]", info.ClusterName, info.Namespace)

sets := map[string]string{
Expand Down Expand Up @@ -1439,6 +1465,7 @@ func (oa *operatorActions) CheckAdHocBackup(info *TidbClusterConfig) error {
}

func (oa *operatorActions) Restore(from *TidbClusterConfig, to *TidbClusterConfig) error {
oa.emitEvent(to, fmt.Sprintf("RestoreBackup: source: %s", from.ClusterName))
glog.Infof("deploying restore cluster[%s/%s]", from.Namespace, from.ClusterName)

sets := map[string]string{
Expand Down Expand Up @@ -1583,6 +1610,7 @@ func releaseIsExist(err error) bool {
}

func (oa *operatorActions) DeployScheduledBackup(info *TidbClusterConfig) error {
oa.emitEvent(info, "DeploySchedulerBackup")
glog.Infof("begin to deploy scheduled backup")

cron := fmt.Sprintf("'*/1 * * * *'")
Expand Down Expand Up @@ -1803,6 +1831,7 @@ func (info *TidbClusterConfig) FullName() string {
}

func (oa *operatorActions) DeployIncrementalBackup(from *TidbClusterConfig, to *TidbClusterConfig) error {
oa.emitEvent(from, fmt.Sprintf("DeployIncrementalBackup: slave: %s", to.ClusterName))
glog.Infof("begin to deploy incremental backup cluster[%s] namespace[%s]", from.ClusterName, from.Namespace)

sets := map[string]string{
Expand Down Expand Up @@ -2059,3 +2088,27 @@ func (oa *operatorActions) StartValidatingAdmissionWebhookServerOrDie() {
os.Exit(4)
}
}

// emitEvent records event as an annotation through the cluster's Grafana
// client. The annotation carries the current time in milliseconds and is
// tagged with the stability-test tag, the cluster name and the namespace.
//
// When the Grafana client has not been set on info yet, the event is only
// logged at verbosity 4 and dropped. The annotation is sent from a separate
// goroutine, so this method does not wait for the request; a failure is
// logged rather than returned.
func (oa *operatorActions) emitEvent(info *TidbClusterConfig, event string) {
	if info.GrafanaClient == nil {
		glog.V(4).Infof("cluster:[%s] grafana client not ready, skip recording event %s.",
			info.ClusterName, event)
		return
	}

	// UnixNano divided by the number of nanoseconds in a millisecond.
	nowMillis := time.Now().UnixNano() / int64(time.Millisecond)
	tags := []string{
		statbilityTestTag,
		fmt.Sprintf("cluster-%s", info.ClusterName),
		fmt.Sprintf("ns-%s", info.Namespace),
	}
	record := metrics.Annotation{
		Text:                event,
		TimestampInMilliSec: nowMillis,

		Tags: tags,
	}

	go func() {
		err := info.GrafanaClient.AddAnnotation(record)
		if err == nil {
			return
		}
		glog.Errorf("cluster:[%s] error recording event %s, reason: %v", info.ClusterName, event, err)
	}()
}
10 changes: 5 additions & 5 deletions tests/failover.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,6 @@ func (oa *operatorActions) TruncateSSTFileThenCheckFailover(info *TidbClusterCon
return cnt
}

origFailures := len(tc.Status.TiKV.FailureStores)
origUps := countUpStores(tc)

// checkout pd config
Expand Down Expand Up @@ -88,6 +87,7 @@ func (oa *operatorActions) TruncateSSTFileThenCheckFailover(info *TidbClusterCon
return errors.New("failed to get container status from tikv pod")
}

oa.emitEvent(info, fmt.Sprintf("TruncateSSTFile: tikv: %s", store.PodName))
// restart tikv to ensure sst files
err = tikvOps.KillProcess(info.Namespace, store.PodName, "tikv", "tikv-server")
if err != nil {
Expand Down Expand Up @@ -135,10 +135,10 @@ func (oa *operatorActions) TruncateSSTFileThenCheckFailover(info *TidbClusterCon

return tikvOps.PollTiDBCluster(info.Namespace, info.ClusterName,
func(tc *v1alpha1.TidbCluster, err error) (bool, error) {
glog.Infof("cluster: [%s/%s] check failure stores: current=%d origin=%d",
info.Namespace, info.ClusterName,
len(tc.Status.TiKV.FailureStores), origFailures)
if len(tc.Status.TiKV.FailureStores) <= origFailures {
_, ok := tc.Status.TiKV.FailureStores[store.ID]
glog.Infof("cluster: [%s/%s] check if target store failed: %t",
info.Namespace, info.ClusterName, ok)
if !ok {
return false, nil
}
ups := countUpStores(tc)
Expand Down
22 changes: 14 additions & 8 deletions tests/pkg/metrics/annotation_util.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ import (
"fmt"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promhttp"
"io/ioutil"
"net"
"net/http"
"net/url"
Expand All @@ -28,7 +29,7 @@ import (
)

//Client requests the grafana API on a set of resource paths.
type client struct {
type Client struct {
// base is the root URL for all invocations of the client
baseUrl url.URL
client *http.Client
Expand All @@ -52,15 +53,15 @@ type AnnotationOptions struct {

//NewClient creates a new grafanaClient. This client performs rest functions
//such as Get, Post on specified paths.
func NewClient(grafanaUrl string, userName string, password string, prometheusExporterPort int) (*client, error) {
func NewClient(grafanaUrl string, userName string, password string, prometheusExporterPort int) (*Client, error) {
u, err := url.Parse(grafanaUrl)
if err != nil {
return nil, err
}

initFunc(prometheusExporterPort)
u.User = url.UserPassword(userName, password)
return &client{
return &Client{
baseUrl: *u,
client: &http.Client{},
}, nil
Expand Down Expand Up @@ -110,7 +111,7 @@ func initErrorMetric() prometheus.Counter {

//IncreErrorCountWithAnno increments the errorcount by 1,
//and adds the annotation to grafana.
func (cli *client) AddAnnotation(annotation Annotation) error {
func (cli *Client) AddAnnotation(annotation Annotation) error {
body, err := annotation.getBody()
if err != nil {
return fmt.Errorf("create request body faield, %v", err)
Expand All @@ -123,24 +124,29 @@ func (cli *client) AddAnnotation(annotation Annotation) error {

req.Header.Add("Accept", "application/json, text/plain, */*")
req.Header.Add("Content-Type", "application/json;charset=UTF-8")
resp, error := cli.client.Do(req)
if error != nil {
resp, err := cli.client.Do(req)
if err != nil {
return fmt.Errorf("add annotation faield, %v", err)
}
defer resp.Body.Close()

if resp.StatusCode != http.StatusOK {
return fmt.Errorf("add annotation faield, statusCode=%v", resp.Status)
}
all, err := ioutil.ReadAll(resp.Body)
if err != nil {
return err
}
fmt.Println(all)

return nil
}

func (cli *client) IncrErrorCount() {
func (cli *Client) IncrErrorCount() {
counterMetric.Inc()
}

func (cli *client) getAnnotationPath() string {
func (cli *Client) getAnnotationPath() string {
u := cli.baseUrl
u.Path = path.Join(cli.baseUrl.Path, annotationSubPath)
return u.String()
Expand Down