Add a healthcheck endpoint on the ingesters that distributors can use #741
@@ -6,7 +6,6 @@ import (
     "flag"
     "fmt"
     "hash/fnv"
-    "io"
     "sync"
     "sync/atomic"
     "time"
@@ -44,12 +43,12 @@ var (
 // Distributor is a storage.SampleAppender and a client.Querier which
 // forwards appends and queries to individual ingesters.
 type Distributor struct {
-    cfg        Config
-    ring       ring.ReadRing
-    clientsMtx sync.RWMutex
-    clients    map[string]client.IngesterClient
-    quit       chan struct{}
-    done       chan struct{}
+    cfg         Config
+    ring        ring.ReadRing
+    clientsMtx  sync.RWMutex
+    clientCache *ingester_client.IngesterClientCache
+    quit        chan struct{}
+    done        chan struct{}

     billingClient *billing.Client
@@ -73,11 +72,12 @@ type Config struct {
     BillingConfig        billing.Config
     IngesterClientConfig ingester_client.Config

-    ReplicationFactor   int
-    RemoteTimeout       time.Duration
-    ClientCleanupPeriod time.Duration
-    IngestionRateLimit  float64
-    IngestionBurstSize  int
+    ReplicationFactor    int
+    RemoteTimeout        time.Duration
+    ClientCleanupPeriod  time.Duration
+    IngestionRateLimit   float64
+    IngestionBurstSize   int
+    HealthCheckIngesters bool

     // for testing
     ingesterClientFactory func(addr string, cfg ingester_client.Config) (client.IngesterClient, error)
@@ -93,6 +93,7 @@ func (cfg *Config) RegisterFlags(f *flag.FlagSet) {
     flag.DurationVar(&cfg.ClientCleanupPeriod, "distributor.client-cleanup-period", 15*time.Second, "How frequently to clean up clients for ingesters that have gone away.")
     flag.Float64Var(&cfg.IngestionRateLimit, "distributor.ingestion-rate-limit", 25000, "Per-user ingestion rate limit in samples per second.")
     flag.IntVar(&cfg.IngestionBurstSize, "distributor.ingestion-burst-size", 50000, "Per-user allowed ingestion burst size (in number of samples).")
+    flag.BoolVar(&cfg.HealthCheckIngesters, "distributor.health-check-ingesters", false, "Run a health check on each ingester client during the cleanup period.")
 }

 // New constructs a new Distributor
@@ -116,7 +117,7 @@ func New(cfg Config, ring ring.ReadRing) (*Distributor, error) {
     d := &Distributor{
         cfg:  cfg,
         ring: ring,
-        clients:     map[string]client.IngesterClient{},
+        clientCache: ingester_client.NewIngesterClientCache(cfg.ingesterClientFactory, cfg.IngesterClientConfig),
         quit: make(chan struct{}),
         done: make(chan struct{}),
         billingClient: billingClient,
@@ -170,6 +171,9 @@ func (d *Distributor) Run() {
     select {
     case <-cleanupClients.C:
         d.removeStaleIngesterClients()
+        if d.cfg.HealthCheckIngesters {
+            d.healthCheckAndRemoveIngesters()
+        }
     case <-d.quit:
         close(d.done)
         return
@@ -184,52 +188,34 @@ func (d *Distributor) Stop() {
 }

 func (d *Distributor) removeStaleIngesterClients() {
-    d.clientsMtx.Lock()
-    defer d.clientsMtx.Unlock()
Review discussion on this change:

- :-( I prefer the defer way, mainly because it makes the lock more robust to future modifications. As the remote timeout is set to 2s, and the removeStaleIngesterClients function only runs every 15s, do we really need the extra goroutines and waitgroup?

- I also prefer the defer way. Since we have the mutex, even one 2s timeout would block any other rules getting evaluated during that time, which concerned me. I could get rid of the wait group but keep the goroutines. That should be OK, since all the healthchecks should be done after at most 2s, and the cleanup period is 15s by default. What do you think of that?

- Of course (d'oh). How about making it all nice and inline, synchronous code that builds a new clients dict without holding the lock, and then replaces the old one under the lock? (A sketch of this idea follows below.)

- I split the logic for removing stale ingester clients and health checking the ingester clients. I think everything is properly deferred now. Let me know what you think!
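For illustration, a minimal sketch of that "build a new clients dict without the lock, swap it under the lock" suggestion, assuming the pre-pool Distributor fields (a clients map guarded by clientsMtx). It is hypothetical, elides closing the dropped connections, and is not the code this PR ends up with:

func (d *Distributor) removeStaleIngesterClients() {
    // Snapshot the addresses currently in the ring; no lock needed for this.
    ingesters := map[string]struct{}{}
    for _, ing := range d.ring.GetAll() {
        ingesters[ing.Addr] = struct{}{}
    }

    // Build a fresh map of still-known clients while holding only the read lock.
    d.clientsMtx.RLock()
    fresh := make(map[string]client.IngesterClient, len(d.clients))
    for addr, c := range d.clients {
        if _, ok := ingesters[addr]; ok {
            fresh[addr] = c
        }
    }
    d.clientsMtx.RUnlock()

    // Swap in the rebuilt map, holding the write lock only briefly.
    // Caveat: a client added concurrently between the two critical sections
    // would be dropped here and simply recreated on its next use.
    d.clientsMtx.Lock()
    d.clients = fresh
    d.clientsMtx.Unlock()
}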
     ingesters := map[string]struct{}{}
     for _, ing := range d.ring.GetAll() {
         ingesters[ing.Addr] = struct{}{}
     }

-    for addr, client := range d.clients {
+    for _, addr := range d.clientCache.RegisteredAddresses() {
Review discussion on this loop:

- Seems we are doing two kinds of cleanup now: removing cache entries which are "stale" because the ring no longer references an ingester at that address, and removing entries which fail the healthcheck. Do we still need the first one?

- We could get away with only the healthcheck loop (after everyone has upgraded their ingesters and turned the flag on). However, I like the difference in expected behavior/logging between removing stale ingesters (expected) and failing ingesters (unexpected).
         if _, ok := ingesters[addr]; ok {
             continue
         }
         level.Info(util.Logger).Log("msg", "removing stale ingester client", "addr", addr)
-        delete(d.clients, addr)
-
-        // Do the gRPC closing in the background since it might take a while and
-        // we're holding a mutex.
-        go func(addr string, closer io.Closer) {
-            if err := closer.Close(); err != nil {
-                level.Error(util.Logger).Log("msg", "error closing connection to ingester", "ingester", addr, "err", err)
-            }
-        }(addr, client.(io.Closer))
+        d.clientCache.RemoveClientFor(addr)
     }
 }

-func (d *Distributor) getClientFor(ingester *ring.IngesterDesc) (client.IngesterClient, error) {
-    d.clientsMtx.RLock()
-    client, ok := d.clients[ingester.Addr]
-    d.clientsMtx.RUnlock()
-    if ok {
-        return client, nil
-    }
-
-    d.clientsMtx.Lock()
-    defer d.clientsMtx.Unlock()
-    client, ok = d.clients[ingester.Addr]
-    if ok {
-        return client, nil
-    }
-
-    client, err := d.cfg.ingesterClientFactory(ingester.Addr, d.cfg.IngesterClientConfig)
-    if err != nil {
-        return nil, err
+func (d *Distributor) healthCheckAndRemoveIngesters() {
+    for _, addr := range d.clientCache.RegisteredAddresses() {
+        client, err := d.clientCache.GetClientFor(addr)
+        if err != nil {
Review discussion on this error branch:

- This can only happen due to some race between [comment truncated in the original].

- I refactored this to the pool, but it still has an if around it in case someone deletes the entry from the pool while a previous healthcheck is happening. Right now that shouldn't happen, but that would be an annoying bug to run into if things change.
+            // if there is no client, we don't need to health check it
+            level.Warn(util.Logger).Log("msg", "could not create client for", "addr", addr)
+            continue
+        }
+        err = ingester_client.HealthCheck(client, d.cfg.RemoteTimeout)
+        if err != nil {
+            level.Warn(util.Logger).Log("msg", "removing ingester failing healthcheck", "addr", addr, "reason", err)
+            d.clientCache.RemoveClientFor(addr)
+        }
+    }
-    d.clients[ingester.Addr] = client
-    return client, nil
 }
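The ingester_client.HealthCheck helper called above is not among the files shown in this excerpt. A minimal sketch of what such a helper might look like, assuming the IngesterClient interface exposes the standard grpc_health_v1 Check RPC (an assumption for illustration, not the PR's actual code):

package client

import (
    "context"
    "fmt"
    "time"

    "google.golang.org/grpc/health/grpc_health_v1"
)

// HealthCheck performs a single gRPC health check against the given ingester
// client, bounded by the supplied timeout, and returns an error if the
// ingester is not ready to serve.
func HealthCheck(client IngesterClient, timeout time.Duration) error {
    ctx, cancel := context.WithTimeout(context.Background(), timeout)
    defer cancel()

    resp, err := client.Check(ctx, &grpc_health_v1.HealthCheckRequest{})
    if err != nil {
        return err
    }
    if resp.Status != grpc_health_v1.HealthCheckResponse_SERVING {
        return fmt.Errorf("failing healthcheck, status: %s", resp.Status)
    }
    return nil
}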
 func tokenForLabels(userID string, labels []client.LabelPair) (uint32, error) {

@@ -412,7 +398,7 @@ func (d *Distributor) sendSamples(ctx context.Context, ingester *ring.IngesterDe
 }

 func (d *Distributor) sendSamplesErr(ctx context.Context, ingester *ring.IngesterDesc, samples []*sampleTracker) error {
-    c, err := d.getClientFor(ingester)
+    c, err := d.clientCache.GetClientFor(ingester.Addr)
     if err != nil {
         return err
     }

@@ -449,7 +435,7 @@ func (d *Distributor) Query(ctx context.Context, from, to model.Time, matchers .
     metricNameMatcher, _, ok := util.ExtractMetricNameMatcherFromMatchers(matchers)

-    req, err := util.ToQueryRequest(from, to, matchers)
+    req, err := ingester_client.ToQueryRequest(from, to, matchers)
     if err != nil {
         return err
     }

@@ -529,7 +515,7 @@ func (d *Distributor) queryIngesters(ctx context.Context, ingesters []*ring.Inge
 }

 func (d *Distributor) queryIngester(ctx context.Context, ing *ring.IngesterDesc, req *client.QueryRequest) (model.Matrix, error) {
-    client, err := d.getClientFor(ing)
+    client, err := d.clientCache.GetClientFor(ing.Addr)
     if err != nil {
         return nil, err
     }

@@ -541,7 +527,7 @@ func (d *Distributor) queryIngester(ctx context.Context, ing *ring.IngesterDesc,
         return nil, err
     }

-    return util.FromQueryResponse(resp), nil
+    return ingester_client.FromQueryResponse(resp), nil
 }

 // forAllIngesters runs f, in parallel, for all ingesters

@@ -550,7 +536,7 @@ func (d *Distributor) forAllIngesters(f func(client.IngesterClient) (interface{}
     ingesters := d.ring.GetAll()
     for _, ingester := range ingesters {
         go func(ingester *ring.IngesterDesc) {
-            client, err := d.getClientFor(ingester)
+            client, err := d.clientCache.GetClientFor(ingester.Addr)
             if err != nil {
                 errs <- err
                 return

@@ -609,7 +595,7 @@ func (d *Distributor) LabelValuesForLabelName(ctx context.Context, labelName mod

 // MetricsForLabelMatchers gets the metrics that match said matchers
 func (d *Distributor) MetricsForLabelMatchers(ctx context.Context, from, through model.Time, matchers ...*labels.Matcher) ([]metric.Metric, error) {
-    req, err := util.ToMetricsForLabelMatchersRequest(from, through, matchers)
+    req, err := ingester_client.ToMetricsForLabelMatchersRequest(from, through, matchers)
     if err != nil {
         return nil, err
     }

@@ -623,7 +609,7 @@ func (d *Distributor) MetricsForLabelMatchers(ctx context.Context, from, through
     metrics := map[model.Fingerprint]model.Metric{}
     for _, resp := range resps {
-        ms := util.FromMetricsForLabelMatchersResponse(resp.(*client.MetricsForLabelMatchersResponse))
+        ms := ingester_client.FromMetricsForLabelMatchersResponse(resp.(*client.MetricsForLabelMatchersResponse))
         for _, m := range ms {
             metrics[m.Fingerprint()] = m
         }

@@ -677,7 +663,7 @@ func (d *Distributor) AllUserStats(ctx context.Context) ([]UserIDStats, error) {
     // Not using d.forAllIngesters(), so we can fail after first error.
     ingesters := d.ring.GetAll()
     for _, ingester := range ingesters {
-        client, err := d.getClientFor(ingester)
+        client, err := d.clientCache.GetClientFor(ingester.Addr)
         if err != nil {
             return nil, err
         }

@@ -736,6 +722,6 @@ func (d *Distributor) Collect(ch chan<- prometheus.Metric) {
     ch <- prometheus.MustNewConstMetric(
         numClientsDesc,
         prometheus.GaugeValue,
-        float64(len(d.clients)),
+        float64(d.clientCache.Count()),
     )
 }
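The ingester-side healthcheck endpoint referred to in the PR title is also not among the files shown here. One conventional way to expose such an endpoint is to register the stock gRPC health service on the ingester's gRPC server; the sketch below illustrates that approach under that assumption and is not necessarily how this PR implements it:

package ingester // hypothetical placement, for illustration only

import (
    "google.golang.org/grpc"
    "google.golang.org/grpc/health"
    "google.golang.org/grpc/health/grpc_health_v1"
)

// registerHealthServer wires the standard gRPC health service into the
// ingester's gRPC server so that distributors can call Check against it.
func registerHealthServer(server *grpc.Server) {
    healthServer := health.NewServer()
    // An empty service name reports the overall health of the server.
    healthServer.SetServingStatus("", grpc_health_v1.HealthCheckResponse_SERVING)
    grpc_health_v1.RegisterHealthServer(server, healthServer)
}

With a setup like this, the distributor-side HealthCheck sketched earlier would need its Check call routed to the health service on the same connection, for example via grpc_health_v1.NewHealthClient.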
@@ -0,0 +1,89 @@
package client

import (
    io "io"
    "sync"

    "github.com/go-kit/kit/log/level"
    "github.com/weaveworks/cortex/pkg/util"
)

// Factory defines the signature for an ingester client factory
type Factory func(addr string, cfg Config) (IngesterClient, error)

// IngesterClientCache holds a cache of ingester clients
type IngesterClientCache struct {
Review discussion on the type name:

- Is "cache" the right word? Elsewhere I've seen this called a "connection pool", except we have at most one connection per endpoint. Maybe just some more explanation of the intended uses would help.

- I like IngesterPool, I will change it to that, and add some more explanations.
    sync.RWMutex
    clients map[string]IngesterClient

    ingesterClientFactory Factory
    ingesterClientConfig  Config
}

// NewIngesterClientCache creates a new cache
func NewIngesterClientCache(factory Factory, config Config) *IngesterClientCache {
    return &IngesterClientCache{
        clients:               map[string]IngesterClient{},
        ingesterClientFactory: factory,
        ingesterClientConfig:  config,
    }
}

// GetClientFor gets the client for the specified address. If it does not exist it will make a new client
// at that address
func (cache *IngesterClientCache) GetClientFor(addr string) (IngesterClient, error) {
    cache.RLock()
    client, ok := cache.clients[addr]
    cache.RUnlock()
    if ok {
        return client, nil
    }

    cache.Lock()
    defer cache.Unlock()
    client, ok = cache.clients[addr]
    if ok {
        return client, nil
    }

    client, err := cache.ingesterClientFactory(addr, cache.ingesterClientConfig)
    if err != nil {
        return nil, err
    }
    cache.clients[addr] = client
    return client, nil
}
// RemoveClientFor removes the client with the specified address
func (cache *IngesterClientCache) RemoveClientFor(addr string) {
    cache.Lock()
    defer cache.Unlock()
    client, ok := cache.clients[addr]
    if ok {
        delete(cache.clients, addr)
        // Close in the background since this operation may take a while and we are holding the mutex.
        go func(addr string, closer io.Closer) {
            if err := closer.Close(); err != nil {
                level.Error(util.Logger).Log("msg", "error closing connection to ingester", "ingester", addr, "err", err)
            }
        }(addr, client.(io.Closer))
    }
}

// RegisteredAddresses returns all the addresses that a client is cached for
func (cache *IngesterClientCache) RegisteredAddresses() []string {
    result := []string{}
    cache.RLock()
    defer cache.RUnlock()
    for addr := range cache.clients {
        result = append(result, addr)
    }
    return result
}

// Count returns how many clients are in the cache
func (cache *IngesterClientCache) Count() int {
    cache.RLock()
    defer cache.RUnlock()
    return len(cache.clients)
}
@@ -0,0 +1,62 @@
package client

import (
    fmt "fmt"
    "testing"

    "google.golang.org/grpc/health/grpc_health_v1"
)

func (i mockIngester) Close() error {
    return nil
}

func TestIngesterCache(t *testing.T) {
    buildCount := 0
    factory := func(addr string, _ Config) (IngesterClient, error) {
        if addr == "bad" {
            return nil, fmt.Errorf("Fail")
        }
        buildCount++
        return mockIngester{happy: true, status: grpc_health_v1.HealthCheckResponse_SERVING}, nil
    }
    cache := NewIngesterClientCache(factory, Config{})

    cache.GetClientFor("1")
    if buildCount != 1 {
        t.Errorf("Did not create client")
    }

    cache.GetClientFor("1")
    if buildCount != 1 {
        t.Errorf("Created client that should have been cached")
    }

    cache.GetClientFor("2")
    if cache.Count() != 2 {
        t.Errorf("Expected Count() = 2, got %d", cache.Count())
    }

    cache.RemoveClientFor("1")
    if cache.Count() != 1 {
        t.Errorf("Expected Count() = 1, got %d", cache.Count())
    }

    cache.GetClientFor("1")
    if buildCount != 3 || cache.Count() != 2 {
        t.Errorf("Did not re-create client correctly")
    }

    _, err := cache.GetClientFor("bad")
    if err == nil {
        t.Errorf("Bad create should have returned an error")
    }
    if cache.Count() != 2 {
        t.Errorf("Bad create should not have been added to cache")
    }

    addrs := cache.RegisteredAddresses()
    if len(addrs) != cache.Count() {
        t.Errorf("Lengths of registered addresses and cache.Count() do not match")
    }
}
Review comment on the new flag's help text ("Run a health check on each ingester client during the cleanup period."):

- Can we say "during periodic cleanup"? At first I thought "the cleanup period" was maybe a period during shutdown when we clean up.