
Pool buffers when sending write request #5195

Merged · 19 commits · Jun 8, 2023
Changes from 16 commits
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -16,6 +16,7 @@
* `-<prefix>.initial-connection-window-size`
* [ENHANCEMENT] Query-frontend: added "response_size_bytes" field to "query stats" log. #5196
* [ENHANCEMENT] Querier: Refine error messages for per-tenant query limits, informing the user of the preferred strategy for not hitting the limit, in addition to how they may tweak the limit. #5059
* [ENHANCEMENT] Distributor: optimize sending of requests to ingesters by reusing memory buffers for marshalling requests. For now this optimization can be disabled by setting `-distributor.write-requests-buffer-pooling-enabled` to `false`. #5195
* [BUGFIX] Querier: don't leak memory when processing query requests from query-frontends (i.e. when the query-scheduler is disabled). #5199

### Mixin
11 changes: 11 additions & 0 deletions cmd/mimir/config-descriptor.json
@@ -1476,6 +1476,17 @@
],
"fieldValue": null,
"fieldDefaultValue": null
},
{
"kind": "field",
"name": "write_requests_buffer_pooling_enabled",
"required": false,
"desc": "Enable pooling of buffers used for marshaling write requests.",
"fieldValue": null,
"fieldDefaultValue": true,
"fieldFlag": "distributor.write-requests-buffer-pooling-enabled",
"fieldType": "boolean",
"fieldCategory": "experimental"
}
],
"fieldValue": null,
2 changes: 2 additions & 0 deletions cmd/mimir/help-all.txt.tmpl
@@ -1159,6 +1159,8 @@ Usage of ./cmd/mimir/mimir:
The prefix for the keys in the store. Should end with a /. (default "collectors/")
-distributor.ring.store string
Backend storage to use for the ring. Supported values are: consul, etcd, inmemory, memberlist, multi. (default "memberlist")
-distributor.write-requests-buffer-pooling-enabled
[experimental] Enable pooling of buffers used for marshaling write requests. (default true)
-enable-go-runtime-metrics
Set to true to enable all Go runtime metrics, such as go_sched_* and go_memstats_*.
-flusher.exit-after-flush
1 change: 1 addition & 0 deletions docs/sources/mimir/configure/about-versioning.md
@@ -114,6 +114,7 @@ The following features are currently experimental:
- Per-tenant Results cache TTL (`-query-frontend.results-cache-ttl`, `-query-frontend.results-cache-ttl-for-out-of-order-time-window`)
- Fetching TLS secrets from Vault for various clients (`-vault.enabled`)
- Timeseries Unmarshal caching optimization in distributor (`-timeseries-unmarshal-caching-optimization-enabled`)
- Reusing buffers for marshalling write requests in distributors (`-distributor.write-requests-buffer-pooling-enabled`)

## Deprecated features

@@ -768,6 +768,10 @@ instance_limits:
# per-tenant. Additional requests will be rejected. 0 = unlimited.
# CLI flag: -distributor.instance-limits.max-inflight-push-requests-bytes
[max_inflight_push_requests_bytes: <int> | default = 0]

# (experimental) Enable pooling of buffers used for marshaling write requests.
# CLI flag: -distributor.write-requests-buffer-pooling-enabled
[write_requests_buffer_pooling_enabled: <boolean> | default = true]
```

### ingester
20 changes: 18 additions & 2 deletions pkg/distributor/distributor.go
@@ -45,6 +45,7 @@ import (
"github.com/grafana/mimir/pkg/querier/stats"
"github.com/grafana/mimir/pkg/util"
util_math "github.com/grafana/mimir/pkg/util/math"
pool2 "github.com/grafana/mimir/pkg/util/pool"
"github.com/grafana/mimir/pkg/util/push"
"github.com/grafana/mimir/pkg/util/validation"
)
@@ -64,10 +65,12 @@ const (

// metaLabelTenantID is the name of the metric_relabel_configs label with tenant ID.
metaLabelTenantID = model.MetaLabelPrefix + "tenant_id"
)

const (
instanceIngestionRateTickInterval = time.Second

// Size of "slab" when using pooled buffers for marshaling write requests. When handling single Push request
// buffers for multiple write requests sent to ingesters will be allocated from single "slab", if there is enough space.
writeRequestSlabPoolSize = 512 * 1024
Collaborator: Can you clarify how this number was picked?

Member Author: I checked the average message size received by ingester Push handlers using this query:

sum by (namespace) (increase(cortex_request_message_bytes_sum{route="/cortex.Ingester/Push"}[$__rate_interval]))
/
sum by (namespace) (increase(cortex_request_message_bytes_count{route="/cortex.Ingester/Push"}[$__rate_interval]))

[Screenshot 2023-06-08 at 13:46:52: average Push message size per environment]

We see wildly different numbers across our environments. I picked a number that's big enough to handle a big message size (for the environment where the average is 300 KB), but that also provides enough space to keep many smaller messages.
)

// Distributor forwards appends and queries to individual ingesters.
@@ -133,6 +136,9 @@ type Distributor struct {
metadataValidationMetrics *validation.MetadataValidationMetrics

PushWithMiddlewares push.Func

// Pool of []byte used when marshalling write requests.
writeRequestBytePool sync.Pool
Collaborator: [nit] We typically have pools as global vars. Any pro of having it on the Distributor?

Member Author: I don't see why it would need to be a global variable when we have a logical place to put it. I understand that's not always the case, but I think it fits into the distributor nicely.

}

// Config contains the configuration required to
@@ -168,6 +174,8 @@ type Config struct {
// and access the deserialized write requests before/after they are pushed.
// These functions will only receive samples that don't get dropped by HA deduplication.
PushWrappers []PushWrapper `yaml:"-"`

WriteRequestsBufferPoolingEnabled bool `yaml:"write_requests_buffer_pooling_enabled" category:"experimental"`
}

// PushWrapper wraps around a push. It is similar to middleware.Interface.
@@ -181,6 +189,7 @@ func (cfg *Config) RegisterFlags(f *flag.FlagSet, logger log.Logger) {

f.IntVar(&cfg.MaxRecvMsgSize, "distributor.max-recv-msg-size", 100<<20, "Max message size in bytes that the distributors will accept for incoming push requests to the remote write API. If exceeded, the request will be rejected.")
f.DurationVar(&cfg.RemoteTimeout, "distributor.remote-timeout", 2*time.Second, "Timeout for downstream ingesters.")
f.BoolVar(&cfg.WriteRequestsBufferPoolingEnabled, "distributor.write-requests-buffer-pooling-enabled", true, "Enable pooling of buffers used for marshaling write requests.")

cfg.DefaultLimits.RegisterFlags(f)
}
@@ -204,6 +213,7 @@ const (
func New(cfg Config, clientConfig ingester_client.Config, limits *validation.Overrides, activeGroupsCleanupService *util.ActiveGroupsCleanupService, ingestersRing ring.ReadRing, canJoinDistributorsRing bool, reg prometheus.Registerer, log log.Logger) (*Distributor, error) {
if cfg.IngesterClientFactory == nil {
cfg.IngesterClientFactory = func(addr string) (ring_client.PoolClient, error) {
clientConfig.WriteRequestsBufferPoolingEnabled = cfg.WriteRequestsBufferPoolingEnabled
return ingester_client.MakeIngesterClient(addr, clientConfig)
}
}
@@ -1121,6 +1131,11 @@ func (d *Distributor) push(ctx context.Context, pushReq *push.Request) (*mimirpb
// so set this flag false and pass cleanup() to DoBatch.
cleanupInDefer = false

if d.cfg.WriteRequestsBufferPoolingEnabled {
slabPool := pool2.NewFastReleasingSlabPool[byte](&d.writeRequestBytePool, writeRequestSlabPoolSize)
Collaborator: Why do you need the fast releasing slab pool if you create a new slab pool for each request? Couldn't you just use the "base" one and release all slabs in the cleanup function passed to ring.DoBatch()?

Collaborator: I mean, I can see the fast releasing slab is an optimization because slabs may be released sooner, but can you confirm it's not strictly required and the base one could be used as well? I just want to make sure I understand the rationale.

Member Author: Your understanding is correct. It's an optimization to release slabs sooner.

I think this may actually be an interesting optimization in environments where the average size of write requests to ingesters is high and we're also sending requests to many ingesters at once.

localCtx = ingester_client.WithSlabPool(localCtx, slabPool)
}

err = ring.DoBatch(ctx, ring.WriteNoExtend, subRing, keys, func(ingester ring.InstanceDesc, indexes []int) error {
var timeseriesCount, metadataCount int
for _, i := range indexes {
@@ -1203,6 +1218,7 @@ func (d *Distributor) send(ctx context.Context, ingester ring.InstanceDesc, time
Metadata: metadata,
Source: source,
}

_, err = c.Push(ctx, &req)
if resp, ok := httpgrpc.HTTPResponseFromError(err); ok {
// Wrap HTTP gRPC error with more explanatory message.
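To make the pooling pattern in the hunk above easier to follow in isolation, here is a minimal standalone sketch: a long-lived sync.Pool of backing slabs (the Distributor's writeRequestBytePool in this PR) feeding a per-request FastReleasingSlabPool. It relies only on the API visible in this diff (NewFastReleasingSlabPool, Get returning a buffer plus a slab ID, Release taking that ID); the variable names, the 64 KiB request size and the standalone main are illustrative assumptions, not code from the change.

```go
package main

import (
	"fmt"
	"sync"

	"github.com/grafana/mimir/pkg/util/pool"
)

func main() {
	// Long-lived pool shared across push requests; in this PR it is the
	// Distributor's writeRequestBytePool field.
	var backing sync.Pool

	// Per-request pool: buffers are carved out of shared 512 KiB slabs taken
	// from the backing pool, and every Get also returns the ID of the slab
	// the buffer came from.
	slabPool := pool.NewFastReleasingSlabPool[byte](&backing, 512*1024)

	// One buffer per marshalled write request; several requests share the
	// same slab while there is space left in it.
	size := 64 * 1024
	buf, slabID := slabPool.Get(size)
	copy(buf[:size], "<marshalled WriteRequest bytes>")

	// Releasing by slab ID lets the slab go back to the backing pool as soon
	// as all buffers taken from it are done, instead of waiting for the whole
	// batch of ingester requests to finish ("fast releasing"). The zero check
	// mirrors ReturnBuffersToPool in the new buffering client below.
	if slabID != 0 {
		slabPool.Release(slabID)
	}
	fmt.Println("released slab", slabID)
}
```

In the actual push path the distributor does not use the slab pool directly like this; it attaches the pool to the per-ingester request context via WithSlabPool, and the new buffering client below takes buffers from it while marshalling and returns them once the gRPC call finishes.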
111 changes: 111 additions & 0 deletions pkg/ingester/client/buffering_client.go
@@ -0,0 +1,111 @@
package client

import (
"context"

"google.golang.org/grpc"

"github.com/grafana/mimir/pkg/mimirpb"
"github.com/grafana/mimir/pkg/util/pool"
)

// This is a copy of the (*ingesterClient).Push method, but accepting any message type.
func pushRaw(ctx context.Context, conn *grpc.ClientConn, msg interface{}, opts ...grpc.CallOption) (*mimirpb.WriteResponse, error) {
out := new(mimirpb.WriteResponse)
err := conn.Invoke(ctx, "/cortex.Ingester/Push", msg, out, opts...)
if err != nil {
return nil, err
}
return out, nil
}

// bufferPoolingIngesterClient implements IngesterClient, but overrides Push method to add pooling of buffers used to marshal write requests.
type bufferPoolingIngesterClient struct {
IngesterClient

conn *grpc.ClientConn

// This refers to the pushRaw function, but it is overridden in the benchmark to avoid doing actual gRPC calls.
pushRawFn func(ctx context.Context, conn *grpc.ClientConn, msg interface{}, opts ...grpc.CallOption) (*mimirpb.WriteResponse, error)
}

func newBufferPoolingIngesterClient(client IngesterClient, conn *grpc.ClientConn) *bufferPoolingIngesterClient {
c := &bufferPoolingIngesterClient{
IngesterClient: client,
conn: conn,
pushRawFn: pushRaw,
}
return c
}

// Push wraps WriteRequest to implement buffer pooling.
func (c *bufferPoolingIngesterClient) Push(ctx context.Context, in *mimirpb.WriteRequest, opts ...grpc.CallOption) (*mimirpb.WriteResponse, error) {
p := getPool(ctx)
if p == nil {
return c.IngesterClient.Push(ctx, in, opts...)
}

wr := &wrappedRequest{
WriteRequest: in,
slabPool: p,
}
// We can return all buffers back to slabPool when this method finishes.
defer wr.ReturnBuffersToPool()
Collaborator: The hard question is: is this safe, or could the message buffer be retained by gRPC for longer? Can you share more details about the arguments for "this is safe to do"? I still struggle a lot with this, because each time one of us looks at it, we come to a slightly different conclusion.

My question is not just about gRPC itself (for which you have added a nice test) but about the chain of middlewares we have. Our middlewares shouldn't retain it, but logging may be async, so it's worth double-checking.

Member Author (@pstibrany, Jun 8, 2023): One thing to realize is that we're doing this buffering on the client side. The "gRPC" we talk about here is the gRPC connection to the ingester.

Re middlewares... We set up our gRPC connection with the following interceptors. Looking at the implementation of these interceptors...

  • OpenTracingClientInterceptor has the ability to 1) pass the request to a span inclusion function, 2) log the payload (request), and 3) decorate the span using the request. Note that we don't configure OpenTracingClientInterceptor with any of these options.
  • The middleware.ClientUserHeaderInterceptor, middleware.UnaryClientInstrumentInterceptor, BackoffRetry and RateLimiter interceptors don't use the request in any way (apart from passing it further).

Member Author (@pstibrany, Jun 8, 2023): More places where our buffer could be passed around:

Let's start with binlogs. This is functionality configured by the env variable GRPC_BINARY_LOG_FILTER. When configured, gRPC logs each RPC in the format defined by the GrpcLogEntry proto. The design for this feature is at https://github.com/grpc/proposal/blob/master/A16-binary-logging.md.

When used, the request is passed to the logging method inside a binarylog.ClientMessage struct. This struct has toProto(), which will call another Marshal on our message. Once binarylog.ClientMessage is converted to binlogpb.GrpcLogEntry (now it contains our buffer from the pool), it's forwarded to the "sink" for writing.

There are three sink implementations in grpc: bufferedSink, noopSink and writerSink.

writerSink marshals *binlogpb.GrpcLogEntry and writes it to some writer. bufferedSink simply wraps writerSink but gives it a buffer to write output to, and also runs a goroutine to periodically flush the buffer. The important point is that it doesn't keep *binlogpb.GrpcLogEntry around, but marshals it immediately.

My conclusion is: using the binary logging feature is safe when using our message buffering.

Member Author: statsHandlers for the gRPC connection are configured via dial options. We don't use that feature in our code.

It's possible that I've missed something, but it seems to me that it's safe to reuse the buffer used to Marshal as soon as (*grpc.ClientConn).Invoke returns.

Collaborator: Thanks for this analysis, Peter. This is too valuable to get lost in a GitHub comment. WDYT about moving this to a docs/contributing/... page, where we can keep contributing to it the more we learn in the future?

Member Author: I've copied the notes above into a doc: #5216

Member Author: The above analysis considered the "happy path" only -- the message gets sent to the server and the server replies. However, in case of errors, messages can still be queued locally and sent later. #5830 fixes this.


return c.pushRawFn(ctx, c.conn, wr, opts...)
}

type poolKey int

var poolKeyValue poolKey = 1

func WithSlabPool(ctx context.Context, pool *pool.FastReleasingSlabPool[byte]) context.Context {
if pool != nil {
return context.WithValue(ctx, poolKeyValue, pool)
}
return ctx
}

func getPool(ctx context.Context) *pool.FastReleasingSlabPool[byte] {
v := ctx.Value(poolKeyValue)
if p, ok := v.(*pool.FastReleasingSlabPool[byte]); ok {
return p
}
return nil
}

type wrappedRequest struct {
*mimirpb.WriteRequest

slabPool *pool.FastReleasingSlabPool[byte]
slabID int
moreSlabIDs []int // Used in case Marshal gets called multiple times.
}

func (w *wrappedRequest) Marshal() ([]byte, error) {
size := w.WriteRequest.Size()
buf, slabID := w.slabPool.Get(size)

if w.slabID == 0 {
w.slabID = slabID
} else {
w.moreSlabIDs = append(w.moreSlabIDs, slabID)
}

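// MarshalToSizedBuffer fills the buffer from its end and returns the number of
// bytes written; since the buffer is sliced to exactly Size() bytes, n covers
// the whole marshalled message.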
n, err := w.WriteRequest.MarshalToSizedBuffer(buf[:size])
if err != nil {
return nil, err
}
return buf[:n], nil
}

func (w *wrappedRequest) ReturnBuffersToPool() {
if w.slabID != 0 {
w.slabPool.Release(w.slabID)
w.slabID = 0
}
for _, s := range w.moreSlabIDs {
w.slabPool.Release(s)
}
w.moreSlabIDs = nil
}
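The comment on pushRawFn above mentions a benchmark that overrides it to avoid real gRPC calls. As a rough, in-package sketch of how the pieces fit together (the benchmark name, the fake push function and the sample request contents are assumptions for illustration, not code from this PR), it could look like this:

```go
package client

import (
	"context"
	"sync"
	"testing"

	"google.golang.org/grpc"

	"github.com/grafana/mimir/pkg/mimirpb"
	"github.com/grafana/mimir/pkg/util/pool"
)

func BenchmarkPushWithPooledBuffersSketch(b *testing.B) {
	var backing sync.Pool
	slabPool := pool.NewFastReleasingSlabPool[byte](&backing, 512*1024)
	ctx := WithSlabPool(context.Background(), slabPool)

	c := &bufferPoolingIngesterClient{
		// IngesterClient and conn stay nil: the overridden pushRawFn below never
		// touches the connection, and Push only falls back to the embedded
		// client when no slab pool is present in the context.
		pushRawFn: func(_ context.Context, _ *grpc.ClientConn, msg interface{}, _ ...grpc.CallOption) (*mimirpb.WriteResponse, error) {
			// Marshal the wrapped request, as the gRPC codec would, so the buffer
			// really comes from (and is later returned to) the slab pool.
			_, err := msg.(*wrappedRequest).Marshal()
			return &mimirpb.WriteResponse{}, err
		},
	}

	req := &mimirpb.WriteRequest{
		Metadata: []*mimirpb.MetricMetadata{{MetricFamilyName: "test_metric", Help: "gives the request a non-zero size"}},
	}

	b.ReportAllocs()
	for i := 0; i < b.N; i++ {
		if _, err := c.Push(ctx, req); err != nil {
			b.Fatal(err)
		}
	}
}
```

Push attaches the slab pool from the context via a wrappedRequest and defers ReturnBuffersToPool, so every iteration reuses the same backing slabs instead of allocating a fresh marshalling buffer.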