
query rejection #6005

Merged 13 commits on Jul 4, 2024
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -5,6 +5,7 @@
* [CHANGE] Ingester: Remove `-querier.query-store-for-labels-enabled` flag. Querying long-term store for labels is always enabled. #5984
* [CHANGE] Server: Instrument `cortex_request_duration_seconds` metric with native histogram. If `native-histograms` feature is enabled in monitoring Prometheus then the metric name needs to be updated in your dashboards. #6056
* [FEATURE] Ingester: Experimental: Enable native histogram ingestion via `-blocks-storage.tsdb.enable-native-histograms` flag. #5986
* [FEATURE] Query Frontend: Added a query rejection mechanism to block resource-intensive queries. #6005
* [ENHANCEMENT] rulers: Add support to persist tokens in rulers. #5987
* [ENHANCEMENT] Query Frontend/Querier: Added store gateway postings touched count and touched size in Querier stats and log in Query Frontend. #5892
* [ENHANCEMENT] Query Frontend/Querier: Returns `warnings` on prometheus query responses. #5916
71 changes: 69 additions & 2 deletions docs/configuration/config-file-reference.md
@@ -3285,6 +3285,20 @@ query_priority:
# List of priority definitions.
[priorities: <list of PriorityDef> | default = []]

# Configuration for query rejection.
query_rejection:
# Whether query rejection is enabled.
# CLI flag: -frontend.query-rejection.enabled
[enabled: <boolean> | default = false]

# List of query_attributes to match and reject queries. A query is rejected if
# it matches any query_attribute in this list. Each query_attribute has
# several properties (e.g., regex, time_window, user_agent), and all specified
# properties must match for a query_attribute to be considered a match. Only
# the specified properties are checked, and an AND operator is applied to
# them.
[query_attributes: <list of QueryAttribute> | default = []]

# Duration to delay the evaluation of rules to ensure the underlying metrics
# have been pushed to Cortex.
# CLI flag: -ruler.evaluation-delay-duration
@@ -5345,14 +5359,24 @@ limits:
# priority level. Value between 0 and 1 will be used as a percentage.
[reserved_queriers: <float> | default = 0]

# List of query attributes to assign the priority.
# List of query_attributes to match and assign priority to queries. A query is
# assigned to this priority if it matches any query_attribute in this list. Each
# query_attribute has several properties (e.g., regex, time_window, user_agent),
# and all specified properties must match for a query_attribute to be considered
# a match. Only the specified properties are checked, and an AND operator is
# applied to them.
[query_attributes: <list of QueryAttribute> | default = []]
```
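The matching semantics described above (any `query_attribute` in the list may match, but within one attribute every set property must match) can be sketched as follows. This is an illustrative sketch only, not the actual Cortex implementation; all type and function names are hypothetical, and only a few of the documented properties are shown.

```go
package main

import (
	"fmt"
	"regexp"
)

// QueryAttribute mirrors a subset of the fields documented above.
// An empty field is treated as "not set" and is skipped during matching.
type QueryAttribute struct {
	APIType        string
	Regex          string
	UserAgentRegex string
}

// matches returns true only if every set property matches (AND semantics).
func (qa QueryAttribute) matches(apiType, query, userAgent string) bool {
	if qa.APIType != "" && qa.APIType != apiType {
		return false
	}
	if qa.Regex != "" {
		if ok, _ := regexp.MatchString(qa.Regex, query); !ok {
			return false
		}
	}
	if qa.UserAgentRegex != "" {
		if ok, _ := regexp.MatchString(qa.UserAgentRegex, userAgent); !ok {
			return false
		}
	}
	return true
}

// anyMatch applies OR semantics across the list of attributes.
func anyMatch(attrs []QueryAttribute, apiType, query, userAgent string) bool {
	for _, qa := range attrs {
		if qa.matches(apiType, query, userAgent) {
			return true
		}
	}
	return false
}

func main() {
	attrs := []QueryAttribute{{APIType: "query_range", Regex: `rate\(.*\[30d\]\)`}}
	// Matches: both set properties (api_type and regex) match.
	fmt.Println(anyMatch(attrs, "query_range", `sum(rate(http_requests_total[30d]))`, "Grafana"))
	// No match: the regex matches but the api_type differs.
	fmt.Println(anyMatch(attrs, "query", `sum(rate(http_requests_total[30d]))`, "Grafana"))
}
```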

### `QueryAttribute`

```yaml
# Regex that the query string should match. If not set, it won't be checked.
# API type for the query. Should be one of: query, query_range, series,
# labels, label_values. If not set, it won't be checked.
[api_type: <string> | default = ""]

# Regex that the query string (or at least one of the matchers in metadata
# query) should match. If not set, it won't be checked.
[regex: <string> | default = ""]

# Overall data select time window (including range selectors, modifiers and
@@ -5368,6 +5392,49 @@ time_window:
# lookback delta) that the query should be within. If set to 0, it won't be
# checked.
[end: <int> | default = 0]

# The query time range should be within this limit to match. In most use
# cases only one of min or max is set. If not set, it won't be checked.
time_range_limit:
# Duration (e.g. 12h, 1d, 15d). The query time range should be greater than
# or equal to this value to match. For example, if this value is 20d, queries
# whose range is 20d or longer will match. If set to 0, it won't be checked.
[min: <int> | default = 0]

# Duration (e.g. 12h, 1d, 15d). The query time range should be less than or
# equal to this value to match. For example, if this value is 24h, queries
# whose range is 24h or shorter will match. If set to 0, it won't be checked.
[max: <int> | default = 0]

# The query step, if provided, should be within this limit to match. If not
# set, it won't be checked. This property applies only to range queries and is
# ignored for other query types.
query_step_limit:
# Query step should be above or equal to this value to match. If set to 0, it
# won't be checked.
[min: <int> | default = 0]

# Query step should be below or equal to this value to match. If set to 0, it
# won't be checked.
[max: <int> | default = 0]
Comment on lines +5415 to +5422:

Should we incorporate this config here and deprecate it?

# Max number of steps allowed for every subquery expression in query. Number of
# steps is calculated using subquery range / step. A value > 0 enables it.
# CLI flag: -querier.max-subquery-steps
[max_subquery_steps: <int> | default = 0]

I'm trying to not have 2 ways of setting query limits.


# Regex that User-Agent header of the request should match. If not set, it won't
# be checked.
[user_agent_regex: <string> | default = ""]

# Grafana includes the X-Dashboard-Uid header in query requests. If this field
# is provided, the X-Dashboard-Uid header of the request should match this
# value. If not set, it won't be checked. This property won't be applied to
# metadata queries.
[dashboard_uid: <string> | default = ""]

# Grafana includes the X-Panel-Id header in query requests. If this field is
# provided, the X-Panel-Id header of the request should match this value. If
# not set, it won't be checked. This property won't be applied to metadata
# queries.
[panel_id: <string> | default = ""]
```
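Putting the fields above together, a query-rejection configuration might look like the fragment below. This is an illustrative sketch only; the attribute values are made up, and the duration format follows the examples given in the field descriptions (12h, 1d, 20d). Remember that all properties set on one entry must match (AND), while adding more entries to the list widens what is rejected (OR).

```yaml
query_rejection:
  enabled: true
  query_attributes:
    # Reject long-range rate queries coming from Grafana.
    - api_type: "query_range"
      regex: ".*rate.*"
      time_range_limit:
        min: 20d
      user_agent_regex: "Grafana.*"
```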

### `DisabledRuleGroup`
1 change: 1 addition & 0 deletions docs/configuration/v1-guarantees.md
@@ -115,3 +115,4 @@ Currently experimental features are:
- Ingestion can be enabled by setting `-blocks-storage.tsdb.enable-native-histograms=true` on Ingester.
- String interning for metrics labels
- Enable string interning for metrics labels by setting `-ingester.labels-string-interning-enabled` on Ingester.
- Query-frontend: query rejection (`-frontend.query-rejection.enabled`)
26 changes: 15 additions & 11 deletions integration/e2ecortex/client.go
@@ -234,7 +234,7 @@ func (c *Client) QueryRange(query string, start, end time.Time, step time.Durati
}

// QueryRangeRaw runs a ranged query directly against the querier API.
func (c *Client) QueryRangeRaw(query string, start, end time.Time, step time.Duration) (*http.Response, []byte, error) {
func (c *Client) QueryRangeRaw(query string, start, end time.Time, step time.Duration, headers map[string]string) (*http.Response, []byte, error) {
addr := fmt.Sprintf(
"http://%s/api/prom/api/v1/query_range?query=%s&start=%s&end=%s&step=%s",
c.querierAddress,
@@ -244,11 +244,11 @@ func (c *Client) QueryRangeRaw(query string, start, end time.Time, step time.Dur
strconv.FormatFloat(step.Seconds(), 'f', -1, 64),
)

return c.query(addr)
return c.query(addr, headers)
}

// QueryRaw runs a query directly against the querier API.
func (c *Client) QueryRaw(query string, ts time.Time) (*http.Response, []byte, error) {
func (c *Client) QueryRaw(query string, ts time.Time, headers map[string]string) (*http.Response, []byte, error) {
u := &url.URL{
Scheme: "http",
Path: fmt.Sprintf("%s/api/prom/api/v1/query", c.querierAddress),
@@ -260,11 +260,11 @@ func (c *Client) QueryRaw(query string, ts time.Time) (*http.Response, []byte, e
q.Set("time", FormatTime(ts))
}
u.RawQuery = q.Encode()
return c.query(u.String())
return c.query(u.String(), headers)
}

// SeriesRaw runs a series request directly against the querier API.
func (c *Client) SeriesRaw(matches []string, startTime, endTime time.Time) (*http.Response, []byte, error) {
func (c *Client) SeriesRaw(matches []string, startTime, endTime time.Time, headers map[string]string) (*http.Response, []byte, error) {
u := &url.URL{
Scheme: "http",
Path: fmt.Sprintf("%s/api/prom/api/v1/series", c.querierAddress),
@@ -283,11 +283,11 @@ func (c *Client) SeriesRaw(matches []string, startTime, endTime time.Time) (*htt
}

u.RawQuery = q.Encode()
return c.query(u.String())
return c.query(u.String(), headers)
}

// LabelNamesRaw runs a label names request directly against the querier API.
func (c *Client) LabelNamesRaw(matches []string, startTime, endTime time.Time) (*http.Response, []byte, error) {
func (c *Client) LabelNamesRaw(matches []string, startTime, endTime time.Time, headers map[string]string) (*http.Response, []byte, error) {
u := &url.URL{
Scheme: "http",
Path: fmt.Sprintf("%s/api/prom/api/v1/labels", c.querierAddress),
@@ -306,11 +306,11 @@ func (c *Client) LabelNamesRaw(matches []string, startTime, endTime time.Time) (
}

u.RawQuery = q.Encode()
return c.query(u.String())
return c.query(u.String(), headers)
}

// LabelValuesRaw runs a label values request directly against the querier API.
func (c *Client) LabelValuesRaw(label string, matches []string, startTime, endTime time.Time) (*http.Response, []byte, error) {
func (c *Client) LabelValuesRaw(label string, matches []string, startTime, endTime time.Time, headers map[string]string) (*http.Response, []byte, error) {
u := &url.URL{
Scheme: "http",
Path: fmt.Sprintf("%s/api/prom/api/v1/label/%s/values", c.querierAddress, label),
@@ -329,7 +329,7 @@ func (c *Client) LabelValuesRaw(label string, matches []string, startTime, endTi
}

u.RawQuery = q.Encode()
return c.query(u.String())
return c.query(u.String(), headers)
}

// RemoteRead runs a remote read query.
@@ -398,7 +398,7 @@ func (c *Client) RemoteRead(matchers []*labels.Matcher, start, end time.Time, st
return &resp, nil
}

func (c *Client) query(addr string) (*http.Response, []byte, error) {
func (c *Client) query(addr string, headers map[string]string) (*http.Response, []byte, error) {
ctx, cancel := context.WithTimeout(context.Background(), c.timeout)
defer cancel()

@@ -409,6 +409,10 @@ func (c *Client) query(addr string) (*http.Response, []byte, error) {

req.Header.Set("X-Scope-OrgID", c.orgID)

for key, value := range headers {
req.Header.Set(key, value)
}

retries := backoff.New(ctx, backoff.Config{
MinBackoff: 1 * time.Second,
MaxBackoff: 2 * time.Second,
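The header-forwarding change to `query` above follows a common pattern: set the fixed headers first, then copy a caller-supplied map onto the request, so callers can add per-request headers (such as User-Agent or X-Dashboard-Uid) or override the defaults. A standalone sketch of that pattern, with hypothetical names rather than Cortex code:

```go
package main

import (
	"fmt"
	"net/http"
)

// newQueryRequest builds a GET request with a fixed tenant header, then
// applies caller-supplied headers, which may add to or override the defaults.
func newQueryRequest(addr, orgID string, headers map[string]string) (*http.Request, error) {
	req, err := http.NewRequest(http.MethodGet, addr, nil)
	if err != nil {
		return nil, err
	}
	req.Header.Set("X-Scope-OrgID", orgID)

	// Applied last, so callers can override any default header.
	for key, value := range headers {
		req.Header.Set(key, value)
	}
	return req, nil
}

func main() {
	req, err := newQueryRequest("http://localhost/api/v1/query", "tenant-1",
		map[string]string{"User-Agent": "grafana-agent/0.1"})
	if err != nil {
		panic(err)
	}
	fmt.Println(req.Header.Get("X-Scope-OrgID"), req.Header.Get("User-Agent"))
}
```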
12 changes: 6 additions & 6 deletions integration/querier_test.go
@@ -901,22 +901,22 @@ func TestQuerierWithBlocksStorageLimits(t *testing.T) {
require.NoError(t, err)

// We expect all queries hitting 422 exceeded series limit on store gateway.
resp, body, err := c.QueryRangeRaw(`{job="test"}`, seriesTimestamp.Add(-time.Second), seriesTimestamp, time.Second)
resp, body, err := c.QueryRangeRaw(`{job="test"}`, seriesTimestamp.Add(-time.Second), seriesTimestamp, time.Second, map[string]string{})
require.NoError(t, err)
require.Equal(t, http.StatusUnprocessableEntity, resp.StatusCode)
require.Contains(t, string(body), "exceeded series limit")

resp, body, err = c.SeriesRaw([]string{`{job="test"}`}, seriesTimestamp.Add(-time.Second), seriesTimestamp)
resp, body, err = c.SeriesRaw([]string{`{job="test"}`}, seriesTimestamp.Add(-time.Second), seriesTimestamp, map[string]string{})
require.NoError(t, err)
require.Equal(t, http.StatusUnprocessableEntity, resp.StatusCode)
require.Contains(t, string(body), "exceeded series limit")

resp, body, err = c.LabelNamesRaw([]string{`{job="test"}`}, seriesTimestamp.Add(-time.Second), seriesTimestamp)
resp, body, err = c.LabelNamesRaw([]string{`{job="test"}`}, seriesTimestamp.Add(-time.Second), seriesTimestamp, map[string]string{})
require.NoError(t, err)
require.Equal(t, http.StatusUnprocessableEntity, resp.StatusCode)
require.Contains(t, string(body), "exceeded series limit")

resp, body, err = c.LabelValuesRaw("job", []string{`{job="test"}`}, seriesTimestamp.Add(-time.Second), seriesTimestamp)
resp, body, err = c.LabelValuesRaw("job", []string{`{job="test"}`}, seriesTimestamp.Add(-time.Second), seriesTimestamp, map[string]string{})
require.NoError(t, err)
require.Equal(t, http.StatusUnprocessableEntity, resp.StatusCode)
require.Contains(t, string(body), "exceeded series limit")
@@ -994,7 +994,7 @@ func TestQuerierWithStoreGatewayDataBytesLimits(t *testing.T) {
require.NoError(t, err)

// We expect all queries hitting 422 exceeded series limit
resp, body, err := c.QueryRaw(`{job="test"}`, series2Timestamp)
resp, body, err := c.QueryRaw(`{job="test"}`, series2Timestamp, map[string]string{})
require.NoError(t, err)
require.Equal(t, http.StatusUnprocessableEntity, resp.StatusCode)
require.Contains(t, string(body), "exceeded bytes limit")
@@ -1245,7 +1245,7 @@ func TestQuerierMaxSamplesLimit(t *testing.T) {
var body []byte
for retries.Ongoing() {
// We expect request to hit max samples limit.
res, body, err = c.QueryRaw(`sum({job="test"})`, series1Timestamp)
res, body, err = c.QueryRaw(`sum({job="test"})`, series1Timestamp, map[string]string{})
if err == nil {
break
}