fix(dynamodb): Misconfigured metrics causing empty graphs (#11283)

This PR corrects 3 misconfigured metrics we had on the `Table` construct. ### UserErrors Per the [documentation](https://docs.aws.amazon.com/amazondynamodb/latest/developerguide/metrics-dimensions.html) The `table.metricUserErrors()` does not emit the `TableName` dimension. It is actually an account (and region) wide metric. The fix was to remove the `TableName` dimensionality from the metric creation. ### SystemErrors Per the [documentation](https://docs.aws.amazon.com/amazondynamodb/latest/developerguide/metrics-dimensions.html) The `table.metricSystemErrors()` is always emitted with the `Operation` dimension, and our current implementation does not pass it. The fix adds an additional `operations` property to the method, that allows passing an array of operations, the returned metric will be a *SUM* over those operations. If no operation is passed, we sum all available operations. Since the current method returns a `Metric`, returning a math expression won't work since it is an `IMetric` that doesn't extend `Metric`. To avoid breaking changes, we introduce a new method, `metricSystemErrorsForOperations`: ```ts const totalSystemErrors = table.metricSystemErrorsForOperations(); const getPutSystemErrors = table.metricSystemErrorsForOperations({ operations: [dynamo.Operation.PUT_ITEM, dynamo.Operation.GET_ITEM] }); ``` ### SuccessfulRequestLatency Per the [documentation](https://docs.aws.amazon.com/amazondynamodb/latest/developerguide/metrics-dimensions.html) The `table.metricSuccessfulRequestLatency()` is always emitted with the `Operation` dimension, and our current implementation does not pass it. The fix requires user to pass the `Operation` dimension. So the API is: ```ts const getLatency = table.metricSuccessfulRequestLatency({ dimensions: { Operation: 'GetItem' }, }); ``` Fixes #11261 Fixes #11269 ---- *By submitting this pull request, I confirm that my contribution is made under the terms of the Apache-2.0 license*
aws · Nov 6, 2020 · 9968669 · 9968669
1 parent fa5ea36
commit 9968669
Show file tree

Hide file tree

Showing 4 changed files with 362 additions and 18 deletions.
diff --git a/packages/@aws-cdk/aws-cloudwatch/lib/alarm.ts b/packages/@aws-cdk/aws-cloudwatch/lib/alarm.ts
@@ -273,11 +273,18 @@ export class Alarm extends AlarmBase {
               };
             },
             withExpression(expr, conf) {
+
+              const hasSubmetrics = mathExprHasSubmetrics(expr);
+
+              if (hasSubmetrics) {
+                assertSubmetricsCount(expr);
+              }
+
               return {
                 expression: expr.expression,
                 id: entry.id || uniqueMetricId(),
                 label: conf.renderingProperties?.label,
-                period: mathExprHasSubmetrics(expr) ? undefined : expr.period,
+                period: hasSubmetrics ? undefined : expr.period,
                 returnData: entry.tag ? undefined : false, // Tag stores "primary" attribute, default is "true"
               };
             },
@@ -344,4 +351,11 @@ function mathExprHasSubmetrics(expr: MetricExpressionConfig) {
   return Object.keys(expr.usingMetrics).length > 0;
 }
 
+function assertSubmetricsCount(expr: MetricExpressionConfig) {
+  if (Object.keys(expr.usingMetrics).length > 10) {
+    // https://docs.aws.amazon.com/AmazonCloudWatch/latest/monitoring/AlarmThatSendsEmail.html#alarms-on-metric-math-expressions
+    throw new Error('Alarms on math expressions cannot contain more than 10 individual metrics');
+  };
+}
+
 type Writeable<T> = { -readonly [P in keyof T]: T[P] };
diff --git a/packages/@aws-cdk/aws-cloudwatch/test/test.alarm.ts b/packages/@aws-cdk/aws-cloudwatch/test/test.alarm.ts
@@ -2,14 +2,46 @@ import { ABSENT, expect, haveResource } from '@aws-cdk/assert';
 import { Duration, Stack } from '@aws-cdk/core';
 import { Construct } from 'constructs';
 import { Test } from 'nodeunit';
-import { Alarm, IAlarm, IAlarmAction, Metric } from '../lib';
+import { Alarm, IAlarm, IAlarmAction, Metric, MathExpression, IMetric } from '../lib';
 
 const testMetric = new Metric({
   namespace: 'CDK/Test',
   metricName: 'Metric',
 });
 
 export = {
+
+  'alarm does not accept a math expression with more than 10 metrics'(test: Test) {
+
+    const stack = new Stack();
+
+    const usingMetrics: Record<string, IMetric> = {};
+
+    for (const i of [...Array(15).keys()]) {
+      const metricName = `metric${i}`;
+      usingMetrics[metricName] = new Metric({
+        namespace: 'CDK/Test',
+        metricName: metricName,
+      });
+    }
+
+    const math = new MathExpression({
+      expression: 'a',
+      usingMetrics,
+    });
+
+    test.throws(() => {
+
+      new Alarm(stack, 'Alarm', {
+        metric: math,
+        threshold: 1000,
+        evaluationPeriods: 3,
+      });
+
+    }, /Alarms on math expressions cannot contain more than 10 individual metrics/);
+
+    test.done();
+  },
   'can make simple alarm'(test: Test) {
     // GIVEN
     const stack = new Stack();

diff --git a/packages/@aws-cdk/aws-dynamodb/lib/table.ts b/packages/@aws-cdk/aws-dynamodb/lib/table.ts
@@ -19,6 +19,54 @@ const RANGE_KEY_TYPE = 'RANGE';
 // https://docs.aws.amazon.com/amazondynamodb/latest/developerguide/Limits.html#limits-secondary-indexes
 const MAX_LOCAL_SECONDARY_INDEX_COUNT = 5;
 
+/**
+ * Options for configuring a system errors metric that considers multiple operations.
+ */
+export interface SystemErrorsForOperationsMetricOptions extends cloudwatch.MetricOptions {
+
+  /**
+   * The operations to apply the metric to.
+   *
+   * @default - All operations available by DynamoDB tables will be considered.
+   */
+  readonly operations?: Operation[];
+
+}
+
+/**
+ * Supported DynamoDB table operations.
+ */
+export enum Operation {
+
+  /** GetItem */
+  GET_ITEM = 'GetItem',
+
+  /** BatchGetItem */
+  BATCH_GET_ITEM = 'BatchGetItem',
+
+  /** Scan */
+  SCAN = 'Scan',
+
+  /** Query */
+  QUERY = 'Query',
+
+  /** GetRecords */
+  GET_RECORDS = 'GetRecords',
+
+  /** PutItem */
+  PUT_ITEM = 'PutItem',
+
+  /** DeleteItem */
+  DELETE_ITEM = 'DeleteItem',
+
+  /** UpdateItem */
+  UPDATE_ITEM = 'UpdateItem',
+
+  /** BatchWriteItem */
+  BATCH_WRITE_ITEM = 'BatchWriteItem',
+
+}
+
 /**
  * Represents an attribute for describing the key schema for the table
  * and indexes.
@@ -385,6 +433,8 @@ export interface ITable extends IResource {
    * Metric for the system errors
    *
    * @param props properties of a metric
+   *
+   * @deprecated use `metricSystemErrorsForOperations`
    */
   metricSystemErrors(props?: cloudwatch.MetricOptions): cloudwatch.Metric;
 
@@ -406,8 +456,10 @@ export interface ITable extends IResource {
    * Metric for the successful request latency
    *
    * @param props properties of a metric
+   *
    */
   metricSuccessfulRequestLatency(props?: cloudwatch.MetricOptions): cloudwatch.Metric;
+
 }
 
 /**
@@ -628,6 +680,9 @@ abstract class TableBase extends Resource implements ITable {
 
   /**
    * Return the given named metric for this Table
+   *
+   * By default, the metric will be calculated as a sum over a period of 5 minutes.
+   * You can customize this by using the `statistic` and `period` properties.
    */
   public metric(metricName: string, props?: cloudwatch.MetricOptions): cloudwatch.Metric {
     return new cloudwatch.Metric({
@@ -643,7 +698,8 @@ abstract class TableBase extends Resource implements ITable {
   /**
    * Metric for the consumed read capacity units this table
    *
-   * @default sum over a minute
+   * By default, the metric will be calculated as a sum over a period of 5 minutes.
+   * You can customize this by using the `statistic` and `period` properties.
    */
   public metricConsumedReadCapacityUnits(props?: cloudwatch.MetricOptions): cloudwatch.Metric {
     return this.metric('ConsumedReadCapacityUnits', { statistic: 'sum', ...props });
@@ -652,7 +708,8 @@ abstract class TableBase extends Resource implements ITable {
   /**
    * Metric for the consumed write capacity units this table
    *
-   * @default sum over a minute
+   * By default, the metric will be calculated as a sum over a period of 5 minutes.
+   * You can customize this by using the `statistic` and `period` properties.
    */
   public metricConsumedWriteCapacityUnits(props?: cloudwatch.MetricOptions): cloudwatch.Metric {
     return this.metric('ConsumedWriteCapacityUnits', { statistic: 'sum', ...props });
@@ -661,37 +718,145 @@ abstract class TableBase extends Resource implements ITable {
   /**
    * Metric for the system errors this table
    *
-   * @default sum over a minute
+   * @deprecated use `metricSystemErrorsForOperations`.
    */
   public metricSystemErrors(props?: cloudwatch.MetricOptions): cloudwatch.Metric {
-    return this.metric('SystemErrors', { statistic: 'sum', ...props });
+
+    if (!props?.dimensions?.Operation) {
+      // 'Operation' must be passed because its an operational metric.
+      throw new Error("'Operation' dimension must be passed for the 'SystemErrors' metric.");
+    }
+
+    const dimensions = {
+      TableName: this.tableName,
+      ...props?.dimensions ?? {},
+    };
+
+    return this.metric('SystemErrors', { statistic: 'sum', ...props, dimensions });
   }
 
   /**
-   * Metric for the user errors this table
+   * Metric for the user errors. Note that this metric reports user errors across all
+   * the tables in the account and region the table resides in.
    *
-   * @default sum over a minute
+   * By default, the metric will be calculated as a sum over a period of 5 minutes.
+   * You can customize this by using the `statistic` and `period` properties.
    */
   public metricUserErrors(props?: cloudwatch.MetricOptions): cloudwatch.Metric {
-    return this.metric('UserErrors', { statistic: 'sum', ...props });
+
+    if (props?.dimensions) {
+      throw new Error("'dimensions' is not supported for the 'UserErrors' metric");
+    }
+
+    // overriding 'dimensions' here because this metric is an account metric.
+    // see 'UserErrors' in https://docs.aws.amazon.com/amazondynamodb/latest/developerguide/metrics-dimensions.html
+    return this.metric('UserErrors', { statistic: 'sum', ...props, dimensions: {} });
   }
 
   /**
    * Metric for the conditional check failed requests this table
    *
-   * @default sum over a minute
+   * By default, the metric will be calculated as a sum over a period of 5 minutes.
+   * You can customize this by using the `statistic` and `period` properties.
    */
   public metricConditionalCheckFailedRequests(props?: cloudwatch.MetricOptions): cloudwatch.Metric {
     return this.metric('ConditionalCheckFailedRequests', { statistic: 'sum', ...props });
   }
 
   /**
-   * Metric for the successful request latency this table
+   * Metric for the successful request latency this table.
+   *
+   * By default, the metric will be calculated as an average over a period of 5 minutes.
+   * You can customize this by using the `statistic` and `period` properties.
    *
-   * @default avg over a minute
    */
   public metricSuccessfulRequestLatency(props?: cloudwatch.MetricOptions): cloudwatch.Metric {
-    return this.metric('SuccessfulRequestLatency', { statistic: 'avg', ...props });
+
+    if (!props?.dimensions?.Operation) {
+      throw new Error("'Operation' dimension must be passed for the 'SuccessfulRequestLatency' metric.");
+    }
+
+    const dimensions = {
+      TableName: this.tableName,
+      ...props?.dimensions ?? {},
+    };
+
+    return this.metric('SuccessfulRequestLatency', { statistic: 'avg', ...props, dimensions });
+  }
+
+  /**
+   * Metric for the system errors this table.
+   *
+   * This will sum errors across all possible operations.
+   * Note that by default, each individual metric will be calculated as a sum over a period of 5 minutes.
+   * You can customize this by using the `statistic` and `period` properties.
+   */
+  public metricSystemErrorsForOperations(props?: SystemErrorsForOperationsMetricOptions): cloudwatch.IMetric {
+
+    if (props?.dimensions?.Operation) {
+      throw new Error("The Operation dimension is not supported. Use the 'operations' property.");
+    }
+
+    const operations = props?.operations ?? Object.values(Operation);
+
+    const values = this.createMetricsForOperations('SystemErrors', operations, { statistic: 'sum', ...props });
+
+    const sum = new cloudwatch.MathExpression({
+      expression: `${Object.keys(values).join(' + ')}`,
+      usingMetrics: { ...values },
+      color: props?.color,
+      label: 'Sum of errors across all operations',
+      period: props?.period,
+    });
+
+    return sum;
+  }
+
+  /**
+   * Create a map of metrics that can be used in a math expression.
+   *
+   * Using the return value of this function as the `usingMetrics` property in `cloudwatch.MathExpression` allows you to
+   * use the keys of this map as metric names inside you expression.
+   *
+   * @param metricName The metric name.
+   * @param operations The list of operations to create metrics for.
+   * @param props Properties for the individual metrics.
+   * @param metricNameMapper Mapper function to allow controlling the individual metric name per operation.
+   */
+  private createMetricsForOperations(metricName: string, operations: Operation[],
+    props?: cloudwatch.MetricOptions, metricNameMapper?: (op: Operation) => string): Record<string, cloudwatch.IMetric> {
+
+    const metrics: Record<string, cloudwatch.IMetric> = {};
+
+    const mapper = metricNameMapper ?? (op => op.toLowerCase());
+
+    if (props?.dimensions?.Operation) {
+      throw new Error('Invalid properties. Operation dimension is not supported when calculating operational metrics');
+    }
+
+    for (const operation of operations) {
+
+      const metric = this.metric(metricName, {
+        ...props,
+        dimensions: {
+          TableName: this.tableName,
+          Operation: operation,
+          ...props?.dimensions,
+        },
+      });
+
+      const operationMetricName = mapper(operation);
+      const firstChar = operationMetricName.charAt(0);
+
+      if (firstChar === firstChar.toUpperCase()) {
+        // https://docs.aws.amazon.com/AmazonCloudWatch/latest/monitoring/using-metric-math.html#metric-math-syntax
+        throw new Error(`Mapper generated an illegal operation metric name: ${operationMetricName}. Must start with a lowercase letter`);
+      }
+
+      metrics[operationMetricName] = metric;
+    }
+
+    return metrics;
   }
 
   protected abstract get hasIndex(): boolean;