From cff1e5f6a5013d40b4f1c094d53aa64d20f33aa1 Mon Sep 17 00:00:00 2001 From: Kenzie Davisson <43759233+kenzieschmoll@users.noreply.github.com> Date: Wed, 2 Oct 2024 15:39:47 -0700 Subject: [PATCH] Add percentile computations to benchmark scores (#7760) This adds p50, p90, and p95 computations to each benchmark metric. This PR also refactors the benchmark computation definitions so they are defined by an enum. Fixes https://github.com/flutter/flutter/issues/151551. --- packages/web_benchmarks/CHANGELOG.md | 3 +- packages/web_benchmarks/lib/client.dart | 2 + .../web_benchmarks/lib/src/computations.dart | 351 ++++++++++++++++++ packages/web_benchmarks/lib/src/metrics.dart | 83 +++++ packages/web_benchmarks/lib/src/recorder.dart | 304 +-------------- packages/web_benchmarks/pubspec.yaml | 2 +- .../test/src/computations_test.dart | 24 ++ .../benchmark/web_benchmarks_test.dart | 21 +- 8 files changed, 485 insertions(+), 305 deletions(-) create mode 100644 packages/web_benchmarks/lib/src/computations.dart create mode 100644 packages/web_benchmarks/test/src/computations_test.dart diff --git a/packages/web_benchmarks/CHANGELOG.md b/packages/web_benchmarks/CHANGELOG.md index 7a5b8d5366da..0096da891242 100644 --- a/packages/web_benchmarks/CHANGELOG.md +++ b/packages/web_benchmarks/CHANGELOG.md @@ -1,8 +1,9 @@ -## 3.1.0-wip +## 3.1.0 * Add `flutter_frame.total_time`, `flutter_frame.build_time`, and `flutter_frame.raster_time` metrics to benchmark results. These values are derived from the Flutter `FrameTiming` API. * Expose a new library `metrics.dart` that contains definitions for the benchmark metrics. +* Add p50, p90, and p95 metrics for benchmark scores. ## 3.0.0 diff --git a/packages/web_benchmarks/lib/client.dart b/packages/web_benchmarks/lib/client.dart index 357d2bb8f196..fc427473e0dd 100644 --- a/packages/web_benchmarks/lib/client.dart +++ b/packages/web_benchmarks/lib/client.dart @@ -10,8 +10,10 @@ import 'dart:math' as math; import 'package:web/web.dart'; import 'src/common.dart'; +import 'src/computations.dart'; import 'src/recorder.dart'; +export 'src/computations.dart'; export 'src/recorder.dart'; /// Signature for a function that creates a [Recorder]. diff --git a/packages/web_benchmarks/lib/src/computations.dart b/packages/web_benchmarks/lib/src/computations.dart new file mode 100644 index 000000000000..99ffcfbc38da --- /dev/null +++ b/packages/web_benchmarks/lib/src/computations.dart @@ -0,0 +1,351 @@ +// Copyright 2013 The Flutter Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +import 'dart:math' as math; + +import 'package:collection/collection.dart'; +import 'package:meta/meta.dart'; + +import 'common.dart'; +import 'metrics.dart'; + +/// Series of time recordings indexed in time order. +/// +/// It can calculate [average], [standardDeviation] and [noise]. If the amount +/// of data collected is higher than [_kMeasuredSampleCount], then these +/// calculations will only apply to the latest [_kMeasuredSampleCount] data +/// points. +class Timeseries { + /// Creates an empty timeseries. + /// + /// [name], [isReported], and [useCustomWarmUp] must not be null. + Timeseries(this.name, this.isReported, {this.useCustomWarmUp = false}) + : _warmUpFrameCount = useCustomWarmUp ? 0 : null; + + /// The label of this timeseries used for debugging and result inspection. + final String name; + + /// Whether this timeseries is reported to the benchmark dashboard. 
+ /// + /// If `true` a new benchmark card is created for the timeseries and is + /// visible on the dashboard. + /// + /// If `false` the data is stored but it does not show up on the dashboard. + /// Use unreported metrics for metrics that are useful for manual inspection + /// but that are too fine-grained to be useful for tracking on the dashboard. + final bool isReported; + + /// Whether to delimit warm-up frames in a custom way. + final bool useCustomWarmUp; + + /// The number of frames ignored as warm-up frames, used only + /// when [useCustomWarmUp] is true. + int? _warmUpFrameCount; + + /// The number of frames ignored as warm-up frames. + int get warmUpFrameCount => + useCustomWarmUp ? _warmUpFrameCount! : count - kMeasuredSampleCount; + + /// List of all the values that have been recorded. + /// + /// This list has no limit. + final List _allValues = []; + + /// The total amount of data collected, including ones that were dropped + /// because of the sample size limit. + int get count => _allValues.length; + + /// Extracts useful statistics out of this timeseries. + /// + /// See [TimeseriesStats] for more details. + TimeseriesStats computeStats() { + final int finalWarmUpFrameCount = warmUpFrameCount; + + assert(finalWarmUpFrameCount >= 0 && finalWarmUpFrameCount < count); + + // The first few values we simply discard and never look at. They're from the warm-up phase. + final List warmUpValues = + _allValues.sublist(0, finalWarmUpFrameCount); + + // Values we analyze. + final List candidateValues = + _allValues.sublist(finalWarmUpFrameCount); + + // The average that includes outliers. + final double dirtyAverage = _computeAverage(name, candidateValues); + + // The standard deviation that includes outliers. + final double dirtyStandardDeviation = + _computeStandardDeviationForPopulation(name, candidateValues); + + // Any value that's higher than this is considered an outlier. + final double outlierCutOff = dirtyAverage + dirtyStandardDeviation; + + // Candidates with outliers removed. + final Iterable cleanValues = + candidateValues.where((double value) => value <= outlierCutOff); + + // Outlier candidates. + final Iterable outliers = + candidateValues.where((double value) => value > outlierCutOff); + + // Final statistics. + final double cleanAverage = _computeAverage(name, cleanValues); + final double standardDeviation = + _computeStandardDeviationForPopulation(name, cleanValues); + final double noise = + cleanAverage > 0.0 ? standardDeviation / cleanAverage : 0.0; + + // Compute outlier average. If there are no outliers the outlier average is + // the same as clean value average. In other words, in a perfect benchmark + // with no noise the difference between average and outlier average is zero, + // which the best possible outcome. Noise produces a positive difference + // between the two. + final double outlierAverage = + outliers.isNotEmpty ? _computeAverage(name, outliers) : cleanAverage; + + // Compute percentile values (e.g. p50, p90, p95). 
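+    // Percentiles are computed over the post-warm-up candidate values with
+    // outliers included, so high percentiles such as p90 and p95 reflect tail
+    // frame times that the outlier-trimmed average does not capture.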
+ final Map percentiles = computePercentiles( + name, + PercentileMetricComputation.percentilesAsDoubles, + candidateValues, + ); + + final List annotatedValues = [ + for (final double warmUpValue in warmUpValues) + AnnotatedSample( + magnitude: warmUpValue, + isOutlier: warmUpValue > outlierCutOff, + isWarmUpValue: true, + ), + for (final double candidate in candidateValues) + AnnotatedSample( + magnitude: candidate, + isOutlier: candidate > outlierCutOff, + isWarmUpValue: false, + ), + ]; + + return TimeseriesStats( + name: name, + average: cleanAverage, + outlierCutOff: outlierCutOff, + outlierAverage: outlierAverage, + standardDeviation: standardDeviation, + noise: noise, + percentiles: percentiles, + cleanSampleCount: cleanValues.length, + outlierSampleCount: outliers.length, + samples: annotatedValues, + ); + } + + /// Adds a value to this timeseries. + void add(double value, {required bool isWarmUpValue}) { + if (value < 0.0) { + throw StateError( + 'Timeseries $name: negative metric values are not supported. Got: $value', + ); + } + _allValues.add(value); + if (useCustomWarmUp && isWarmUpValue) { + _warmUpFrameCount = warmUpFrameCount + 1; + } + } +} + +/// Various statistics about a [Timeseries]. +/// +/// See the docs on the individual fields for more details. +@sealed +class TimeseriesStats { + /// Creates statistics for a time series. + const TimeseriesStats({ + required this.name, + required this.average, + required this.outlierCutOff, + required this.outlierAverage, + required this.standardDeviation, + required this.noise, + required this.percentiles, + required this.cleanSampleCount, + required this.outlierSampleCount, + required this.samples, + }); + + /// The label used to refer to the corresponding timeseries. + final String name; + + /// The average value of the measured samples without outliers. + final double average; + + /// The standard deviation in the measured samples without outliers. + final double standardDeviation; + + /// The noise as a multiple of the [average] value taken from clean samples. + /// + /// This value can be multiplied by 100.0 to get noise as a percentage of + /// the average. + /// + /// If [average] is zero, treats the result as perfect score, returns zero. + final double noise; + + /// The percentile values (p50, p90, p95, etc.) for the measured samples with + /// outliers. + /// + /// This [Map] is from percentile targets (e.g. 0.50 for p50, 0.90 for p90, + /// etc.) to the computed value for the [samples]. + final Map percentiles; + + /// The maximum value a sample can have without being considered an outlier. + /// + /// See [Timeseries.computeStats] for details on how this value is computed. + final double outlierCutOff; + + /// The average of outlier samples. + /// + /// This value can be used to judge how badly we jank, when we jank. + /// + /// Another useful metrics is the difference between [outlierAverage] and + /// [average]. The smaller the value the more predictable is the performance + /// of the corresponding benchmark. + final double outlierAverage; + + /// The number of measured samples after outlier are removed. + final int cleanSampleCount; + + /// The number of outliers. + final int outlierSampleCount; + + /// All collected samples, annotated with statistical information. + /// + /// See [AnnotatedSample] for more details. + final List samples; + + /// Outlier average divided by clean average. + /// + /// This is a measure of performance consistency. The higher this number the + /// worse is jank when it happens. 
Smaller is better, with 1.0 being the
+  /// perfect score. If [average] is zero, this value defaults to 1.0.
+  double get outlierRatio => average > 0.0
+      ? outlierAverage / average
+      : 1.0; // this can only happen in perfect benchmark that reports only zeros
+
+  @override
+  String toString() {
+    final StringBuffer buffer = StringBuffer();
+    buffer.writeln(
+      '$name: (samples: $cleanSampleCount clean/$outlierSampleCount '
+      'outliers/${cleanSampleCount + outlierSampleCount} '
+      'measured/${samples.length} total)',
+    );
+    buffer.writeln(' | average: $average μs');
+    buffer.writeln(' | outlier average: $outlierAverage μs');
+    buffer.writeln(' | outlier/clean ratio: ${outlierRatio}x');
+    buffer.writeln(' | noise: ${_ratioToPercent(noise)}');
+    for (final PercentileMetricComputation metric
+        in PercentileMetricComputation.values) {
+      buffer.writeln(
+          ' | ${metric.name}: ${percentiles[metric.percentile]} μs');
+    }
+    return buffer.toString();
+  }
+}
+
+/// Annotates a single measurement with statistical information.
+@sealed
+class AnnotatedSample {
+  /// Creates an annotated measurement sample.
+  const AnnotatedSample({
+    required this.magnitude,
+    required this.isOutlier,
+    required this.isWarmUpValue,
+  });
+
+  /// The non-negative raw result of the measurement.
+  final double magnitude;
+
+  /// Whether this sample was considered an outlier.
+  final bool isOutlier;
+
+  /// Whether this sample was taken during the warm-up phase.
+  ///
+  /// If this value is `true`, this sample does not participate in
+  /// statistical computations. However, the sample would still be
+  /// shown in the visualization of results so that the benchmark
+  /// can be inspected manually to make sure there's a predictable
+  /// warm-up regression slope.
+  final bool isWarmUpValue;
+}
+
+/// Computes the arithmetic mean (or average) of given [values].
+double _computeAverage(String label, Iterable<double> values) {
+  if (values.isEmpty) {
+    throw StateError(
+        '$label: attempted to compute an average of an empty value list.');
+  }
+
+  final double sum = values.reduce((double a, double b) => a + b);
+  return sum / values.length;
+}
+
+/// Computes population standard deviation.
+///
+/// Unlike sample standard deviation, which divides by N - 1, this divides by N.
+///
+/// See also:
+///
+/// * https://en.wikipedia.org/wiki/Standard_deviation
+double _computeStandardDeviationForPopulation(
+    String label, Iterable<double> population) {
+  if (population.isEmpty) {
+    throw StateError(
+        '$label: attempted to compute the standard deviation of empty population.');
+  }
+  final double mean = _computeAverage(label, population);
+  final double sumOfSquaredDeltas = population.fold(
+    0.0,
+    (double previous, double value) => previous += math.pow(value - mean, 2),
+  );
+  return math.sqrt(sumOfSquaredDeltas / population.length);
+}
+
+String _ratioToPercent(double value) {
+  return '${(value * 100).toStringAsFixed(2)}%';
+}
+
+/// Computes the percentile threshold in [values] for the given [percentiles].
+///
+/// Each value in [percentiles] should be between 0.0 and 1.0.
+///
+/// Returns a [Map] of percentile values to the computed value from [values].
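+///
+/// For example, computing the percentiles `[0.5, 0.95]` over 100 samples
+/// returns a map from 0.5 to the sample at the 50th percentile and from 0.95
+/// to the sample at the 95th percentile.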
+Map computePercentiles( + String label, + List percentiles, + Iterable values, +) { + if (values.isEmpty) { + throw StateError( + '$label: attempted to compute a percentile of an empty value list.', + ); + } + for (final double percentile in percentiles) { + if (percentile < 0.0 || percentile > 1.0) { + throw StateError( + '$label: attempted to compute a percentile for an invalid ' + 'value: $percentile', + ); + } + } + + final List sorted = + values.sorted((double a, double b) => a.compareTo(b)); + final Map computed = {}; + for (final double percentile in percentiles) { + final int percentileIndex = + (sorted.length * percentile).round().clamp(0, sorted.length - 1); + computed[percentile] = sorted[percentileIndex]; + } + + return computed; +} diff --git a/packages/web_benchmarks/lib/src/metrics.dart b/packages/web_benchmarks/lib/src/metrics.dart index e80692ee58b2..5145d73baffb 100644 --- a/packages/web_benchmarks/lib/src/metrics.dart +++ b/packages/web_benchmarks/lib/src/metrics.dart @@ -50,6 +50,89 @@ enum BenchmarkMetric { /// from the Blink trace summary. const String totalUiFrameAverage = 'totalUiFrame.average'; +/// Describes the values computed for each [BenchmarkMetric]. +sealed class BenchmarkMetricComputation { + const BenchmarkMetricComputation(this.name); + + /// The name of each metric computation. + final String name; + + /// The name for the computed value tracking the average value of the measured + /// samples without outliers. + static const NamedMetricComputation average = + NamedMetricComputation._('average'); + + /// The name for the computed value tracking the average of outlier samples. + static const NamedMetricComputation outlierAverage = + NamedMetricComputation._('outlierAverage'); + + /// The name for the computed value tracking the outlier average divided by + /// the clean average. + static const NamedMetricComputation outlierRatio = + NamedMetricComputation._('outlierRatio'); + + /// The name for the computed value tracking the noise as a multiple of the + /// [average] value takes from clean samples. + static const NamedMetricComputation noise = NamedMetricComputation._('noise'); + + /// The name for the computed value tracking the 50th percentile value from + /// the samples with outliers. + static const PercentileMetricComputation p50 = + PercentileMetricComputation._('p50', 0.5); + + /// The name for the computed value tracking the 90th percentile value from + /// the samples with outliers. + static const PercentileMetricComputation p90 = + PercentileMetricComputation._('p90', 0.9); + + /// The name for the computed value tracking the 95th percentile value from + /// the samples with outliers. + static const PercentileMetricComputation p95 = + PercentileMetricComputation._('p95', 0.95); + + /// All of the computed vales for each [BenchmarkMetric]. + static const List values = + [ + average, + outlierAverage, + outlierRatio, + noise, + p50, + p90, + p95, + ]; +} + +/// A [BenchmarkMetricComputation] with a descriptive name. +final class NamedMetricComputation extends BenchmarkMetricComputation { + const NamedMetricComputation._(super.name); +} + +/// A [BenchmarkMetricComputation] describing a percentile (p50, p90, etc.). +final class PercentileMetricComputation extends BenchmarkMetricComputation { + const PercentileMetricComputation._(super.name, this.percentile) + : assert(percentile >= 0.0 && percentile <= 1.0); + + /// The percentile value as a double. + /// + /// This value must be between 0.0 and 1.0. 
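+  ///
+  /// For example, [p90] has a [percentile] of 0.9, and its computed value is
+  /// the 90th-percentile sample of the measured values.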
+ final double percentile; + + /// The percentile [BenchmarkMetricComputation]s computed for each benchmark + /// metric. + static const List values = + [ + BenchmarkMetricComputation.p50, + BenchmarkMetricComputation.p90, + BenchmarkMetricComputation.p95, + ]; + + /// The percentile values as doubles computed for each benchmark metric. + static List percentilesAsDoubles = PercentileMetricComputation.values + .map((PercentileMetricComputation value) => value.percentile) + .toList(); +} + /// The list of expected benchmark metrics for the current compilation mode, as /// determined by the value of [useWasm]. List expectedBenchmarkMetrics({required bool useWasm}) { diff --git a/packages/web_benchmarks/lib/src/recorder.dart b/packages/web_benchmarks/lib/src/recorder.dart index 82a490c942a2..eea86b307319 100644 --- a/packages/web_benchmarks/lib/src/recorder.dart +++ b/packages/web_benchmarks/lib/src/recorder.dart @@ -4,7 +4,6 @@ import 'dart:async'; import 'dart:js_interop'; -import 'dart:math' as math; import 'dart:ui'; import 'dart:ui_web' as ui_web; @@ -18,6 +17,7 @@ import 'package:meta/meta.dart'; import 'package:web/web.dart' as html; import 'common.dart'; +import 'computations.dart'; import 'metrics.dart'; /// The number of samples from warm-up iterations. @@ -597,254 +597,6 @@ class _WidgetBuildRecorderHostState extends State<_WidgetBuildRecorderHost> { } } -/// Series of time recordings indexed in time order. -/// -/// It can calculate [average], [standardDeviation] and [noise]. If the amount -/// of data collected is higher than [_kMeasuredSampleCount], then these -/// calculations will only apply to the latest [_kMeasuredSampleCount] data -/// points. -class Timeseries { - /// Creates an empty timeseries. - /// - /// [name], [isReported], and [useCustomWarmUp] must not be null. - Timeseries(this.name, this.isReported, {this.useCustomWarmUp = false}) - : _warmUpFrameCount = useCustomWarmUp ? 0 : null; - - /// The label of this timeseries used for debugging and result inspection. - final String name; - - /// Whether this timeseries is reported to the benchmark dashboard. - /// - /// If `true` a new benchmark card is created for the timeseries and is - /// visible on the dashboard. - /// - /// If `false` the data is stored but it does not show up on the dashboard. - /// Use unreported metrics for metrics that are useful for manual inspection - /// but that are too fine-grained to be useful for tracking on the dashboard. - final bool isReported; - - /// Whether to delimit warm-up frames in a custom way. - final bool useCustomWarmUp; - - /// The number of frames ignored as warm-up frames, used only - /// when [useCustomWarmUp] is true. - int? _warmUpFrameCount; - - /// The number of frames ignored as warm-up frames. - int get warmUpFrameCount => - useCustomWarmUp ? _warmUpFrameCount! : count - kMeasuredSampleCount; - - /// List of all the values that have been recorded. - /// - /// This list has no limit. - final List _allValues = []; - - /// The total amount of data collected, including ones that were dropped - /// because of the sample size limit. - int get count => _allValues.length; - - /// Extracts useful statistics out of this timeseries. - /// - /// See [TimeseriesStats] for more details. - TimeseriesStats computeStats() { - final int finalWarmUpFrameCount = warmUpFrameCount; - - assert(finalWarmUpFrameCount >= 0 && finalWarmUpFrameCount < count); - - // The first few values we simply discard and never look at. They're from the warm-up phase. 
- final List warmUpValues = - _allValues.sublist(0, finalWarmUpFrameCount); - - // Values we analyze. - final List candidateValues = - _allValues.sublist(finalWarmUpFrameCount); - - // The average that includes outliers. - final double dirtyAverage = _computeAverage(name, candidateValues); - - // The standard deviation that includes outliers. - final double dirtyStandardDeviation = - _computeStandardDeviationForPopulation(name, candidateValues); - - // Any value that's higher than this is considered an outlier. - final double outlierCutOff = dirtyAverage + dirtyStandardDeviation; - - // Candidates with outliers removed. - final Iterable cleanValues = - candidateValues.where((double value) => value <= outlierCutOff); - - // Outlier candidates. - final Iterable outliers = - candidateValues.where((double value) => value > outlierCutOff); - - // Final statistics. - final double cleanAverage = _computeAverage(name, cleanValues); - final double standardDeviation = - _computeStandardDeviationForPopulation(name, cleanValues); - final double noise = - cleanAverage > 0.0 ? standardDeviation / cleanAverage : 0.0; - - // Compute outlier average. If there are no outliers the outlier average is - // the same as clean value average. In other words, in a perfect benchmark - // with no noise the difference between average and outlier average is zero, - // which the best possible outcome. Noise produces a positive difference - // between the two. - final double outlierAverage = - outliers.isNotEmpty ? _computeAverage(name, outliers) : cleanAverage; - - final List annotatedValues = [ - for (final double warmUpValue in warmUpValues) - AnnotatedSample( - magnitude: warmUpValue, - isOutlier: warmUpValue > outlierCutOff, - isWarmUpValue: true, - ), - for (final double candidate in candidateValues) - AnnotatedSample( - magnitude: candidate, - isOutlier: candidate > outlierCutOff, - isWarmUpValue: false, - ), - ]; - - return TimeseriesStats( - name: name, - average: cleanAverage, - outlierCutOff: outlierCutOff, - outlierAverage: outlierAverage, - standardDeviation: standardDeviation, - noise: noise, - cleanSampleCount: cleanValues.length, - outlierSampleCount: outliers.length, - samples: annotatedValues, - ); - } - - /// Adds a value to this timeseries. - void add(double value, {required bool isWarmUpValue}) { - if (value < 0.0) { - throw StateError( - 'Timeseries $name: negative metric values are not supported. Got: $value', - ); - } - _allValues.add(value); - if (useCustomWarmUp && isWarmUpValue) { - _warmUpFrameCount = warmUpFrameCount + 1; - } - } -} - -/// Various statistics about a [Timeseries]. -/// -/// See the docs on the individual fields for more details. -@sealed -class TimeseriesStats { - /// Creates statistics for a time series. - const TimeseriesStats({ - required this.name, - required this.average, - required this.outlierCutOff, - required this.outlierAverage, - required this.standardDeviation, - required this.noise, - required this.cleanSampleCount, - required this.outlierSampleCount, - required this.samples, - }); - - /// The label used to refer to the corresponding timeseries. - final String name; - - /// The average value of the measured samples without outliers. - final double average; - - /// The standard deviation in the measured samples without outliers. - final double standardDeviation; - - /// The noise as a multiple of the [average] value takes from clean samples. - /// - /// This value can be multiplied by 100.0 to get noise as a percentage of - /// the average. 
- /// - /// If [average] is zero, treats the result as perfect score, returns zero. - final double noise; - - /// The maximum value a sample can have without being considered an outlier. - /// - /// See [Timeseries.computeStats] for details on how this value is computed. - final double outlierCutOff; - - /// The average of outlier samples. - /// - /// This value can be used to judge how badly we jank, when we jank. - /// - /// Another useful metrics is the difference between [outlierAverage] and - /// [average]. The smaller the value the more predictable is the performance - /// of the corresponding benchmark. - final double outlierAverage; - - /// The number of measured samples after outlier are removed. - final int cleanSampleCount; - - /// The number of outliers. - final int outlierSampleCount; - - /// All collected samples, annotated with statistical information. - /// - /// See [AnnotatedSample] for more details. - final List samples; - - /// Outlier average divided by clean average. - /// - /// This is a measure of performance consistency. The higher this number the - /// worse is jank when it happens. Smaller is better, with 1.0 being the - /// perfect score. If [average] is zero, this value defaults to 1.0. - double get outlierRatio => average > 0.0 - ? outlierAverage / average - : 1.0; // this can only happen in perfect benchmark that reports only zeros - - @override - String toString() { - final StringBuffer buffer = StringBuffer(); - buffer.writeln( - '$name: (samples: $cleanSampleCount clean/$outlierSampleCount ' - 'outliers/${cleanSampleCount + outlierSampleCount} ' - 'measured/${samples.length} total)', - ); - buffer.writeln(' | average: $average μs'); - buffer.writeln(' | outlier average: $outlierAverage μs'); - buffer.writeln(' | outlier/clean ratio: ${outlierRatio}x'); - buffer.writeln(' | noise: ${_ratioToPercent(noise)}'); - return buffer.toString(); - } -} - -/// Annotates a single measurement with statistical information. -@sealed -class AnnotatedSample { - /// Creates an annotated measurement sample. - const AnnotatedSample({ - required this.magnitude, - required this.isOutlier, - required this.isWarmUpValue, - }); - - /// The non-negative raw result of the measurement. - final double magnitude; - - /// Whether this sample was considered an outlier. - final bool isOutlier; - - /// Whether this sample was taken during the warm-up phase. - /// - /// If this value is `true`, this sample does not participate in - /// statistical computations. However, the sample would still be - /// shown in the visualization of results so that the benchmark - /// can be inspected manually to make sure there's a predictable - /// warm-up regression slope. - final bool isWarmUpValue; -} - /// Base class for a profile collected from running a benchmark. class Profile { /// Creates an empty profile. @@ -942,18 +694,24 @@ class Profile { final Timeseries timeseries = scoreData[key]!; if (timeseries.isReported) { - scoreKeys.add('$key.average'); + scoreKeys.add('$key.${BenchmarkMetricComputation.average.name}'); // Report `outlierRatio` rather than `outlierAverage`, because // the absolute value of outliers is less interesting than the // ratio. 
- scoreKeys.add('$key.outlierRatio'); + scoreKeys.add('$key.${BenchmarkMetricComputation.outlierRatio.name}'); } final TimeseriesStats stats = timeseries.computeStats(); - json['$key.average'] = stats.average; - json['$key.outlierAverage'] = stats.outlierAverage; - json['$key.outlierRatio'] = stats.outlierRatio; - json['$key.noise'] = stats.noise; + json['$key.${BenchmarkMetricComputation.average.name}'] = stats.average; + json['$key.${BenchmarkMetricComputation.outlierAverage.name}'] = + stats.outlierAverage; + json['$key.${BenchmarkMetricComputation.outlierRatio.name}'] = + stats.outlierRatio; + json['$key.${BenchmarkMetricComputation.noise.name}'] = stats.noise; + for (final PercentileMetricComputation metric + in PercentileMetricComputation.values) { + json['$key.${metric.name}'] = stats.percentiles[metric.percentile]; + } } json.addAll(extraData); @@ -985,42 +743,6 @@ class Profile { } } -/// Computes the arithmetic mean (or average) of given [values]. -double _computeAverage(String label, Iterable values) { - if (values.isEmpty) { - throw StateError( - '$label: attempted to compute an average of an empty value list.'); - } - - final double sum = values.reduce((double a, double b) => a + b); - return sum / values.length; -} - -/// Computes population standard deviation. -/// -/// Unlike sample standard deviation, which divides by N - 1, this divides by N. -/// -/// See also: -/// -/// * https://en.wikipedia.org/wiki/Standard_deviation -double _computeStandardDeviationForPopulation( - String label, Iterable population) { - if (population.isEmpty) { - throw StateError( - '$label: attempted to compute the standard deviation of empty population.'); - } - final double mean = _computeAverage(label, population); - final double sumOfSquaredDeltas = population.fold( - 0.0, - (double previous, double value) => previous += math.pow(value - mean, 2), - ); - return math.sqrt(sumOfSquaredDeltas / population.length); -} - -String _ratioToPercent(double value) { - return '${(value * 100).toStringAsFixed(2)}%'; -} - /// Implemented by recorders that use [_RecordingWidgetsBinding] to receive /// frame life-cycle calls. abstract class FrameRecorder { diff --git a/packages/web_benchmarks/pubspec.yaml b/packages/web_benchmarks/pubspec.yaml index 54ca82807e41..1ee5ad4aebb1 100644 --- a/packages/web_benchmarks/pubspec.yaml +++ b/packages/web_benchmarks/pubspec.yaml @@ -2,7 +2,7 @@ name: web_benchmarks description: A benchmark harness for performance-testing Flutter apps in Chrome. repository: https://github.com/flutter/packages/tree/main/packages/web_benchmarks issue_tracker: https://github.com/flutter/flutter/issues?q=is%3Aissue+is%3Aopen+label%3A%22p%3A+web_benchmarks%22 -version: 3.1.0-wip +version: 3.1.0 environment: sdk: ^3.3.0 diff --git a/packages/web_benchmarks/test/src/computations_test.dart b/packages/web_benchmarks/test/src/computations_test.dart new file mode 100644 index 000000000000..94fe44079431 --- /dev/null +++ b/packages/web_benchmarks/test/src/computations_test.dart @@ -0,0 +1,24 @@ +// Copyright 2013 The Flutter Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +import 'package:flutter_test/flutter_test.dart'; +import 'package:web_benchmarks/src/computations.dart'; + +void main() { + group('computations', () { + test('computePercentiles', () { + final Map computed = computePercentiles( + 'test', + [0.0, 0.5, 0.9, 0.95, 1.0], + List.generate(100, (int i) => i.toDouble()), + ); + expect(computed.length, 5); + expect(computed[0.0], 0.0); + expect(computed[0.5], 50.0); + expect(computed[0.9], 90.0); + expect(computed[0.95], 95.0); + expect(computed[1.0], 99.0); + }); + }); +} diff --git a/packages/web_benchmarks/testing/test_app/benchmark/web_benchmarks_test.dart b/packages/web_benchmarks/testing/test_app/benchmark/web_benchmarks_test.dart index a6900a126d31..6a2990846478 100644 --- a/packages/web_benchmarks/testing/test_app/benchmark/web_benchmarks_test.dart +++ b/packages/web_benchmarks/testing/test_app/benchmark/web_benchmarks_test.dart @@ -98,22 +98,19 @@ Future _runBenchmarks({ for (final String benchmarkName in benchmarkNames) { for (final String metricName in expectedMetrics) { - for (final String valueName in [ - 'average', - 'outlierAverage', - 'outlierRatio', - 'noise', - ]) { + for (final BenchmarkMetricComputation computation + in BenchmarkMetricComputation.values) { expect( - taskResult.scores[benchmarkName]!.where((BenchmarkScore score) => - score.metric == '$metricName.$valueName'), - hasLength(1), - ); + taskResult.scores[benchmarkName]!.where((BenchmarkScore score) => + score.metric == '$metricName.${computation.name}'), + hasLength(1), + reason: 'Expected to find a metric named ' + '$metricName.${computation.name}'); } } expect( - taskResult.scores[benchmarkName]!.where( - (BenchmarkScore score) => score.metric == 'totalUiFrame.average'), + taskResult.scores[benchmarkName]! + .where((BenchmarkScore score) => score.metric == totalUiFrameAverage), hasLength(1), ); }
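For consumers of these new scores, the lookup pattern used in the test above generalizes: every computed value is published under a `<metric name>.<computation name>` key (for example `drawFrameDuration.p90`), both in the profile JSON and in the `BenchmarkScore` list returned by the benchmark server. The sketch below is illustrative only and is not part of this patch; it assumes `BenchmarkResults` and `BenchmarkScore` are available from `package:web_benchmarks/server.dart` and `BenchmarkMetricComputation` from the `metrics.dart` library introduced in 3.1.0. The helper name `readP90` and the example metric string are invented for the example.

import 'package:web_benchmarks/metrics.dart';
import 'package:web_benchmarks/server.dart';

/// Reads the p90 score reported for [metricName] within [benchmarkName],
/// or returns null if no such score was recorded.
double? readP90(
  BenchmarkResults results,
  String benchmarkName,
  String metricName, // e.g. 'drawFrameDuration' (illustrative).
) {
  // Computed values are keyed as '<metric>.<computation>', e.g. '<metric>.p90'.
  final String key = '$metricName.${BenchmarkMetricComputation.p90.name}';
  for (final BenchmarkScore score
      in results.scores[benchmarkName] ?? const <BenchmarkScore>[]) {
    if (score.metric == key) {
      return score.value.toDouble();
    }
  }
  return null;
}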