From cff1e5f6a5013d40b4f1c094d53aa64d20f33aa1 Mon Sep 17 00:00:00 2001 From: Kenzie Davisson <43759233+kenzieschmoll@users.noreply.github.com> Date: Wed, 2 Oct 2024 15:39:47 -0700 Subject: [PATCH] Add percentile computations to benchmark scores (#7760) This adds p50, p90, and p95 computations to each benchmark metric. This PR also refactors the benchmark computation definitions so they are defined by an enum. Fixes https://github.com/flutter/flutter/issues/151551. --- packages/web_benchmarks/CHANGELOG.md | 3 +- packages/web_benchmarks/lib/client.dart | 2 + .../web_benchmarks/lib/src/computations.dart | 351 ++++++++++++++++++ packages/web_benchmarks/lib/src/metrics.dart | 83 +++++ packages/web_benchmarks/lib/src/recorder.dart | 304 +-------------- packages/web_benchmarks/pubspec.yaml | 2 +- .../test/src/computations_test.dart | 24 ++ .../benchmark/web_benchmarks_test.dart | 21 +- 8 files changed, 485 insertions(+), 305 deletions(-) create mode 100644 packages/web_benchmarks/lib/src/computations.dart create mode 100644 packages/web_benchmarks/test/src/computations_test.dart diff --git a/packages/web_benchmarks/CHANGELOG.md b/packages/web_benchmarks/CHANGELOG.md index 7a5b8d5366da..0096da891242 100644 --- a/packages/web_benchmarks/CHANGELOG.md +++ b/packages/web_benchmarks/CHANGELOG.md @@ -1,8 +1,9 @@ -## 3.1.0-wip +## 3.1.0 * Add `flutter_frame.total_time`, `flutter_frame.build_time`, and `flutter_frame.raster_time` metrics to benchmark results. These values are derived from the Flutter `FrameTiming` API. * Expose a new library `metrics.dart` that contains definitions for the benchmark metrics. +* Add p50, p90, and p95 metrics for benchmark scores. ## 3.0.0 diff --git a/packages/web_benchmarks/lib/client.dart b/packages/web_benchmarks/lib/client.dart index 357d2bb8f196..fc427473e0dd 100644 --- a/packages/web_benchmarks/lib/client.dart +++ b/packages/web_benchmarks/lib/client.dart @@ -10,8 +10,10 @@ import 'dart:math' as math; import 'package:web/web.dart'; import 'src/common.dart'; +import 'src/computations.dart'; import 'src/recorder.dart'; +export 'src/computations.dart'; export 'src/recorder.dart'; /// Signature for a function that creates a [Recorder]. diff --git a/packages/web_benchmarks/lib/src/computations.dart b/packages/web_benchmarks/lib/src/computations.dart new file mode 100644 index 000000000000..99ffcfbc38da --- /dev/null +++ b/packages/web_benchmarks/lib/src/computations.dart @@ -0,0 +1,351 @@ +// Copyright 2013 The Flutter Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +import 'dart:math' as math; + +import 'package:collection/collection.dart'; +import 'package:meta/meta.dart'; + +import 'common.dart'; +import 'metrics.dart'; + +/// Series of time recordings indexed in time order. +/// +/// It can calculate [average], [standardDeviation] and [noise]. If the amount +/// of data collected is higher than [_kMeasuredSampleCount], then these +/// calculations will only apply to the latest [_kMeasuredSampleCount] data +/// points. +class Timeseries { + /// Creates an empty timeseries. + /// + /// [name], [isReported], and [useCustomWarmUp] must not be null. + Timeseries(this.name, this.isReported, {this.useCustomWarmUp = false}) + : _warmUpFrameCount = useCustomWarmUp ? 0 : null; + + /// The label of this timeseries used for debugging and result inspection. + final String name; + + /// Whether this timeseries is reported to the benchmark dashboard. 
+ /// + /// If `true` a new benchmark card is created for the timeseries and is + /// visible on the dashboard. + /// + /// If `false` the data is stored but it does not show up on the dashboard. + /// Use unreported metrics for metrics that are useful for manual inspection + /// but that are too fine-grained to be useful for tracking on the dashboard. + final bool isReported; + + /// Whether to delimit warm-up frames in a custom way. + final bool useCustomWarmUp; + + /// The number of frames ignored as warm-up frames, used only + /// when [useCustomWarmUp] is true. + int? _warmUpFrameCount; + + /// The number of frames ignored as warm-up frames. + int get warmUpFrameCount => + useCustomWarmUp ? _warmUpFrameCount! : count - kMeasuredSampleCount; + + /// List of all the values that have been recorded. + /// + /// This list has no limit. + final List _allValues = []; + + /// The total amount of data collected, including ones that were dropped + /// because of the sample size limit. + int get count => _allValues.length; + + /// Extracts useful statistics out of this timeseries. + /// + /// See [TimeseriesStats] for more details. + TimeseriesStats computeStats() { + final int finalWarmUpFrameCount = warmUpFrameCount; + + assert(finalWarmUpFrameCount >= 0 && finalWarmUpFrameCount < count); + + // The first few values we simply discard and never look at. They're from the warm-up phase. + final List warmUpValues = + _allValues.sublist(0, finalWarmUpFrameCount); + + // Values we analyze. + final List candidateValues = + _allValues.sublist(finalWarmUpFrameCount); + + // The average that includes outliers. + final double dirtyAverage = _computeAverage(name, candidateValues); + + // The standard deviation that includes outliers. + final double dirtyStandardDeviation = + _computeStandardDeviationForPopulation(name, candidateValues); + + // Any value that's higher than this is considered an outlier. + final double outlierCutOff = dirtyAverage + dirtyStandardDeviation; + + // Candidates with outliers removed. + final Iterable cleanValues = + candidateValues.where((double value) => value <= outlierCutOff); + + // Outlier candidates. + final Iterable outliers = + candidateValues.where((double value) => value > outlierCutOff); + + // Final statistics. + final double cleanAverage = _computeAverage(name, cleanValues); + final double standardDeviation = + _computeStandardDeviationForPopulation(name, cleanValues); + final double noise = + cleanAverage > 0.0 ? standardDeviation / cleanAverage : 0.0; + + // Compute outlier average. If there are no outliers the outlier average is + // the same as clean value average. In other words, in a perfect benchmark + // with no noise the difference between average and outlier average is zero, + // which the best possible outcome. Noise produces a positive difference + // between the two. + final double outlierAverage = + outliers.isNotEmpty ? _computeAverage(name, outliers) : cleanAverage; + + // Compute percentile values (e.g. p50, p90, p95). 
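+    // Percentiles are computed over the post-warm-up candidate values with
+    // outliers included, so high percentiles such as p90 and p95 reflect tail
+    // frame times that the outlier-trimmed average does not capture.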
+ final Map percentiles = computePercentiles( + name, + PercentileMetricComputation.percentilesAsDoubles, + candidateValues, + ); + + final List annotatedValues = [ + for (final double warmUpValue in warmUpValues) + AnnotatedSample( + magnitude: warmUpValue, + isOutlier: warmUpValue > outlierCutOff, + isWarmUpValue: true, + ), + for (final double candidate in candidateValues) + AnnotatedSample( + magnitude: candidate, + isOutlier: candidate > outlierCutOff, + isWarmUpValue: false, + ), + ]; + + return TimeseriesStats( + name: name, + average: cleanAverage, + outlierCutOff: outlierCutOff, + outlierAverage: outlierAverage, + standardDeviation: standardDeviation, + noise: noise, + percentiles: percentiles, + cleanSampleCount: cleanValues.length, + outlierSampleCount: outliers.length, + samples: annotatedValues, + ); + } + + /// Adds a value to this timeseries. + void add(double value, {required bool isWarmUpValue}) { + if (value < 0.0) { + throw StateError( + 'Timeseries $name: negative metric values are not supported. Got: $value', + ); + } + _allValues.add(value); + if (useCustomWarmUp && isWarmUpValue) { + _warmUpFrameCount = warmUpFrameCount + 1; + } + } +} + +/// Various statistics about a [Timeseries]. +/// +/// See the docs on the individual fields for more details. +@sealed +class TimeseriesStats { + /// Creates statistics for a time series. + const TimeseriesStats({ + required this.name, + required this.average, + required this.outlierCutOff, + required this.outlierAverage, + required this.standardDeviation, + required this.noise, + required this.percentiles, + required this.cleanSampleCount, + required this.outlierSampleCount, + required this.samples, + }); + + /// The label used to refer to the corresponding timeseries. + final String name; + + /// The average value of the measured samples without outliers. + final double average; + + /// The standard deviation in the measured samples without outliers. + final double standardDeviation; + + /// The noise as a multiple of the [average] value taken from clean samples. + /// + /// This value can be multiplied by 100.0 to get noise as a percentage of + /// the average. + /// + /// If [average] is zero, treats the result as perfect score, returns zero. + final double noise; + + /// The percentile values (p50, p90, p95, etc.) for the measured samples with + /// outliers. + /// + /// This [Map] is from percentile targets (e.g. 0.50 for p50, 0.90 for p90, + /// etc.) to the computed value for the [samples]. + final Map percentiles; + + /// The maximum value a sample can have without being considered an outlier. + /// + /// See [Timeseries.computeStats] for details on how this value is computed. + final double outlierCutOff; + + /// The average of outlier samples. + /// + /// This value can be used to judge how badly we jank, when we jank. + /// + /// Another useful metrics is the difference between [outlierAverage] and + /// [average]. The smaller the value the more predictable is the performance + /// of the corresponding benchmark. + final double outlierAverage; + + /// The number of measured samples after outlier are removed. + final int cleanSampleCount; + + /// The number of outliers. + final int outlierSampleCount; + + /// All collected samples, annotated with statistical information. + /// + /// See [AnnotatedSample] for more details. + final List samples; + + /// Outlier average divided by clean average. + /// + /// This is a measure of performance consistency. The higher this number the + /// worse is jank when it happens. 
Smaller is better, with 1.0 being the
+  /// perfect score. If [average] is zero, this value defaults to 1.0.
+  double get outlierRatio => average > 0.0
+      ? outlierAverage / average
+      : 1.0; // this can only happen in perfect benchmark that reports only zeros
+
+  @override
+  String toString() {
+    final StringBuffer buffer = StringBuffer();
+    buffer.writeln(
+      '$name: (samples: $cleanSampleCount clean/$outlierSampleCount '
+      'outliers/${cleanSampleCount + outlierSampleCount} '
+      'measured/${samples.length} total)',
+    );
+    buffer.writeln(' | average: $average μs');
+    buffer.writeln(' | outlier average: $outlierAverage μs');
+    buffer.writeln(' | outlier/clean ratio: ${outlierRatio}x');
+    buffer.writeln(' | noise: ${_ratioToPercent(noise)}');
+    for (final PercentileMetricComputation metric
+        in PercentileMetricComputation.values) {
+      buffer.writeln(
+          ' | ${metric.name}: ${percentiles[metric.percentile]} μs');
+    }
+    return buffer.toString();
+  }
+}
+
+/// Annotates a single measurement with statistical information.
+@sealed
+class AnnotatedSample {
+  /// Creates an annotated measurement sample.
+  const AnnotatedSample({
+    required this.magnitude,
+    required this.isOutlier,
+    required this.isWarmUpValue,
+  });
+
+  /// The non-negative raw result of the measurement.
+  final double magnitude;
+
+  /// Whether this sample was considered an outlier.
+  final bool isOutlier;
+
+  /// Whether this sample was taken during the warm-up phase.
+  ///
+  /// If this value is `true`, this sample does not participate in
+  /// statistical computations. However, the sample would still be
+  /// shown in the visualization of results so that the benchmark
+  /// can be inspected manually to make sure there's a predictable
+  /// warm-up regression slope.
+  final bool isWarmUpValue;
+}
+
+/// Computes the arithmetic mean (or average) of given [values].
+double _computeAverage(String label, Iterable<double> values) {
+  if (values.isEmpty) {
+    throw StateError(
+        '$label: attempted to compute an average of an empty value list.');
+  }
+
+  final double sum = values.reduce((double a, double b) => a + b);
+  return sum / values.length;
+}
+
+/// Computes population standard deviation.
+///
+/// Unlike sample standard deviation, which divides by N - 1, this divides by N.
+///
+/// See also:
+///
+/// * https://en.wikipedia.org/wiki/Standard_deviation
+double _computeStandardDeviationForPopulation(
+    String label, Iterable<double> population) {
+  if (population.isEmpty) {
+    throw StateError(
+        '$label: attempted to compute the standard deviation of empty population.');
+  }
+  final double mean = _computeAverage(label, population);
+  final double sumOfSquaredDeltas = population.fold(
+    0.0,
+    (double previous, double value) => previous += math.pow(value - mean, 2),
+  );
+  return math.sqrt(sumOfSquaredDeltas / population.length);
+}
+
+String _ratioToPercent(double value) {
+  return '${(value * 100).toStringAsFixed(2)}%';
+}
+
+/// Computes the percentile threshold in [values] for the given [percentiles].
+///
+/// Each value in [percentiles] should be between 0.0 and 1.0.
+///
+/// Returns a [Map] of percentile values to the computed value from [values].
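+///
+/// For example, computing the percentiles `[0.5, 0.95]` over 100 samples
+/// returns a map from 0.5 to the sample at the 50th percentile and from 0.95
+/// to the sample at the 95th percentile.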
+Map computePercentiles( + String label, + List percentiles, + Iterable values, +) { + if (values.isEmpty) { + throw StateError( + '$label: attempted to compute a percentile of an empty value list.', + ); + } + for (final double percentile in percentiles) { + if (percentile < 0.0 || percentile > 1.0) { + throw StateError( + '$label: attempted to compute a percentile for an invalid ' + 'value: $percentile', + ); + } + } + + final List sorted = + values.sorted((double a, double b) => a.compareTo(b)); + final Map computed = {}; + for (final double percentile in percentiles) { + final int percentileIndex = + (sorted.length * percentile).round().clamp(0, sorted.length - 1); + computed[percentile] = sorted[percentileIndex]; + } + + return computed; +} diff --git a/packages/web_benchmarks/lib/src/metrics.dart b/packages/web_benchmarks/lib/src/metrics.dart index e80692ee58b2..5145d73baffb 100644 --- a/packages/web_benchmarks/lib/src/metrics.dart +++ b/packages/web_benchmarks/lib/src/metrics.dart @@ -50,6 +50,89 @@ enum BenchmarkMetric { /// from the Blink trace summary. const String totalUiFrameAverage = 'totalUiFrame.average'; +/// Describes the values computed for each [BenchmarkMetric]. +sealed class BenchmarkMetricComputation { + const BenchmarkMetricComputation(this.name); + + /// The name of each metric computation. + final String name; + + /// The name for the computed value tracking the average value of the measured + /// samples without outliers. + static const NamedMetricComputation average = + NamedMetricComputation._('average'); + + /// The name for the computed value tracking the average of outlier samples. + static const NamedMetricComputation outlierAverage = + NamedMetricComputation._('outlierAverage'); + + /// The name for the computed value tracking the outlier average divided by + /// the clean average. + static const NamedMetricComputation outlierRatio = + NamedMetricComputation._('outlierRatio'); + + /// The name for the computed value tracking the noise as a multiple of the + /// [average] value takes from clean samples. + static const NamedMetricComputation noise = NamedMetricComputation._('noise'); + + /// The name for the computed value tracking the 50th percentile value from + /// the samples with outliers. + static const PercentileMetricComputation p50 = + PercentileMetricComputation._('p50', 0.5); + + /// The name for the computed value tracking the 90th percentile value from + /// the samples with outliers. + static const PercentileMetricComputation p90 = + PercentileMetricComputation._('p90', 0.9); + + /// The name for the computed value tracking the 95th percentile value from + /// the samples with outliers. + static const PercentileMetricComputation p95 = + PercentileMetricComputation._('p95', 0.95); + + /// All of the computed vales for each [BenchmarkMetric]. + static const List values = + [ + average, + outlierAverage, + outlierRatio, + noise, + p50, + p90, + p95, + ]; +} + +/// A [BenchmarkMetricComputation] with a descriptive name. +final class NamedMetricComputation extends BenchmarkMetricComputation { + const NamedMetricComputation._(super.name); +} + +/// A [BenchmarkMetricComputation] describing a percentile (p50, p90, etc.). +final class PercentileMetricComputation extends BenchmarkMetricComputation { + const PercentileMetricComputation._(super.name, this.percentile) + : assert(percentile >= 0.0 && percentile <= 1.0); + + /// The percentile value as a double. + /// + /// This value must be between 0.0 and 1.0. 
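+  ///
+  /// For example, [p90] has a [percentile] of 0.9, and its computed value is
+  /// the 90th-percentile sample of the measured values.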
+ final double percentile; + + /// The percentile [BenchmarkMetricComputation]s computed for each benchmark + /// metric. + static const List values = + [ + BenchmarkMetricComputation.p50, + BenchmarkMetricComputation.p90, + BenchmarkMetricComputation.p95, + ]; + + /// The percentile values as doubles computed for each benchmark metric. + static List percentilesAsDoubles = PercentileMetricComputation.values + .map((PercentileMetricComputation value) => value.percentile) + .toList(); +} + /// The list of expected benchmark metrics for the current compilation mode, as /// determined by the value of [useWasm]. List expectedBenchmarkMetrics({required bool useWasm}) { diff --git a/packages/web_benchmarks/lib/src/recorder.dart b/packages/web_benchmarks/lib/src/recorder.dart index 82a490c942a2..eea86b307319 100644 --- a/packages/web_benchmarks/lib/src/recorder.dart +++ b/packages/web_benchmarks/lib/src/recorder.dart @@ -4,7 +4,6 @@ import 'dart:async'; import 'dart:js_interop'; -import 'dart:math' as math; import 'dart:ui'; import 'dart:ui_web' as ui_web; @@ -18,6 +17,7 @@ import 'package:meta/meta.dart'; import 'package:web/web.dart' as html; import 'common.dart'; +import 'computations.dart'; import 'metrics.dart'; /// The number of samples from warm-up iterations. @@ -597,254 +597,6 @@ class _WidgetBuildRecorderHostState extends State<_WidgetBuildRecorderHost> { } } -/// Series of time recordings indexed in time order. -/// -/// It can calculate [average], [standardDeviation] and [noise]. If the amount -/// of data collected is higher than [_kMeasuredSampleCount], then these -/// calculations will only apply to the latest [_kMeasuredSampleCount] data -/// points. -class Timeseries { - /// Creates an empty timeseries. - /// - /// [name], [isReported], and [useCustomWarmUp] must not be null. - Timeseries(this.name, this.isReported, {this.useCustomWarmUp = false}) - : _warmUpFrameCount = useCustomWarmUp ? 0 : null; - - /// The label of this timeseries used for debugging and result inspection. - final String name; - - /// Whether this timeseries is reported to the benchmark dashboard. - /// - /// If `true` a new benchmark card is created for the timeseries and is - /// visible on the dashboard. - /// - /// If `false` the data is stored but it does not show up on the dashboard. - /// Use unreported metrics for metrics that are useful for manual inspection - /// but that are too fine-grained to be useful for tracking on the dashboard. - final bool isReported; - - /// Whether to delimit warm-up frames in a custom way. - final bool useCustomWarmUp; - - /// The number of frames ignored as warm-up frames, used only - /// when [useCustomWarmUp] is true. - int? _warmUpFrameCount; - - /// The number of frames ignored as warm-up frames. - int get warmUpFrameCount => - useCustomWarmUp ? _warmUpFrameCount! : count - kMeasuredSampleCount; - - /// List of all the values that have been recorded. - /// - /// This list has no limit. - final List _allValues = []; - - /// The total amount of data collected, including ones that were dropped - /// because of the sample size limit. - int get count => _allValues.length; - - /// Extracts useful statistics out of this timeseries. - /// - /// See [TimeseriesStats] for more details. - TimeseriesStats computeStats() { - final int finalWarmUpFrameCount = warmUpFrameCount; - - assert(finalWarmUpFrameCount >= 0 && finalWarmUpFrameCount < count); - - // The first few values we simply discard and never look at. They're from the warm-up phase. 
- final List warmUpValues = - _allValues.sublist(0, finalWarmUpFrameCount); - - // Values we analyze. - final List candidateValues = - _allValues.sublist(finalWarmUpFrameCount); - - // The average that includes outliers. - final double dirtyAverage = _computeAverage(name, candidateValues); - - // The standard deviation that includes outliers. - final double dirtyStandardDeviation = - _computeStandardDeviationForPopulation(name, candidateValues); - - // Any value that's higher than this is considered an outlier. - final double outlierCutOff = dirtyAverage + dirtyStandardDeviation; - - // Candidates with outliers removed. - final Iterable cleanValues = - candidateValues.where((double value) => value <= outlierCutOff); - - // Outlier candidates. - final Iterable outliers = - candidateValues.where((double value) => value > outlierCutOff); - - // Final statistics. - final double cleanAverage = _computeAverage(name, cleanValues); - final double standardDeviation = - _computeStandardDeviationForPopulation(name, cleanValues); - final double noise = - cleanAverage > 0.0 ? standardDeviation / cleanAverage : 0.0; - - // Compute outlier average. If there are no outliers the outlier average is - // the same as clean value average. In other words, in a perfect benchmark - // with no noise the difference between average and outlier average is zero, - // which the best possible outcome. Noise produces a positive difference - // between the two. - final double outlierAverage = - outliers.isNotEmpty ? _computeAverage(name, outliers) : cleanAverage; - - final List annotatedValues = [ - for (final double warmUpValue in warmUpValues) - AnnotatedSample( - magnitude: warmUpValue, - isOutlier: warmUpValue > outlierCutOff, - isWarmUpValue: true, - ), - for (final double candidate in candidateValues) - AnnotatedSample( - magnitude: candidate, - isOutlier: candidate > outlierCutOff, - isWarmUpValue: false, - ), - ]; - - return TimeseriesStats( - name: name, - average: cleanAverage, - outlierCutOff: outlierCutOff, - outlierAverage: outlierAverage, - standardDeviation: standardDeviation, - noise: noise, - cleanSampleCount: cleanValues.length, - outlierSampleCount: outliers.length, - samples: annotatedValues, - ); - } - - /// Adds a value to this timeseries. - void add(double value, {required bool isWarmUpValue}) { - if (value < 0.0) { - throw StateError( - 'Timeseries $name: negative metric values are not supported. Got: $value', - ); - } - _allValues.add(value); - if (useCustomWarmUp && isWarmUpValue) { - _warmUpFrameCount = warmUpFrameCount + 1; - } - } -} - -/// Various statistics about a [Timeseries]. -/// -/// See the docs on the individual fields for more details. -@sealed -class TimeseriesStats { - /// Creates statistics for a time series. - const TimeseriesStats({ - required this.name, - required this.average, - required this.outlierCutOff, - required this.outlierAverage, - required this.standardDeviation, - required this.noise, - required this.cleanSampleCount, - required this.outlierSampleCount, - required this.samples, - }); - - /// The label used to refer to the corresponding timeseries. - final String name; - - /// The average value of the measured samples without outliers. - final double average; - - /// The standard deviation in the measured samples without outliers. - final double standardDeviation; - - /// The noise as a multiple of the [average] value takes from clean samples. - /// - /// This value can be multiplied by 100.0 to get noise as a percentage of - /// the average. 
- /// - /// If [average] is zero, treats the result as perfect score, returns zero. - final double noise; - - /// The maximum value a sample can have without being considered an outlier. - /// - /// See [Timeseries.computeStats] for details on how this value is computed. - final double outlierCutOff; - - /// The average of outlier samples. - /// - /// This value can be used to judge how badly we jank, when we jank. - /// - /// Another useful metrics is the difference between [outlierAverage] and - /// [average]. The smaller the value the more predictable is the performance - /// of the corresponding benchmark. - final double outlierAverage; - - /// The number of measured samples after outlier are removed. - final int cleanSampleCount; - - /// The number of outliers. - final int outlierSampleCount; - - /// All collected samples, annotated with statistical information. - /// - /// See [AnnotatedSample] for more details. - final List samples; - - /// Outlier average divided by clean average. - /// - /// This is a measure of performance consistency. The higher this number the - /// worse is jank when it happens. Smaller is better, with 1.0 being the - /// perfect score. If [average] is zero, this value defaults to 1.0. - double get outlierRatio => average > 0.0 - ? outlierAverage / average - : 1.0; // this can only happen in perfect benchmark that reports only zeros - - @override - String toString() { - final StringBuffer buffer = StringBuffer(); - buffer.writeln( - '$name: (samples: $cleanSampleCount clean/$outlierSampleCount ' - 'outliers/${cleanSampleCount + outlierSampleCount} ' - 'measured/${samples.length} total)', - ); - buffer.writeln(' | average: $average μs'); - buffer.writeln(' | outlier average: $outlierAverage μs'); - buffer.writeln(' | outlier/clean ratio: ${outlierRatio}x'); - buffer.writeln(' | noise: ${_ratioToPercent(noise)}'); - return buffer.toString(); - } -} - -/// Annotates a single measurement with statistical information. -@sealed -class AnnotatedSample { - /// Creates an annotated measurement sample. - const AnnotatedSample({ - required this.magnitude, - required this.isOutlier, - required this.isWarmUpValue, - }); - - /// The non-negative raw result of the measurement. - final double magnitude; - - /// Whether this sample was considered an outlier. - final bool isOutlier; - - /// Whether this sample was taken during the warm-up phase. - /// - /// If this value is `true`, this sample does not participate in - /// statistical computations. However, the sample would still be - /// shown in the visualization of results so that the benchmark - /// can be inspected manually to make sure there's a predictable - /// warm-up regression slope. - final bool isWarmUpValue; -} - /// Base class for a profile collected from running a benchmark. class Profile { /// Creates an empty profile. @@ -942,18 +694,24 @@ class Profile { final Timeseries timeseries = scoreData[key]!; if (timeseries.isReported) { - scoreKeys.add('$key.average'); + scoreKeys.add('$key.${BenchmarkMetricComputation.average.name}'); // Report `outlierRatio` rather than `outlierAverage`, because // the absolute value of outliers is less interesting than the // ratio. 
- scoreKeys.add('$key.outlierRatio'); + scoreKeys.add('$key.${BenchmarkMetricComputation.outlierRatio.name}'); } final TimeseriesStats stats = timeseries.computeStats(); - json['$key.average'] = stats.average; - json['$key.outlierAverage'] = stats.outlierAverage; - json['$key.outlierRatio'] = stats.outlierRatio; - json['$key.noise'] = stats.noise; + json['$key.${BenchmarkMetricComputation.average.name}'] = stats.average; + json['$key.${BenchmarkMetricComputation.outlierAverage.name}'] = + stats.outlierAverage; + json['$key.${BenchmarkMetricComputation.outlierRatio.name}'] = + stats.outlierRatio; + json['$key.${BenchmarkMetricComputation.noise.name}'] = stats.noise; + for (final PercentileMetricComputation metric + in PercentileMetricComputation.values) { + json['$key.${metric.name}'] = stats.percentiles[metric.percentile]; + } } json.addAll(extraData); @@ -985,42 +743,6 @@ class Profile { } } -/// Computes the arithmetic mean (or average) of given [values]. -double _computeAverage(String label, Iterable values) { - if (values.isEmpty) { - throw StateError( - '$label: attempted to compute an average of an empty value list.'); - } - - final double sum = values.reduce((double a, double b) => a + b); - return sum / values.length; -} - -/// Computes population standard deviation. -/// -/// Unlike sample standard deviation, which divides by N - 1, this divides by N. -/// -/// See also: -/// -/// * https://en.wikipedia.org/wiki/Standard_deviation -double _computeStandardDeviationForPopulation( - String label, Iterable population) { - if (population.isEmpty) { - throw StateError( - '$label: attempted to compute the standard deviation of empty population.'); - } - final double mean = _computeAverage(label, population); - final double sumOfSquaredDeltas = population.fold( - 0.0, - (double previous, double value) => previous += math.pow(value - mean, 2), - ); - return math.sqrt(sumOfSquaredDeltas / population.length); -} - -String _ratioToPercent(double value) { - return '${(value * 100).toStringAsFixed(2)}%'; -} - /// Implemented by recorders that use [_RecordingWidgetsBinding] to receive /// frame life-cycle calls. abstract class FrameRecorder { diff --git a/packages/web_benchmarks/pubspec.yaml b/packages/web_benchmarks/pubspec.yaml index 54ca82807e41..1ee5ad4aebb1 100644 --- a/packages/web_benchmarks/pubspec.yaml +++ b/packages/web_benchmarks/pubspec.yaml @@ -2,7 +2,7 @@ name: web_benchmarks description: A benchmark harness for performance-testing Flutter apps in Chrome. repository: https://github.com/flutter/packages/tree/main/packages/web_benchmarks issue_tracker: https://github.com/flutter/flutter/issues?q=is%3Aissue+is%3Aopen+label%3A%22p%3A+web_benchmarks%22 -version: 3.1.0-wip +version: 3.1.0 environment: sdk: ^3.3.0 diff --git a/packages/web_benchmarks/test/src/computations_test.dart b/packages/web_benchmarks/test/src/computations_test.dart new file mode 100644 index 000000000000..94fe44079431 --- /dev/null +++ b/packages/web_benchmarks/test/src/computations_test.dart @@ -0,0 +1,24 @@ +// Copyright 2013 The Flutter Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +import 'package:flutter_test/flutter_test.dart'; +import 'package:web_benchmarks/src/computations.dart'; + +void main() { + group('computations', () { + test('computePercentiles', () { + final Map computed = computePercentiles( + 'test', + [0.0, 0.5, 0.9, 0.95, 1.0], + List.generate(100, (int i) => i.toDouble()), + ); + expect(computed.length, 5); + expect(computed[0.0], 0.0); + expect(computed[0.5], 50.0); + expect(computed[0.9], 90.0); + expect(computed[0.95], 95.0); + expect(computed[1.0], 99.0); + }); + }); +} diff --git a/packages/web_benchmarks/testing/test_app/benchmark/web_benchmarks_test.dart b/packages/web_benchmarks/testing/test_app/benchmark/web_benchmarks_test.dart index a6900a126d31..6a2990846478 100644 --- a/packages/web_benchmarks/testing/test_app/benchmark/web_benchmarks_test.dart +++ b/packages/web_benchmarks/testing/test_app/benchmark/web_benchmarks_test.dart @@ -98,22 +98,19 @@ Future _runBenchmarks({ for (final String benchmarkName in benchmarkNames) { for (final String metricName in expectedMetrics) { - for (final String valueName in [ - 'average', - 'outlierAverage', - 'outlierRatio', - 'noise', - ]) { + for (final BenchmarkMetricComputation computation + in BenchmarkMetricComputation.values) { expect( - taskResult.scores[benchmarkName]!.where((BenchmarkScore score) => - score.metric == '$metricName.$valueName'), - hasLength(1), - ); + taskResult.scores[benchmarkName]!.where((BenchmarkScore score) => + score.metric == '$metricName.${computation.name}'), + hasLength(1), + reason: 'Expected to find a metric named ' + '$metricName.${computation.name}'); } } expect( - taskResult.scores[benchmarkName]!.where( - (BenchmarkScore score) => score.metric == 'totalUiFrame.average'), + taskResult.scores[benchmarkName]! + .where((BenchmarkScore score) => score.metric == totalUiFrameAverage), hasLength(1), ); }
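For consumers of these new scores, the lookup pattern used in the test above generalizes: every computed value is published under a `<metric name>.<computation name>` key (for example `drawFrameDuration.p90`), both in the profile JSON and in the `BenchmarkScore` list returned by the benchmark server. The sketch below is illustrative only and is not part of this patch; it assumes `BenchmarkResults` and `BenchmarkScore` are available from `package:web_benchmarks/server.dart` and `BenchmarkMetricComputation` from the `metrics.dart` library introduced in 3.1.0. The helper name `readP90` and the example metric string are invented for the example.

import 'package:web_benchmarks/metrics.dart';
import 'package:web_benchmarks/server.dart';

/// Reads the p90 score reported for [metricName] within [benchmarkName],
/// or returns null if no such score was recorded.
double? readP90(
  BenchmarkResults results,
  String benchmarkName,
  String metricName, // e.g. 'drawFrameDuration' (illustrative).
) {
  // Computed values are keyed as '<metric>.<computation>', e.g. '<metric>.p90'.
  final String key = '$metricName.${BenchmarkMetricComputation.p90.name}';
  for (final BenchmarkScore score
      in results.scores[benchmarkName] ?? const <BenchmarkScore>[]) {
    if (score.metric == key) {
      return score.value.toDouble();
    }
  }
  return null;
}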