JuliaCI · mcabbott · Sep 18, 2021 · Sep 18, 2021 · Sep 18, 2021 · Oct 2, 2021
diff --git a/docs/src/index.md b/docs/src/index.md
@@ -33,7 +33,7 @@ For quick sanity checks, one can use the [`@btime` macro](https://github.com/Jul
 
 ```julia
 julia> @btime sin(x) setup=(x=rand())
-  4.361 ns (0 allocations: 0 bytes)
+  min 3.041 ns, mean 3.487 ns (0 allocations)
 0.49587200950472454
 ```
 
@@ -45,16 +45,16 @@ If the expression you want to benchmark depends on external variables, you shoul
 Essentially, any interpolated variable `$x` or expression `$(...)` is "pre-computed" before benchmarking begins:
 
 ```julia
-julia> A = rand(3,3);
+julia> A = rand(8,8);
 
 julia> @btime inv($A);            # we interpolate the global variable A with $A
-  1.191 μs (10 allocations: 2.31 KiB)
+  min 951.000 ns, mean 1.349 μs (4 allocations, 4.81 KiB)
 
-julia> @btime inv($(rand(3,3)));  # interpolation: the rand(3,3) call occurs before benchmarking
-  1.192 μs (10 allocations: 2.31 KiB)
+julia> @btime inv($(rand(8,8)));  # interpolation: the rand() call occurs before benchmarking
+  min 930.542 ns, mean 1.363 μs (4 allocations, 4.81 KiB)
 
-julia> @btime inv(rand(3,3));     # the rand(3,3) call is included in the benchmark time
-  1.295 μs (11 allocations: 2.47 KiB)
+julia> @btime inv(rand(8,8));     # the rand() call is included in the benchmark time
+  min 1.092 μs, mean 1.572 μs (5 allocations, 5.38 KiB)
 ```
 
 Sometimes, interpolating variables into very simple expressions can give the compiler more information than you intended, causing it to "cheat" the benchmark by hoisting the calculation out of the benchmark code
@@ -63,13 +63,13 @@ julia> a = 1; b = 2
 2
 
 julia> @btime $a + $b
-  0.024 ns (0 allocations: 0 bytes)
+  min 0.875 ns, mean 0.950 ns (0 allocations)
 3
 ```
 As a rule of thumb, if a benchmark reports that it took less than a nanosecond to perform, this hoisting probably occured. You can avoid this by referencing and dereferencing the interpolated variables 
 ```julia
 julia> @btime $(Ref(a))[] + $(Ref(b))[]
-  1.277 ns (0 allocations: 0 bytes)
+  min 0.875 ns, mean 0.953 ns (0 allocations)
 3
 ```
 

diff --git a/docs/src/manual.md b/docs/src/manual.md
@@ -65,12 +65,15 @@ before returning the value of the expression, while `@belapsed`
 returns the minimum time in seconds.
 
 ```julia
-julia> @btime sin(1)
-  13.612 ns (0 allocations: 0 bytes)
-0.8414709848078965
+julia> @btime sin.(0:2)
+  min 25.687 ns, mean 28.266 ns (1 allocation, 80 bytes)
+3-element Vector{Float64}:
+ 0.0
+ 0.8414709848078965
+ 0.9092974268256817
 
-julia> @belapsed sin(1)
-1.3614228456913828e-8
+julia> @belapsed sin.(0:2)
+2.5727911646586344e-8
 ```
 
 ### Benchmark `Parameters`
@@ -280,13 +283,13 @@ julia> a = 1; b = 2
 2
 
 julia> @btime $a + $b
-  0.024 ns (0 allocations: 0 bytes)
+  min 0.833 ns, mean 0.944 ns (0 allocations)
 3
 ```
 in this case julia was able to use the properties of `+(::Int, ::Int)` to know that it could safely replace `$a + $b` with `3` at compile time. We can stop the optimizer from doing this by referencing and dereferencing the interpolated variables  
 ```julia
 julia> @btime $(Ref(a))[] + $(Ref(b))[]
-  1.277 ns (0 allocations: 0 bytes)
+  min 0.875 ns, mean 0.948 ns (0 allocations)
 3
 ```
 

diff --git a/src/execution.jl b/src/execution.jl
@@ -554,28 +554,50 @@ this executes an expression, printing the time
 it took to execute and the memory allocated before
 returning the value of the expression.
 
-Unlike `@time`, it uses the `@benchmark`
-macro, and accepts all of the same additional
-parameters as `@benchmark`.  The printed time
-is the *minimum* elapsed time measured during the benchmark.
+However, it uses the [`@benchmark`](@ref) to evaluate
+`expression` many times, and accepts all of the same
+additional parameters as `@benchmark`. It prints both
+the minimum and the mean elapsed time, plus information
+about allocations (if any) and time spent on
+memory garbage collection (GC).
+
+# Examples
+```
+julia> @btime log(x[]) setup=(x=Ref(2.0))
+  min 3.666 ns, mean 3.772 ns (0 allocations)
+0.6931471805599453
+
+julia> @btime sum(log, \$(fill(2.0, 1000)))
+  min 3.391 μs, mean 3.441 μs (0 allocations)
+693.1471805599322
+
+julia> @btime rand(1000);
+  min 724.432 ns, mean 1.462 μs (1 allocation, 7.94 KiB. GC mean 352 ns, 24.11%)
+```
 """
 macro btime(args...)
     _, params = prunekwargs(args...)
     bench, trial, result = gensym(), gensym(), gensym()
-    trialmin, trialallocs = gensym(), gensym()
+    trialmin, trialmean, trialallocs = gensym(), gensym(), gensym()
     tune_phase = hasevals(params) ? :() : :($BenchmarkTools.tune!($bench))
     return esc(quote
         local $bench = $BenchmarkTools.@benchmarkable $(args...)
         $BenchmarkTools.warmup($bench)
         $tune_phase
         local $trial, $result = $BenchmarkTools.run_result($bench)
         local $trialmin = $BenchmarkTools.minimum($trial)
+        local $trialmean = $BenchmarkTools.mean($trial)
         local $trialallocs = $BenchmarkTools.allocs($trialmin)
-        println("  ",
-                $BenchmarkTools.prettytime($BenchmarkTools.time($trialmin)),
-                " (", $trialallocs , " allocation",
-                $trialallocs == 1 ? "" : "s", ": ",
-                $BenchmarkTools.prettymemory($BenchmarkTools.memory($trialmin)), ")")
+        $print("  min ")
+        $printstyled($BenchmarkTools.prettytime($BenchmarkTools.time($trialmin)); bold=true)
+        $print(", ")
+        $print("mean ")
+        $printstyled($BenchmarkTools.prettytime($BenchmarkTools.time($trialmean)); bold=true)
+        $print(" (", $trialallocs , " allocation", $trialallocs == 1 ? "" : "s")
+        if $trialallocs != 0
+            $print(", ", $prettymemory($memory($trialmin)))
+        end
+        $println(")")
         $result
     end)
 end

diff --git a/src/trials.jl b/src/trials.jl
@@ -253,7 +253,7 @@ function prettydiff(p)
     return string(diff >= 0.0 ? "+" : "", @sprintf("%.2f", diff * 100), "%")
 end
 
-function prettytime(t)
+function prettytime(t; short=false)
     if t < 1e3
         value, units = t, "ns"
     elseif t < 1e6
@@ -263,7 +263,11 @@ function prettytime(t)
     else
         value, units = t / 1e9, "s"
     end
-    return string(@sprintf("%.3f", value), " ", units)
+    if short
+        string(@sprintf("%.0f", value), " ", units)  # "%.3g" also OK, always 3 numbers
+    else
+        string(@sprintf("%.3f", value), " ", units)
+    end
 end
 
 function prettymemory(b)

diff --git a/test/ExecutionTests.jl b/test/ExecutionTests.jl
@@ -227,6 +227,7 @@ str = String(take!(io))
 
 let fname = tempname()
     try
+        # simple function, zero allocations
         ret = open(fname, "w") do f
             redirect_stdout(f) do
                 x = 1
@@ -238,7 +239,21 @@ let fname = tempname()
         end
         s = read(fname, String)
         try
-            @test occursin(r"[0-9.]+ \w*s \([0-9]* allocations?: [0-9]+ bytes\)", s)
+            @test occursin(r"min [0-9.]+ \w*s, mean [0-9.]+ \w*s \(0 allocations\)", s)
+        catch
+            println(stderr, "@btime output didn't match ", repr(s))
+            rethrow()
+        end
+        # function which allocates
+        ret2 = open(fname, "w") do f
+            redirect_stdout(f) do
+                y = @btime(sum(log, ones(100)))
+                @test y ≈ 0
+            end
+        end
+        s2 = read(fname, String)
+        try
+            @test occursin(r", mean [0-9.]+ \w*s \([0-9]* allocations?, [0-9]+ bytes", s2)
         catch
             println(stderr, "@btime output didn't match ", repr(s))
             rethrow()