diff --git a/perfkitbenchmarker/benchmarks/netperf_benchmark.py b/perfkitbenchmarker/benchmarks/netperf_benchmark.py
index 7da65e410d..5fdb232ec2 100644
--- a/perfkitbenchmarker/benchmarks/netperf_benchmark.py
+++ b/perfkitbenchmarker/benchmarks/netperf_benchmark.py
@@ -31,11 +31,22 @@ from perfkitbenchmarker import vm_util
 from perfkitbenchmarker.packages import netperf
 
+flags.DEFINE_integer('netperf_max_iter', None,
+                     'Maximum number of iterations to run during '
+                     'confidence interval estimation. If unset, '
+                     'a single iteration will be run.',
+                     lower_bound=3, upper_bound=30)
+
+flags.DEFINE_integer('netperf_test_length', 60,
+                     'netperf test length, in seconds',
+                     lower_bound=1)
+
+
 FLAGS = flags.FLAGS
 
 BENCHMARK_INFO = {'name': 'netperf',
                   'description': 'Run TCP_RR, TCP_CRR, UDP_RR and TCP_STREAM '
-                                 'Netperf benchmarks',
+                                 'netperf benchmarks',
                   'scratch_disk': False,
                   'num_machines': 2}
 
@@ -90,15 +101,24 @@ def RunNetperf(vm, benchmark_name, server_ip):
   # Flags:
   # -o specifies keys to include in CSV output.
   # -j keeps additional latency numbers
+  # -I specifies the confidence % and width - here 99% confidence that the true
+  #    value is within +/- 2.5% of the reported value
+  # -i specifies the maximum and minimum number of iterations.
+  confidence = ('-I 99,5 -i {0},3'.format(FLAGS.netperf_max_iter)
+                if FLAGS.netperf_max_iter else '')
   netperf_cmd = ('{netperf_path} -p {command_port} -j '
-                 '-t {benchmark_name} -H {server_ip} -- '
+                 '-t {benchmark_name} -H {server_ip} -l {length} {confidence} '
+                 ' -- '
                  '-P {data_port} '
                  '-o THROUGHPUT,THROUGHPUT_UNITS,P50_LATENCY,P90_LATENCY,'
-                 'P99_LATENCY').format(
+                 'P99_LATENCY,STDDEV_LATENCY,'
+                 'CONFIDENCE_ITERATION,THROUGHPUT_CONFID').format(
                      netperf_path=netperf.NETPERF_PATH,
                      benchmark_name=benchmark_name,
                      server_ip=server_ip,
                      command_port=COMMAND_PORT,
-                     data_port=DATA_PORT)
+                     data_port=DATA_PORT,
+                     length=FLAGS.netperf_test_length,
+                     confidence=confidence)
   stdout, _ = vm.RemoteCommand(netperf_cmd, should_log=True)
 
   fp = io.StringIO(stdout)
@@ -118,7 +138,13 @@ def RunNetperf(vm, benchmark_name, server_ip):
   else:
     metric = '%s_Transaction_Rate' % benchmark_name
 
-  samples = [sample.Sample(metric, value, unit)]
+  meta_keys = [('Confidence Iterations Run', 'confidence_iter'),
+               ('Throughput Confidence Width (%)', 'confidence_width_percent')]
+  metadata = {meta_key: row[np_key] for np_key, meta_key in meta_keys}
+  metadata.update(netperf_test_length=FLAGS.netperf_test_length,
+                  max_iter=FLAGS.netperf_max_iter or 1)
+
+  samples = [sample.Sample(metric, value, unit, metadata)]
 
   # No tail latency for throughput.
   if unit == MBPS:
@@ -127,11 +153,11 @@ def RunNetperf(vm, benchmark_name, server_ip):
   for metric_key, metric_name in [
       ('50th Percentile Latency Microseconds', 'p50'),
       ('90th Percentile Latency Microseconds', 'p90'),
-      ('99th Percentile Latency Microseconds', 'p99')]:
+      ('99th Percentile Latency Microseconds', 'p99'),
+      ('Stddev Latency Microseconds', 'stddev')]:
     samples.append(
         sample.Sample('%s_Latency_%s' % (benchmark_name, metric_name),
-                      float(row[metric_key]),
-                      'us'))
+                      float(row[metric_key]), 'us', metadata))
 
   return samples
 
diff --git a/tests/benchmarks/netperf_benchmark_test.py b/tests/benchmarks/netperf_benchmark_test.py
index 7aa8aea456..e6bcd423c2 100644
--- a/tests/benchmarks/netperf_benchmark_test.py
+++ b/tests/benchmarks/netperf_benchmark_test.py
@@ -58,7 +58,6 @@ def testExternalAndInternal(self):
 
     result = netperf_benchmark.Run(vm_spec)
 
-    self.assertEqual(26, len(result))
     tps = 'transactions_per_second'
     mbps = 'Mbits/sec'
     self.assertListEqual(
@@ -66,36 +65,42 @@ def testExternalAndInternal(self):
         ('TCP_RR_Latency_p50', 683.0, 'us'),
         ('TCP_RR_Latency_p90', 735.0, 'us'),
         ('TCP_RR_Latency_p99', 841.0, 'us'),
+        ('TCP_RR_Latency_stddev', 783.80, 'us'),
         ('TCP_RR_Transaction_Rate', 3545.77, tps),
         ('TCP_RR_Latency_p50', 274.0, 'us'),
         ('TCP_RR_Latency_p90', 309.0, 'us'),
         ('TCP_RR_Latency_p99', 371.0, 'us'),
+        ('TCP_RR_Latency_stddev', 189.82, 'us'),
         ('TCP_CRR_Transaction_Rate', 343.35, tps),
         ('TCP_CRR_Latency_p50', 2048.0, 'us'),
         ('TCP_CRR_Latency_p90', 2372.0, 'us'),
         ('TCP_CRR_Latency_p99', 30029.0, 'us'),
+        ('TCP_CRR_Latency_stddev', 8147.88, 'us'),
         ('TCP_CRR_Transaction_Rate', 1078.07, tps),
         ('TCP_CRR_Latency_p50', 871.0, 'us'),
         ('TCP_CRR_Latency_p90', 996.0, 'us'),
         ('TCP_CRR_Latency_p99', 2224.0, 'us'),
+        ('TCP_CRR_Latency_stddev', 551.07, 'us'),
         ('TCP_STREAM_Throughput', 1187.94, mbps),
         ('TCP_STREAM_Throughput', 1973.37, mbps),
         ('UDP_RR_Transaction_Rate', 1359.71, tps),
         ('UDP_RR_Latency_p50', 700.0, 'us'),
         ('UDP_RR_Latency_p90', 757.0, 'us'),
         ('UDP_RR_Latency_p99', 891.0, 'us'),
+        ('UDP_RR_Latency_stddev', 808.44, 'us'),
         ('UDP_RR_Transaction_Rate', 3313.49, tps),
         ('UDP_RR_Latency_p50', 295.0, 'us'),
         ('UDP_RR_Latency_p90', 330.0, 'us'),
-        ('UDP_RR_Latency_p99', 406.0, 'us')],
+        ('UDP_RR_Latency_p99', 406.0, 'us'),
+        ('UDP_RR_Latency_stddev', 214.64, 'us')],
         [i[:3] for i in result])
 
     external_meta = {'ip_type': 'external'}
     internal_meta = {'ip_type': 'internal'}
-    expected_meta = (([external_meta] * 4 + [internal_meta] * 4) * 2 +
+    expected_meta = (([external_meta] * 5 + [internal_meta] * 5) * 2 +
                      [external_meta, internal_meta] +
-                     [external_meta] * 4 +
-                     [internal_meta] * 4)
+                     [external_meta] * 5 +
+                     [internal_meta] * 5)
 
     for i, meta in enumerate(expected_meta):
       self.assertIsInstance(result[i][3], dict)
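
Note (not part of the patch): a minimal sketch of the command line the modified RunNetperf
assembles, assuming --netperf_max_iter=10, the default --netperf_test_length of 60 seconds,
and placeholder host and port values; only the -l, -I, -i flags and the extra -o output
selectors come from the patch itself.

    # Illustrative sketch only; the host and ports below are placeholders.
    max_iter = 10     # hypothetical --netperf_max_iter value
    test_length = 60  # default --netperf_test_length
    confidence = '-I 99,5 -i {0},3'.format(max_iter) if max_iter else ''
    netperf_cmd = ('netperf -p 20000 -j -t TCP_RR -H 10.0.0.2 '
                   '-l {length} {confidence}  -- -P 20001 '
                   '-o THROUGHPUT,THROUGHPUT_UNITS,P50_LATENCY,P90_LATENCY,'
                   'P99_LATENCY,STDDEV_LATENCY,'
                   'CONFIDENCE_ITERATION,THROUGHPUT_CONFID').format(
                       length=test_length, confidence=confidence)
    print(netperf_cmd)
    # -> netperf -p 20000 -j -t TCP_RR -H 10.0.0.2 -l 60 -I 99,5 -i 10,3  -- -P 20001 -o ...
    # netperf repeats the test between 3 and 10 times until the 99% confidence interval
    # is within +/- 2.5% of the reported value, and the extra -o selectors surface the
    # STDDEV_LATENCY, CONFIDENCE_ITERATION and THROUGHPUT_CONFID columns parsed above.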