diff --git a/sql/hive/benchmarks/OrcReadBenchmark-results.txt b/sql/hive/benchmarks/OrcReadBenchmark-results.txt new file mode 100644 index 0000000000000..c77f966723d71 --- /dev/null +++ b/sql/hive/benchmarks/OrcReadBenchmark-results.txt @@ -0,0 +1,173 @@ +================================================================================================ +SQL Single Numeric Column Scan +================================================================================================ + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +SQL Single TINYINT Column Scan: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +Native ORC MR 1630 / 1639 9.7 103.6 1.0X +Native ORC Vectorized 253 / 288 62.2 16.1 6.4X +Native ORC Vectorized with copy 227 / 244 69.2 14.5 7.2X +Hive built-in ORC 1980 / 1991 7.9 125.9 0.8X + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +SQL Single SMALLINT Column Scan: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +Native ORC MR 1587 / 1589 9.9 100.9 1.0X +Native ORC Vectorized 227 / 242 69.2 14.5 7.0X +Native ORC Vectorized with copy 228 / 238 69.0 14.5 7.0X +Hive built-in ORC 2323 / 2332 6.8 147.7 0.7X + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +SQL Single INT Column Scan: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +Native ORC MR 1726 / 1771 9.1 109.7 1.0X +Native ORC Vectorized 309 / 333 50.9 19.7 5.6X +Native ORC Vectorized with copy 313 / 321 50.2 19.9 5.5X +Hive built-in ORC 2668 / 2672 5.9 169.6 0.6X + +OpenJDK 64-Bit Server VM 
1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +SQL Single BIGINT Column Scan: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +Native ORC MR 1722 / 1747 9.1 109.5 1.0X +Native ORC Vectorized 395 / 403 39.8 25.1 4.4X +Native ORC Vectorized with copy 399 / 405 39.4 25.4 4.3X +Hive built-in ORC 2767 / 2777 5.7 175.9 0.6X + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +SQL Single FLOAT Column Scan: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +Native ORC MR 1797 / 1824 8.8 114.2 1.0X +Native ORC Vectorized 434 / 441 36.2 27.6 4.1X +Native ORC Vectorized with copy 437 / 447 36.0 27.8 4.1X +Hive built-in ORC 2701 / 2710 5.8 171.7 0.7X + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +SQL Single DOUBLE Column Scan: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +Native ORC MR 1931 / 2028 8.1 122.8 1.0X +Native ORC Vectorized 542 / 557 29.0 34.5 3.6X +Native ORC Vectorized with copy 550 / 564 28.6 35.0 3.5X +Hive built-in ORC 2816 / 3206 5.6 179.1 0.7X + + +================================================================================================ +Int and String Scan +================================================================================================ + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Int and String Scan: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +Native ORC MR 4012 / 4068 2.6 382.6 
1.0X +Native ORC Vectorized 2337 / 2339 4.5 222.9 1.7X +Native ORC Vectorized with copy 2520 / 2540 4.2 240.3 1.6X +Hive built-in ORC 5503 / 5575 1.9 524.8 0.7X + + +================================================================================================ +Partitioned Table Scan +================================================================================================ + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Partitioned Table: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +Data column - Native ORC MR 2020 / 2025 7.8 128.4 1.0X +Data column - Native ORC Vectorized 398 / 409 39.5 25.3 5.1X +Data column - Native ORC Vectorized with copy 406 / 411 38.8 25.8 5.0X +Data column - Hive built-in ORC 2967 / 2969 5.3 188.6 0.7X +Partition column - Native ORC MR 1494 / 1505 10.5 95.0 1.4X +Partition column - Native ORC Vectorized 73 / 82 216.3 4.6 27.8X +Partition column - Native ORC Vectorized with copy 71 / 80 221.4 4.5 28.4X +Partition column - Hive built-in ORC 1932 / 1937 8.1 122.8 1.0X +Both columns - Native ORC MR 2057 / 2071 7.6 130.8 1.0X +Both columns - Native ORC Vectorized 445 / 448 35.4 28.3 4.5X +Both column - Native ORC Vectorized with copy 534 / 539 29.4 34.0 3.8X +Both columns - Hive built-in ORC 2994 / 2994 5.3 190.3 0.7X + + +================================================================================================ +Repeated String Scan +================================================================================================ + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Repeated String: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +Native ORC MR 1771 / 1785 5.9 168.9 1.0X +Native 
ORC Vectorized 372 / 375 28.2 35.5 4.8X +Native ORC Vectorized with copy 543 / 576 19.3 51.8 3.3X +Hive built-in ORC 2671 / 2671 3.9 254.7 0.7X + + +================================================================================================ +String with Nulls Scan +================================================================================================ + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +String with Nulls Scan (0.0%): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +Native ORC MR 3276 / 3302 3.2 312.5 1.0X +Native ORC Vectorized 1057 / 1080 9.9 100.8 3.1X +Native ORC Vectorized with copy 1420 / 1431 7.4 135.4 2.3X +Hive built-in ORC 5377 / 5407 2.0 512.8 0.6X + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +String with Nulls Scan (0.5%): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +Native ORC MR 3147 / 3147 3.3 300.1 1.0X +Native ORC Vectorized 1305 / 1319 8.0 124.4 2.4X +Native ORC Vectorized with copy 1685 / 1686 6.2 160.7 1.9X +Hive built-in ORC 4077 / 4085 2.6 388.8 0.8X + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +String with Nulls Scan (0.95%): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +Native ORC MR 1739 / 1744 6.0 165.8 1.0X +Native ORC Vectorized 500 / 501 21.0 47.7 3.5X +Native ORC Vectorized with copy 618 / 631 17.0 58.9 2.8X +Hive built-in ORC 2411 / 2427 4.3 229.9 0.7X + + +================================================================================================ +Single Column Scan From Wide Columns 
+================================================================================================ + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Single Column Scan from 100 columns: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +Native ORC MR 1348 / 1366 0.8 1285.3 1.0X +Native ORC Vectorized 119 / 134 8.8 113.5 11.3X +Native ORC Vectorized with copy 119 / 148 8.8 113.9 11.3X +Hive built-in ORC 487 / 507 2.2 464.8 2.8X + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Single Column Scan from 200 columns: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +Native ORC MR 2667 / 2837 0.4 2543.6 1.0X +Native ORC Vectorized 203 / 222 5.2 193.4 13.2X +Native ORC Vectorized with copy 217 / 255 4.8 207.0 12.3X +Hive built-in ORC 737 / 741 1.4 702.4 3.6X + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Single Column Scan from 300 columns: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +Native ORC MR 3954 / 3956 0.3 3770.4 1.0X +Native ORC Vectorized 348 / 360 3.0 331.7 11.4X +Native ORC Vectorized with copy 349 / 359 3.0 333.2 11.3X +Hive built-in ORC 1057 / 1067 1.0 1008.0 3.7X + + diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcReadBenchmark.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcReadBenchmark.scala index 49de007df3828..0bb5e8c141595 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcReadBenchmark.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcReadBenchmark.scala @@ -22,20 +22,26 
@@ import java.io.File import scala.util.Random import org.apache.spark.SparkConf -import org.apache.spark.benchmark.Benchmark +import org.apache.spark.benchmark.{Benchmark, BenchmarkBase} import org.apache.spark.sql.{DataFrame, SparkSession} import org.apache.spark.sql.catalyst.plans.SQLHelper import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types._ - /** * Benchmark to measure ORC read performance. + * {{{ + * To run this benchmark: + * 1. without sbt: bin/spark-submit --class <this class> <spark hive test jar> + * 2. build/sbt "hive/test:runMain <this class>" + * 3. generate result: SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "hive/test:runMain <this class>" + * Results will be written to "benchmarks/OrcReadBenchmark-results.txt". + * }}} * * This is in `sql/hive` module in order to compare `sql/core` and `sql/hive` ORC data sources. */ // scalastyle:off line.size.limit -object OrcReadBenchmark extends SQLHelper { +object OrcReadBenchmark extends BenchmarkBase with SQLHelper { val conf = new SparkConf() conf.set("orc.compression", "snappy") @@ -69,7 +75,7 @@ object OrcReadBenchmark extends SQLHelper { } def numericScanBenchmark(values: Int, dataType: DataType): Unit = { - val benchmark = new Benchmark(s"SQL Single ${dataType.sql} Column Scan", values) + val benchmark = new Benchmark(s"SQL Single ${dataType.sql} Column Scan", values, output = output) withTempPath { dir => withTempTable("t1", "nativeOrcTable", "hiveOrcTable") { @@ -98,59 +104,13 @@ object OrcReadBenchmark extends SQLHelper { spark.sql("SELECT sum(id) FROM hiveOrcTable").collect() } - /* - Java HotSpot(TM) 64-Bit Server VM 1.8.0_60-b27 on Mac OS X 10.13.1 - Intel(R) Core(TM) i7-4960HQ CPU @ 2.60GHz - - SQL Single TINYINT Column Scan: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------------ - Native ORC MR 1135 / 1171 13.9 72.2 1.0X - Native ORC Vectorized 152 / 163 103.4 9.7 7.5X - Native ORC Vectorized with copy 149 / 162 105.4 9.5 7.6X - Hive 
built-in ORC 1380 / 1384 11.4 87.7 0.8X - - SQL Single SMALLINT Column Scan: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------------ - Native ORC MR 1182 / 1244 13.3 75.2 1.0X - Native ORC Vectorized 145 / 156 108.7 9.2 8.2X - Native ORC Vectorized with copy 148 / 158 106.4 9.4 8.0X - Hive built-in ORC 1591 / 1636 9.9 101.2 0.7X - - SQL Single INT Column Scan: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------------ - Native ORC MR 1271 / 1271 12.4 80.8 1.0X - Native ORC Vectorized 206 / 212 76.3 13.1 6.2X - Native ORC Vectorized with copy 200 / 213 78.8 12.7 6.4X - Hive built-in ORC 1776 / 1787 8.9 112.9 0.7X - - SQL Single BIGINT Column Scan: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------------ - Native ORC MR 1344 / 1355 11.7 85.4 1.0X - Native ORC Vectorized 258 / 268 61.0 16.4 5.2X - Native ORC Vectorized with copy 252 / 257 62.4 16.0 5.3X - Hive built-in ORC 1818 / 1823 8.7 115.6 0.7X - - SQL Single FLOAT Column Scan: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------------ - Native ORC MR 1333 / 1352 11.8 84.8 1.0X - Native ORC Vectorized 310 / 324 50.7 19.7 4.3X - Native ORC Vectorized with copy 312 / 320 50.4 19.9 4.3X - Hive built-in ORC 1904 / 1918 8.3 121.0 0.7X - - SQL Single DOUBLE Column Scan: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------------ - Native ORC MR 1408 / 1585 11.2 89.5 1.0X - Native ORC Vectorized 359 / 368 43.8 22.8 3.9X - Native ORC Vectorized with copy 364 / 371 43.2 23.2 3.9X - Hive built-in ORC 1881 / 1954 8.4 119.6 0.7X - */ benchmark.run() } } } def 
intStringScanBenchmark(values: Int): Unit = { - val benchmark = new Benchmark("Int and String Scan", values) + val benchmark = new Benchmark("Int and String Scan", values, output = output) withTempPath { dir => withTempTable("t1", "nativeOrcTable", "hiveOrcTable") { @@ -181,24 +141,13 @@ object OrcReadBenchmark extends SQLHelper { spark.sql("SELECT sum(c1), sum(length(c2)) FROM hiveOrcTable").collect() } - /* - Java HotSpot(TM) 64-Bit Server VM 1.8.0_60-b27 on Mac OS X 10.13.1 - Intel(R) Core(TM) i7-4960HQ CPU @ 2.60GHz - - Int and String Scan: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------------ - Native ORC MR 2566 / 2592 4.1 244.7 1.0X - Native ORC Vectorized 1098 / 1113 9.6 104.7 2.3X - Native ORC Vectorized with copy 1527 / 1593 6.9 145.6 1.7X - Hive built-in ORC 3561 / 3705 2.9 339.6 0.7X - */ benchmark.run() } } } def partitionTableScanBenchmark(values: Int): Unit = { - val benchmark = new Benchmark("Partitioned Table", values) + val benchmark = new Benchmark("Partitioned Table", values, output = output) withTempPath { dir => withTempTable("t1", "nativeOrcTable", "hiveOrcTable") { @@ -267,32 +216,13 @@ object OrcReadBenchmark extends SQLHelper { spark.sql("SELECT sum(p), sum(id) FROM hiveOrcTable").collect() } - /* - Java HotSpot(TM) 64-Bit Server VM 1.8.0_60-b27 on Mac OS X 10.13.1 - Intel(R) Core(TM) i7-4960HQ CPU @ 2.60GHz - - Partitioned Table: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------------ - Data only - Native ORC MR 1447 / 1457 10.9 92.0 1.0X - Data only - Native ORC Vectorized 256 / 266 61.4 16.3 5.6X - Data only - Native ORC Vectorized with copy 263 / 273 59.8 16.7 5.5X - Data only - Hive built-in ORC 1960 / 1988 8.0 124.6 0.7X - Partition only - Native ORC MR 1039 / 1043 15.1 66.0 1.4X - Partition only - Native ORC Vectorized 48 / 53 326.6 3.1 30.1X - 
Partition only - Native ORC Vectorized with copy 48 / 53 328.4 3.0 30.2X - Partition only - Hive built-in ORC 1234 / 1242 12.7 78.4 1.2X - Both columns - Native ORC MR 1465 / 1475 10.7 93.1 1.0X - Both columns - Native ORC Vectorized 292 / 301 53.9 18.6 5.0X - Both column - Native ORC Vectorized with copy 348 / 354 45.1 22.2 4.2X - Both columns - Hive built-in ORC 2051 / 2060 7.7 130.4 0.7X - */ benchmark.run() } } } def repeatedStringScanBenchmark(values: Int): Unit = { - val benchmark = new Benchmark("Repeated String", values) + val benchmark = new Benchmark("Repeated String", values, output = output) withTempPath { dir => withTempTable("t1", "nativeOrcTable", "hiveOrcTable") { @@ -320,17 +250,6 @@ object OrcReadBenchmark extends SQLHelper { spark.sql("SELECT sum(length(c1)) FROM hiveOrcTable").collect() } - /* - Java HotSpot(TM) 64-Bit Server VM 1.8.0_60-b27 on Mac OS X 10.13.1 - Intel(R) Core(TM) i7-4960HQ CPU @ 2.60GHz - - Repeated String: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------------ - Native ORC MR 1271 / 1278 8.3 121.2 1.0X - Native ORC Vectorized 200 / 212 52.4 19.1 6.4X - Native ORC Vectorized with copy 342 / 347 30.7 32.6 3.7X - Hive built-in ORC 1874 / 2105 5.6 178.7 0.7X - */ benchmark.run() } } @@ -347,7 +266,8 @@ object OrcReadBenchmark extends SQLHelper { s"SELECT IF(RAND(1) < $fractionOfNulls, NULL, CAST(id as STRING)) AS c1, " + s"IF(RAND(2) < $fractionOfNulls, NULL, CAST(id as STRING)) AS c2 FROM t1")) - val benchmark = new Benchmark(s"String with Nulls Scan ($fractionOfNulls%)", values) + val benchmark = + new Benchmark(s"String with Nulls Scan ($fractionOfNulls%)", values, output = output) benchmark.addCase("Native ORC MR") { _ => withSQLConf(SQLConf.ORC_VECTORIZED_READER_ENABLED.key -> "false") { @@ -373,38 +293,13 @@ object OrcReadBenchmark extends SQLHelper { "WHERE c1 IS NOT NULL AND c2 IS NOT NULL").collect() } - /* - Java HotSpot(TM) 
64-Bit Server VM 1.8.0_60-b27 on Mac OS X 10.13.1 - Intel(R) Core(TM) i7-4960HQ CPU @ 2.60GHz - - String with Nulls Scan (0.0%): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------------ - Native ORC MR 2394 / 2886 4.4 228.3 1.0X - Native ORC Vectorized 699 / 729 15.0 66.7 3.4X - Native ORC Vectorized with copy 959 / 1025 10.9 91.5 2.5X - Hive built-in ORC 3899 / 3901 2.7 371.9 0.6X - - String with Nulls Scan (0.5%): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------------ - Native ORC MR 2234 / 2255 4.7 213.1 1.0X - Native ORC Vectorized 854 / 869 12.3 81.4 2.6X - Native ORC Vectorized with copy 1099 / 1128 9.5 104.8 2.0X - Hive built-in ORC 2767 / 2793 3.8 263.9 0.8X - - String with Nulls Scan (0.95%): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------------ - Native ORC MR 1166 / 1202 9.0 111.2 1.0X - Native ORC Vectorized 338 / 345 31.1 32.2 3.5X - Native ORC Vectorized with copy 418 / 428 25.1 39.9 2.8X - Hive built-in ORC 1730 / 1761 6.1 164.9 0.7X - */ benchmark.run() } } } def columnsBenchmark(values: Int, width: Int): Unit = { - val benchmark = new Benchmark(s"Single Column Scan from $width columns", values) + val benchmark = new Benchmark(s"Single Column Scan from $width columns", values, output = output) withTempPath { dir => withTempTable("t1", "nativeOrcTable", "hiveOrcTable") { @@ -436,49 +331,36 @@ object OrcReadBenchmark extends SQLHelper { spark.sql(s"SELECT sum(c$middle) FROM hiveOrcTable").collect() } - /* - Java HotSpot(TM) 64-Bit Server VM 1.8.0_60-b27 on Mac OS X 10.13.1 - Intel(R) Core(TM) i7-4960HQ CPU @ 2.60GHz - - Single Column Scan from 100 columns: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - 
------------------------------------------------------------------------------------------------ - Native ORC MR 1050 / 1053 1.0 1001.1 1.0X - Native ORC Vectorized 95 / 101 11.0 90.9 11.0X - Native ORC Vectorized with copy 95 / 102 11.0 90.9 11.0X - Hive built-in ORC 348 / 358 3.0 331.8 3.0X - - Single Column Scan from 200 columns: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------------ - Native ORC MR 2099 / 2108 0.5 2002.1 1.0X - Native ORC Vectorized 179 / 187 5.8 171.1 11.7X - Native ORC Vectorized with copy 176 / 188 6.0 167.6 11.9X - Hive built-in ORC 562 / 581 1.9 535.9 3.7X - - Single Column Scan from 300 columns: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------------ - Native ORC MR 3221 / 3246 0.3 3071.4 1.0X - Native ORC Vectorized 312 / 322 3.4 298.0 10.3X - Native ORC Vectorized with copy 306 / 320 3.4 291.6 10.5X - Hive built-in ORC 815 / 824 1.3 777.3 4.0X - */ benchmark.run() } } } - def main(args: Array[String]): Unit = { - Seq(ByteType, ShortType, IntegerType, LongType, FloatType, DoubleType).foreach { dataType => - numericScanBenchmark(1024 * 1024 * 15, dataType) + override def benchmark(): Unit = { + runBenchmark("SQL Single Numeric Column Scan") { + Seq(ByteType, ShortType, IntegerType, LongType, FloatType, DoubleType).foreach { dataType => + numericScanBenchmark(1024 * 1024 * 15, dataType) + } + } + runBenchmark("Int and String Scan") { + intStringScanBenchmark(1024 * 1024 * 10) + } + runBenchmark("Partitioned Table Scan") { + partitionTableScanBenchmark(1024 * 1024 * 15) + } + runBenchmark("Repeated String Scan") { + repeatedStringScanBenchmark(1024 * 1024 * 10) + } + runBenchmark("String with Nulls Scan") { + for (fractionOfNulls <- List(0.0, 0.50, 0.95)) { + stringWithNullsScanBenchmark(1024 * 1024 * 10, fractionOfNulls) + } } - intStringScanBenchmark(1024 * 
1024 * 10) - partitionTableScanBenchmark(1024 * 1024 * 15) - repeatedStringScanBenchmark(1024 * 1024 * 10) - for (fractionOfNulls <- List(0.0, 0.50, 0.95)) { - stringWithNullsScanBenchmark(1024 * 1024 * 10, fractionOfNulls) + runBenchmark("Single Column Scan From Wide Columns") { + columnsBenchmark(1024 * 1024 * 1, 100) + columnsBenchmark(1024 * 1024 * 1, 200) + columnsBenchmark(1024 * 1024 * 1, 300) } - columnsBenchmark(1024 * 1024 * 1, 100) - columnsBenchmark(1024 * 1024 * 1, 200) - columnsBenchmark(1024 * 1024 * 1, 300) } } // scalastyle:on line.size.limit