From be529880b8a6e9b7444ef49aba38f6af6ccf74e4 Mon Sep 17 00:00:00 2001
From: Orri Erling
Date: Mon, 19 Aug 2024 07:25:19 -0700
Subject: [PATCH] Wave Metrics (#10679)

Summary:
- Make a multithreaded memcpy for staging transfers in GPU table scan.
- Make variants of bit unpacking in GpuDecoder-inl.cuh. Make selective decoding templatized as opposed to runtime switching.
- Add pieces to GpuDecoderTest, like comparing calls via launchDecode (multi-function thread blocks) or decodeGlobal (single-function thread blocks).
- Add a metric for the driver thread waiting for the first continuable stream.
- Check approximate correctness of Wave runtimeStats.
- Refactor QueryBenchmarkBase.* out of TpchBenchmark; this holds the logic to do sweeps across parameter combinations.
- Add a persistent file format to the Wave mock format.
- Add a benchmark for scan, filter, filter expression, projection, and aggregation combinations with Wave and Dwrf.

Pull Request resolved: https://github.com/facebookincubator/velox/pull/10679

Reviewed By: Yuhta

Differential Revision: D60880466

Pulled By: oerling

fbshipit-source-id: 69e02986cffda5060e586610cd585fff68ff8b9a
---
 velox/benchmarks/CMakeLists.txt | 24 ++
 velox/benchmarks/QueryBenchmarkBase.cpp | 391 +++++++++++++++++
 velox/benchmarks/QueryBenchmarkBase.h | 127 ++++++
 velox/benchmarks/tpch/CMakeLists.txt | 1 +
 velox/benchmarks/tpch/TpchBenchmark.cpp | 404 +-----------------
 velox/experimental/wave/common/Cuda.cu | 11 +
 velox/experimental/wave/common/Cuda.h | 13 +
 velox/experimental/wave/common/GpuArena.cpp | 34 +-
 velox/experimental/wave/common/GpuArena.h | 44 +-
 .../wave/common/ResultStaging.cpp | 32 +-
 .../experimental/wave/common/ResultStaging.h | 20 +
 .../wave/common/tests/CudaTest.cpp | 2 +-
 velox/experimental/wave/dwio/ColumnReader.h | 25 +-
 velox/experimental/wave/dwio/FormatData.cpp | 117 ++++-
 velox/experimental/wave/dwio/FormatData.h | 82 +++-
 velox/experimental/wave/dwio/ReadStream.cpp | 105 +++--
 .../wave/dwio/decode/DecodeStep.h | 15 +-
 .../wave/dwio/decode/GpuDecoder-inl.cuh | 265 ++++++++++--
 .../wave/dwio/decode/GpuDecoder.cu | 62 ++-
 .../wave/dwio/decode/tests/GpuDecoderTest.cu | 131 ++++--
 velox/experimental/wave/exec/Aggregation.cpp | 4 -
 velox/experimental/wave/exec/Aggregation.h | 2 -
 velox/experimental/wave/exec/Instruction.h | 4 +-
 velox/experimental/wave/exec/Project.cpp | 24 +-
 velox/experimental/wave/exec/Project.h | 2 -
 velox/experimental/wave/exec/TableScan.cpp | 45 +-
 velox/experimental/wave/exec/TableScan.h | 13 +-
 velox/experimental/wave/exec/ToWave.cpp | 2 +-
 velox/experimental/wave/exec/Values.h | 6 -
 velox/experimental/wave/exec/Wave.cpp | 46 ++
 velox/experimental/wave/exec/Wave.h | 52 ++-
 velox/experimental/wave/exec/WaveDataSource.h | 2 -
 velox/experimental/wave/exec/WaveDriver.cpp | 53 ++-
 velox/experimental/wave/exec/WaveDriver.h | 5 -
 .../wave/exec/WaveHiveDataSource.cpp | 4 -
 .../wave/exec/WaveHiveDataSource.h | 2 -
 velox/experimental/wave/exec/WaveOperator.h | 9 +-
 .../experimental/wave/exec/WaveSplitReader.h | 2 -
 .../wave/exec/tests/CMakeLists.txt | 52 ++-
 .../wave/exec/tests/WaveBenchmark.cpp | 372 ++++++++++++++++
 .../wave/exec/tests/utils/FileFormat.cpp | 247 ++++++++++-
 .../wave/exec/tests/utils/FileFormat.h | 54 ++-
 .../exec/tests/utils/TestFormatReader.cpp | 19 +-
 .../wave/exec/tests/utils/TestFormatReader.h | 2 +
 .../exec/tests/utils/WaveTestSplitReader.cpp | 29 +-
 .../exec/tests/utils/WaveTestSplitReader.h | 11 +-
 46 files changed, 2317 insertions(+), 651 deletions(-)
 create mode 100644 velox/benchmarks/QueryBenchmarkBase.cpp
 create mode 100644
velox/benchmarks/QueryBenchmarkBase.h create mode 100644 velox/experimental/wave/exec/tests/WaveBenchmark.cpp diff --git a/velox/benchmarks/CMakeLists.txt b/velox/benchmarks/CMakeLists.txt index b904043b80a5..dda7226d6715 100644 --- a/velox/benchmarks/CMakeLists.txt +++ b/velox/benchmarks/CMakeLists.txt @@ -39,3 +39,27 @@ if(${VELOX_ENABLE_BENCHMARKS}) add_subdirectory(tpch) add_subdirectory(filesystem) endif() + +add_library(velox_query_benchmark QueryBenchmarkBase.cpp) +target_link_libraries( + velox_query_benchmark + velox_aggregates + velox_exec + velox_exec_test_lib + velox_dwio_common + velox_dwio_common_exception + velox_dwio_parquet_reader + velox_dwio_common_test_utils + velox_hive_connector + velox_exception + velox_memory + velox_process + velox_serialization + velox_encode + velox_type + velox_type_fbhive + velox_caching + velox_vector_test_lib + ${FOLLY_BENCHMARK} + Folly::folly + fmt::fmt) diff --git a/velox/benchmarks/QueryBenchmarkBase.cpp b/velox/benchmarks/QueryBenchmarkBase.cpp new file mode 100644 index 000000000000..f51acd5544ad --- /dev/null +++ b/velox/benchmarks/QueryBenchmarkBase.cpp @@ -0,0 +1,391 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "velox/benchmarks/QueryBenchmarkBase.h" + +DEFINE_string(data_format, "parquet", "Data format"); + +DEFINE_validator( + data_format, + &facebook::velox::QueryBenchmarkBase::validateDataFormat); + +DEFINE_bool( + include_custom_stats, + false, + "Include custom statistics along with execution statistics"); +DEFINE_bool(include_results, false, "Include results in the output"); +DEFINE_int32(num_drivers, 4, "Number of drivers"); + +DEFINE_int32(num_splits_per_file, 10, "Number of splits per file"); +DEFINE_int32( + cache_gb, + 0, + "GB of process memory for cache and query.. if " + "non-0, uses mmap to allocator and in-process data cache."); +DEFINE_int32(num_repeats, 1, "Number of times to run each query"); +DEFINE_int32(num_io_threads, 8, "Threads for speculative IO"); +DEFINE_string( + test_flags_file, + "", + "Path to a file containing gflafs and " + "values to try. Produces results for each flag combination " + "sorted on performance"); +DEFINE_bool( + full_sorted_stats, + true, + "Add full stats to the report on --test_flags_file"); + +DEFINE_string(ssd_path, "", "Directory for local SSD cache"); +DEFINE_int32(ssd_cache_gb, 0, "Size of local SSD cache in GB"); +DEFINE_int32( + ssd_checkpoint_interval_gb, + 8, + "Checkpoint every n " + "GB new data in cache"); +DEFINE_bool( + clear_ram_cache, + false, + "Clear RAM cache before each query." + "Flushes in process and OS file system cache (if root on Linux)"); +DEFINE_bool( + clear_ssd_cache, + false, + "Clears SSD cache before " + "each query"); + +DEFINE_bool( + warmup_after_clear, + false, + "Runs one warmup of the query before " + "measured run. 
Use to run warm after clearing caches."); + +DEFINE_int64( + max_coalesced_bytes, + 128 << 20, + "Maximum size of single coalesced IO"); + +DEFINE_int32( + max_coalesced_distance_bytes, + 512 << 10, + "Maximum distance in bytes in which coalesce will combine requests"); + +DEFINE_int32( + parquet_prefetch_rowgroups, + 1, + "Number of next row groups to " + "prefetch. 1 means prefetch the next row group before decoding " + "the current one"); + +DEFINE_int32(split_preload_per_driver, 2, "Prefetch split metadata"); + +using namespace facebook::velox::exec; +using namespace facebook::velox::exec::test; +using namespace facebook::velox::dwio::common; + +namespace facebook::velox { + +// static +bool QueryBenchmarkBase::validateDataFormat( + const char* flagname, + const std::string& value) { + if ((value.compare("parquet") == 0) || (value.compare("dwrf") == 0)) { + return true; + } + std::cout + << fmt::format( + "Invalid value for --{}: {}. Allowed values are [\"parquet\", \"dwrf\"]", + flagname, + value) + << std::endl; + return false; +} + +// static +void QueryBenchmarkBase::ensureTaskCompletion(exec::Task* task) { + // ASSERT_TRUE requires a function with return type void. + ASSERT_TRUE(exec::test::waitForTaskCompletion(task)); +} + +// static +void QueryBenchmarkBase::printResults( + const std::vector& results, + std::ostream& out) { + out << "Results:" << std::endl; + bool printType = true; + for (const auto& vector : results) { + // Print RowType only once. + if (printType) { + out << vector->type()->asRow().toString() << std::endl; + printType = false; + } + for (vector_size_t i = 0; i < vector->size(); ++i) { + out << vector->toString(i) << std::endl; + } + } +} + +void QueryBenchmarkBase::initialize() { + if (FLAGS_cache_gb) { + memory::MemoryManagerOptions options; + int64_t memoryBytes = FLAGS_cache_gb * (1LL << 30); + options.useMmapAllocator = true; + options.allocatorCapacity = memoryBytes; + options.useMmapArena = true; + options.mmapArenaCapacityRatio = 1; + memory::MemoryManager::testingSetInstance(options); + std::unique_ptr ssdCache; + if (FLAGS_ssd_cache_gb) { + constexpr int32_t kNumSsdShards = 16; + cacheExecutor_ = + std::make_unique(kNumSsdShards); + const cache::SsdCache::Config config( + FLAGS_ssd_path, + static_cast(FLAGS_ssd_cache_gb) << 30, + kNumSsdShards, + cacheExecutor_.get(), + static_cast(FLAGS_ssd_checkpoint_interval_gb) << 30); + ssdCache = std::make_unique(config); + } + + cache_ = cache::AsyncDataCache::create( + memory::memoryManager()->allocator(), std::move(ssdCache)); + cache::AsyncDataCache::setInstance(cache_.get()); + } else { + memory::MemoryManager::testingSetInstance({}); + } + functions::prestosql::registerAllScalarFunctions(); + aggregate::prestosql::registerAllAggregateFunctions(); + parse::registerTypeResolver(); + filesystems::registerLocalFileSystem(); + + ioExecutor_ = + std::make_unique(FLAGS_num_io_threads); + + // Add new values into the hive configuration... + auto configurationValues = std::unordered_map(); + configurationValues[connector::hive::HiveConfig::kMaxCoalescedBytes] = + std::to_string(FLAGS_max_coalesced_bytes); + configurationValues[connector::hive::HiveConfig::kMaxCoalescedDistanceBytes] = + std::to_string(FLAGS_max_coalesced_distance_bytes); + configurationValues[connector::hive::HiveConfig::kPrefetchRowGroups] = + std::to_string(FLAGS_parquet_prefetch_rowgroups); + auto properties = std::make_shared( + std::move(configurationValues)); + + // Create hive connector with config... 
+ auto hiveConnector = + connector::getConnectorFactory( + connector::hive::HiveConnectorFactory::kHiveConnectorName) + ->newConnector(kHiveConnectorId, properties, ioExecutor_.get()); + connector::registerConnector(hiveConnector); +} + +std::vector> +QueryBenchmarkBase::listSplits( + const std::string& path, + int32_t numSplitsPerFile, + const exec::test::TpchPlan& plan) { + std::vector> result; + auto temp = HiveConnectorTestBase::makeHiveConnectorSplits( + path, numSplitsPerFile, plan.dataFileFormat); + for (auto& i : temp) { + result.push_back(i); + } + return result; +} + +void QueryBenchmarkBase::shutdown() { + if (cache_) { + cache_->shutdown(); + } +} + +std::pair, std::vector> +QueryBenchmarkBase::run(const TpchPlan& tpchPlan) { + int32_t repeat = 0; + try { + for (;;) { + CursorParameters params; + params.maxDrivers = FLAGS_num_drivers; + params.planNode = tpchPlan.plan; + params.queryConfigs[core::QueryConfig::kMaxSplitPreloadPerDriver] = + std::to_string(FLAGS_split_preload_per_driver); + const int numSplitsPerFile = FLAGS_num_splits_per_file; + + bool noMoreSplits = false; + auto addSplits = [&](exec::Task* task) { + if (!noMoreSplits) { + for (const auto& entry : tpchPlan.dataFiles) { + for (const auto& path : entry.second) { + auto splits = listSplits(path, numSplitsPerFile, tpchPlan); + for (auto split : splits) { + task->addSplit(entry.first, exec::Split(std::move(split))); + } + } + task->noMoreSplits(entry.first); + } + } + noMoreSplits = true; + }; + auto result = readCursor(params, addSplits); + ensureTaskCompletion(result.first->task().get()); + if (++repeat >= FLAGS_num_repeats) { + return result; + } + } + } catch (const std::exception& e) { + LOG(ERROR) << "Query terminated with: " << e.what(); + return {nullptr, std::vector()}; + } +} + +void QueryBenchmarkBase::readCombinations() { + std::ifstream file(FLAGS_test_flags_file); + std::string line; + while (std::getline(file, line)) { + ParameterDim dim; + int32_t previous = 0; + for (auto i = 0; i < line.size(); ++i) { + if (line[i] == ':') { + dim.flag = line.substr(0, i); + previous = i + 1; + } else if (line[i] == ',') { + dim.values.push_back(line.substr(previous, i - previous)); + previous = i + 1; + } + } + if (previous < line.size()) { + dim.values.push_back(line.substr(previous, line.size() - previous)); + } + if (!dim.flag.empty() && !dim.values.empty()) { + parameters_.push_back(dim); + } + } +} + +void QueryBenchmarkBase::runCombinations(int32_t level) { + if (level == parameters_.size()) { + if (FLAGS_clear_ram_cache) { +#ifdef linux + // system("echo 3 >/proc/sys/vm/drop_caches"); + bool success = false; + auto fd = open("/proc//sys/vm/drop_caches", O_WRONLY); + if (fd > 0) { + success = write(fd, "3", 1) == 1; + close(fd); + } + if (!success) { + LOG(ERROR) << "Failed to clear OS disk cache: errno=" << errno; + } +#endif + + if (cache_) { + cache_->clear(); + } + } + if (FLAGS_clear_ssd_cache) { + if (cache_) { + auto ssdCache = cache_->ssdCache(); + if (ssdCache) { + ssdCache->clear(); + } + } + } + if (FLAGS_warmup_after_clear) { + std::stringstream result; + RunStats ignore; + runMain(result, ignore); + } + RunStats stats; + std::stringstream result; + uint64_t micros = 0; + { + struct rusage start; + getrusage(RUSAGE_SELF, &start); + MicrosecondTimer timer(µs); + runMain(result, stats); + struct rusage final; + getrusage(RUSAGE_SELF, &final); + auto tvNanos = [](struct timeval tv) { + return tv.tv_sec * 1000000000 + tv.tv_usec * 1000; + }; + stats.userNanos = tvNanos(final.ru_utime) - 
tvNanos(start.ru_utime); + stats.systemNanos = tvNanos(final.ru_stime) - tvNanos(start.ru_stime); + } + stats.micros = micros; + stats.output = result.str(); + for (auto i = 0; i < parameters_.size(); ++i) { + std::string name; + gflags::GetCommandLineOption(parameters_[i].flag.c_str(), &name); + stats.flags[parameters_[i].flag] = name; + } + runStats_.push_back(std::move(stats)); + } else { + auto& flag = parameters_[level].flag; + for (auto& value : parameters_[level].values) { + std::string result = + gflags::SetCommandLineOption(flag.c_str(), value.c_str()); + if (result.empty()) { + LOG(ERROR) << "Failed to set " << flag << "=" << value; + } + std::cout << result << std::endl; + runCombinations(level + 1); + } + } +} + +void QueryBenchmarkBase::runOne(std::ostream& out, RunStats& stats) { + std::stringstream result; + uint64_t micros = 0; + { + struct rusage start; + getrusage(RUSAGE_SELF, &start); + MicrosecondTimer timer(µs); + runMain(out, stats); + struct rusage final; + getrusage(RUSAGE_SELF, &final); + auto tvNanos = [](struct timeval tv) { + return tv.tv_sec * 1000000000 + tv.tv_usec * 1000; + }; + stats.userNanos = tvNanos(final.ru_utime) - tvNanos(start.ru_utime); + stats.systemNanos = tvNanos(final.ru_stime) - tvNanos(start.ru_stime); + } + stats.micros = micros; + stats.output = result.str(); + out << result.str(); +} + +void QueryBenchmarkBase::runAllCombinations() { + readCombinations(); + runCombinations(0); + std::sort( + runStats_.begin(), + runStats_.end(), + [](const RunStats& left, const RunStats& right) { + return left.micros < right.micros; + }); + for (auto& stats : runStats_) { + std::cout << stats.toString(false); + } + if (FLAGS_full_sorted_stats) { + std::cout << "Detail for stats:" << std::endl; + for (auto& stats : runStats_) { + std::cout << stats.toString(true); + } + } +} + +} // namespace facebook::velox diff --git a/velox/benchmarks/QueryBenchmarkBase.h b/velox/benchmarks/QueryBenchmarkBase.h new file mode 100644 index 000000000000..d3577fe53cf2 --- /dev/null +++ b/velox/benchmarks/QueryBenchmarkBase.h @@ -0,0 +1,127 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "velox/common/base/SuccinctPrinter.h" +#include "velox/common/file/FileSystems.h" +#include "velox/common/memory/MmapAllocator.h" +#include "velox/connectors/hive/HiveConfig.h" +#include "velox/connectors/hive/HiveConnector.h" +#include "velox/dwio/common/Options.h" +#include "velox/exec/PlanNodeStats.h" +#include "velox/exec/Split.h" +#include "velox/exec/tests/utils/HiveConnectorTestBase.h" +#include "velox/exec/tests/utils/TpchQueryBuilder.h" +#include "velox/functions/prestosql/aggregates/RegisterAggregateFunctions.h" +#include "velox/functions/prestosql/registration/RegistrationFunctions.h" +#include "velox/parse/TypeResolver.h" + +DECLARE_string(test_flags_file); +DECLARE_bool(include_results); +DECLARE_bool(include_custom_stats); +DECLARE_string(data_format); + +namespace facebook::velox { + +struct RunStats { + std::map flags; + int64_t micros{0}; + int64_t rawInputBytes{0}; + int64_t userNanos{0}; + int64_t systemNanos{0}; + std::string output; + + std::string toString(bool detail) { + std::stringstream out; + out << succinctNanos(micros * 1000) << " " + << succinctBytes(rawInputBytes / (micros / 1000000.0)) << "/s raw, " + << succinctNanos(userNanos) << " user " << succinctNanos(systemNanos) + << " system (" << (100 * (userNanos + systemNanos) / (micros * 1000)) + << "%), flags: "; + for (auto& pair : flags) { + out << pair.first << "=" << pair.second << " "; + } + out << std::endl << "======" << std::endl; + if (detail) { + out << std::endl << output << std::endl; + } + return out.str(); + } +}; + +struct ParameterDim { + std::string flag; + std::vector values; +}; + +class QueryBenchmarkBase { + public: + virtual ~QueryBenchmarkBase() = default; + virtual void initialize(); + void shutdown(); + std::pair, std::vector> + run(const exec::test::TpchPlan& tpchPlan); + + virtual std::vector> listSplits( + const std::string& path, + int32_t numSplitsPerFile, + const exec::test::TpchPlan& plan); + + static void ensureTaskCompletion(exec::Task* task); + + static bool validateDataFormat( + const char* flagname, + const std::string& value); + + static void printResults( + const std::vector& results, + std::ostream& out); + + void readCombinations(); + + /// Entry point invoked with different settings to run the benchmark. + virtual void runMain(std::ostream& out, RunStats& runStats) = 0; + + void runOne(std::ostream& outtt, RunStats& stats); + + void runCombinations(int32_t level); + + void runAllCombinations(); + + protected: + std::unique_ptr ioExecutor_; + std::unique_ptr cacheExecutor_; + std::shared_ptr allocator_; + std::shared_ptr cache_; + // Parameter combinations to try. Each element specifies a flag and possible + // values. All permutations are tried. 
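The parameter sweep described in the comment above reads --test_flags_file lines of the form flag:value1,value2 and runs every permutation of the listed values (readCombinations/runCombinations in QueryBenchmarkBase.cpp). A minimal standalone sketch of that parse-and-recurse shape, with illustrative helper names (parseCombinations, sweep) that are not part of the patch:

#include <functional>
#include <istream>
#include <string>
#include <utility>
#include <vector>

struct FlagDim {
  std::string flag;
  std::vector<std::string> values;
};

// Parses one "flag:v1,v2,v3" dimension per line, skipping malformed lines.
std::vector<FlagDim> parseCombinations(std::istream& in) {
  std::vector<FlagDim> dims;
  std::string line;
  while (std::getline(in, line)) {
    auto colon = line.find(':');
    if (colon == std::string::npos) {
      continue;
    }
    FlagDim dim;
    dim.flag = line.substr(0, colon);
    size_t start = colon + 1;
    while (start < line.size()) {
      auto comma = line.find(',', start);
      auto end = comma == std::string::npos ? line.size() : comma;
      if (end > start) {
        dim.values.push_back(line.substr(start, end - start));
      }
      start = end + 1;
    }
    if (!dim.flag.empty() && !dim.values.empty()) {
      dims.push_back(std::move(dim));
    }
  }
  return dims;
}

// Calls 'run' once per combination, one value chosen from each dimension,
// mirroring the recursion in runCombinations().
void sweep(
    const std::vector<FlagDim>& dims,
    size_t level,
    std::vector<std::pair<std::string, std::string>>& chosen,
    const std::function<
        void(const std::vector<std::pair<std::string, std::string>>&)>& run) {
  if (level == dims.size()) {
    run(chosen);
    return;
  }
  for (const auto& value : dims[level].values) {
    chosen.emplace_back(dims[level].flag, value);
    sweep(dims, level + 1, chosen, run);
    chosen.pop_back();
  }
}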
+ std::vector parameters_; + + std::vector runStats_; +}; +} // namespace facebook::velox diff --git a/velox/benchmarks/tpch/CMakeLists.txt b/velox/benchmarks/tpch/CMakeLists.txt index ba7491f7e603..3cbe9a0a6a6b 100644 --- a/velox/benchmarks/tpch/CMakeLists.txt +++ b/velox/benchmarks/tpch/CMakeLists.txt @@ -16,6 +16,7 @@ add_library(velox_tpch_benchmark_lib TpchBenchmark.cpp) target_link_libraries( velox_tpch_benchmark_lib + velox_query_benchmark velox_aggregates velox_exec velox_exec_test_lib diff --git a/velox/benchmarks/tpch/TpchBenchmark.cpp b/velox/benchmarks/tpch/TpchBenchmark.cpp index babb86a0e9ef..25d02224fc58 100644 --- a/velox/benchmarks/tpch/TpchBenchmark.cpp +++ b/velox/benchmarks/tpch/TpchBenchmark.cpp @@ -14,75 +14,13 @@ * limitations under the License. */ -#include -#include -#include - -#include -#include -#include -#include -#include -#include - -#include "velox/common/base/SuccinctPrinter.h" -#include "velox/common/file/FileSystems.h" -#include "velox/common/memory/MmapAllocator.h" -#include "velox/connectors/hive/HiveConfig.h" -#include "velox/connectors/hive/HiveConnector.h" -#include "velox/dwio/common/Options.h" -#include "velox/exec/PlanNodeStats.h" -#include "velox/exec/Split.h" -#include "velox/exec/tests/utils/HiveConnectorTestBase.h" -#include "velox/exec/tests/utils/TpchQueryBuilder.h" -#include "velox/functions/prestosql/aggregates/RegisterAggregateFunctions.h" -#include "velox/functions/prestosql/registration/RegistrationFunctions.h" -#include "velox/parse/TypeResolver.h" +#include "velox/benchmarks/QueryBenchmarkBase.h" using namespace facebook::velox; using namespace facebook::velox::exec; using namespace facebook::velox::exec::test; using namespace facebook::velox::dwio::common; -namespace { -static bool notEmpty(const char* /*flagName*/, const std::string& value) { - return !value.empty(); -} - -static bool validateDataFormat(const char* flagname, const std::string& value) { - if ((value.compare("parquet") == 0) || (value.compare("dwrf") == 0)) { - return true; - } - std::cout - << fmt::format( - "Invalid value for --{}: {}. Allowed values are [\"parquet\", \"dwrf\"]", - flagname, - value) - << std::endl; - return false; -} - -void ensureTaskCompletion(exec::Task* task) { - // ASSERT_TRUE requires a function with return type void. - ASSERT_TRUE(waitForTaskCompletion(task)); -} - -void printResults(const std::vector& results, std::ostream& out) { - out << "Results:" << std::endl; - bool printType = true; - for (const auto& vector : results) { - // Print RowType only once. - if (printType) { - out << vector->type()->asRow().toString() << std::endl; - printType = false; - } - for (vector_size_t i = 0; i < vector->size(); ++i) { - out << vector->toString(i) << std::endl; - } - } -} -} // namespace - DEFINE_string( data_path, "", @@ -100,6 +38,13 @@ DEFINE_string( "each table. If they are files, they contain a file system path for each " "data file, one per line. This allows running against cloud storage or " "HDFS"); +namespace { +static bool notEmpty(const char* /*flagName*/, const std::string& value) { + return !value.empty(); +} +} // namespace + +DEFINE_validator(data_path, ¬Empty); DEFINE_int32( run_query_verbose, @@ -112,216 +57,11 @@ DEFINE_int32( "include in IO meter query. 
The columns are sorted by name and the n% first " "are scanned"); -DEFINE_bool( - include_custom_stats, - false, - "Include custom statistics along with execution statistics"); -DEFINE_bool(include_results, false, "Include results in the output"); -DEFINE_int32(num_drivers, 4, "Number of drivers"); -DEFINE_string(data_format, "parquet", "Data format"); -DEFINE_int32(num_splits_per_file, 10, "Number of splits per file"); -DEFINE_int32( - cache_gb, - 0, - "GB of process memory for cache and query.. if " - "non-0, uses mmap to allocator and in-process data cache."); -DEFINE_int32(num_repeats, 1, "Number of times to run each query"); -DEFINE_int32(num_io_threads, 8, "Threads for speculative IO"); -DEFINE_string( - test_flags_file, - "", - "Path to a file containing gflafs and " - "values to try. Produces results for each flag combination " - "sorted on performance"); -DEFINE_bool( - full_sorted_stats, - true, - "Add full stats to the report on --test_flags_file"); - -DEFINE_string(ssd_path, "", "Directory for local SSD cache"); -DEFINE_int32(ssd_cache_gb, 0, "Size of local SSD cache in GB"); -DEFINE_int32( - ssd_checkpoint_interval_gb, - 8, - "Checkpoint every n " - "GB new data in cache"); -DEFINE_bool( - clear_ram_cache, - false, - "Clear RAM cache before each query." - "Flushes in process and OS file system cache (if root on Linux)"); -DEFINE_bool( - clear_ssd_cache, - false, - "Clears SSD cache before " - "each query"); - -DEFINE_bool( - warmup_after_clear, - false, - "Runs one warmup of the query before " - "measured run. Use to run warm after clearing caches."); - -DEFINE_validator(data_path, ¬Empty); -DEFINE_validator(data_format, &validateDataFormat); - -DEFINE_int64( - max_coalesced_bytes, - 128 << 20, - "Maximum size of single coalesced IO"); - -DEFINE_int32( - max_coalesced_distance_bytes, - 512 << 10, - "Maximum distance in bytes in which coalesce will combine requests"); - -DEFINE_int32( - parquet_prefetch_rowgroups, - 1, - "Number of next row groups to " - "prefetch. 
1 means prefetch the next row group before decoding " - "the current one"); - -DEFINE_int32(split_preload_per_driver, 2, "Prefetch split metadata"); - -struct RunStats { - std::map flags; - int64_t micros{0}; - int64_t rawInputBytes{0}; - int64_t userNanos{0}; - int64_t systemNanos{0}; - std::string output; - - std::string toString(bool detail) { - std::stringstream out; - out << succinctNanos(micros * 1000) << " " - << succinctBytes(rawInputBytes / (micros / 1000000.0)) << "/s raw, " - << succinctNanos(userNanos) << " user " << succinctNanos(systemNanos) - << " system (" << (100 * (userNanos + systemNanos) / (micros * 1000)) - << "%), flags: "; - for (auto& pair : flags) { - out << pair.first << "=" << pair.second << " "; - } - out << std::endl << "======" << std::endl; - if (detail) { - out << std::endl << output << std::endl; - } - return out.str(); - } -}; - -struct ParameterDim { - std::string flag; - std::vector values; -}; - std::shared_ptr queryBuilder; -class TpchBenchmark { +class TpchBenchmark : public QueryBenchmarkBase { public: - void initialize() { - if (FLAGS_cache_gb) { - memory::MemoryManagerOptions options; - int64_t memoryBytes = FLAGS_cache_gb * (1LL << 30); - options.useMmapAllocator = true; - options.allocatorCapacity = memoryBytes; - options.useMmapArena = true; - options.mmapArenaCapacityRatio = 1; - memory::MemoryManager::testingSetInstance(options); - std::unique_ptr ssdCache; - if (FLAGS_ssd_cache_gb) { - constexpr int32_t kNumSsdShards = 16; - cacheExecutor_ = - std::make_unique(kNumSsdShards); - const cache::SsdCache::Config config( - FLAGS_ssd_path, - static_cast(FLAGS_ssd_cache_gb) << 30, - kNumSsdShards, - cacheExecutor_.get(), - static_cast(FLAGS_ssd_checkpoint_interval_gb) << 30); - ssdCache = std::make_unique(config); - } - - cache_ = cache::AsyncDataCache::create( - memory::memoryManager()->allocator(), std::move(ssdCache)); - cache::AsyncDataCache::setInstance(cache_.get()); - } else { - memory::MemoryManager::testingSetInstance({}); - } - functions::prestosql::registerAllScalarFunctions(); - aggregate::prestosql::registerAllAggregateFunctions(); - parse::registerTypeResolver(); - filesystems::registerLocalFileSystem(); - - ioExecutor_ = - std::make_unique(FLAGS_num_io_threads); - - // Add new values into the hive configuration... - auto configurationValues = std::unordered_map(); - configurationValues[connector::hive::HiveConfig::kMaxCoalescedBytes] = - std::to_string(FLAGS_max_coalesced_bytes); - configurationValues - [connector::hive::HiveConfig::kMaxCoalescedDistanceBytes] = - std::to_string(FLAGS_max_coalesced_distance_bytes); - configurationValues[connector::hive::HiveConfig::kPrefetchRowGroups] = - std::to_string(FLAGS_parquet_prefetch_rowgroups); - auto properties = std::make_shared( - std::move(configurationValues)); - - // Create hive connector with config... 
- auto hiveConnector = - connector::getConnectorFactory( - connector::hive::HiveConnectorFactory::kHiveConnectorName) - ->newConnector(kHiveConnectorId, properties, ioExecutor_.get()); - connector::registerConnector(hiveConnector); - } - - void shutdown() { - cache_->shutdown(); - } - - std::pair, std::vector> run( - const TpchPlan& tpchPlan) { - int32_t repeat = 0; - try { - for (;;) { - CursorParameters params; - params.maxDrivers = FLAGS_num_drivers; - params.planNode = tpchPlan.plan; - params.queryConfigs[core::QueryConfig::kMaxSplitPreloadPerDriver] = - std::to_string(FLAGS_split_preload_per_driver); - const int numSplitsPerFile = FLAGS_num_splits_per_file; - - bool noMoreSplits = false; - auto addSplits = [&](exec::Task* task) { - if (!noMoreSplits) { - for (const auto& entry : tpchPlan.dataFiles) { - for (const auto& path : entry.second) { - auto const splits = - HiveConnectorTestBase::makeHiveConnectorSplits( - path, numSplitsPerFile, tpchPlan.dataFileFormat); - for (const auto& split : splits) { - task->addSplit(entry.first, exec::Split(split)); - } - } - task->noMoreSplits(entry.first); - } - } - noMoreSplits = true; - }; - auto result = readCursor(params, addSplits); - ensureTaskCompletion(result.first->task().get()); - if (++repeat >= FLAGS_num_repeats) { - return result; - } - } - } catch (const std::exception& e) { - LOG(ERROR) << "Query terminated with: " << e.what(); - return {nullptr, std::vector()}; - } - } - - void runMain(std::ostream& out, RunStats& runStats) { + void runMain(std::ostream& out, RunStats& runStats) override { if (FLAGS_run_query_verbose == -1 && FLAGS_io_meter_column_pct == 0) { folly::runBenchmarks(); } else { @@ -363,130 +103,6 @@ class TpchBenchmark { << std::endl; } } - - void readCombinations() { - std::ifstream file(FLAGS_test_flags_file); - std::string line; - while (std::getline(file, line)) { - ParameterDim dim; - int32_t previous = 0; - for (auto i = 0; i < line.size(); ++i) { - if (line[i] == ':') { - dim.flag = line.substr(0, i); - previous = i + 1; - } else if (line[i] == ',') { - dim.values.push_back(line.substr(previous, i - previous)); - previous = i + 1; - } - } - if (previous < line.size()) { - dim.values.push_back(line.substr(previous, line.size() - previous)); - } - - parameters_.push_back(dim); - } - } - - void runCombinations(int32_t level) { - if (level == parameters_.size()) { - if (FLAGS_clear_ram_cache) { -#ifdef linux - // system("echo 3 >/proc/sys/vm/drop_caches"); - bool success = false; - auto fd = open("/proc//sys/vm/drop_caches", O_WRONLY); - if (fd > 0) { - success = write(fd, "3", 1) == 1; - close(fd); - } - if (!success) { - LOG(ERROR) << "Failed to clear OS disk cache: errno=" << errno; - } -#endif - - if (cache_) { - cache_->clear(); - } - } - if (FLAGS_clear_ssd_cache) { - if (cache_) { - auto ssdCache = cache_->ssdCache(); - if (ssdCache) { - ssdCache->clear(); - } - } - } - if (FLAGS_warmup_after_clear) { - std::stringstream result; - RunStats ignore; - runMain(result, ignore); - } - RunStats stats; - std::stringstream result; - uint64_t micros = 0; - { - struct rusage start; - getrusage(RUSAGE_SELF, &start); - MicrosecondTimer timer(µs); - runMain(result, stats); - struct rusage final; - getrusage(RUSAGE_SELF, &final); - auto tvNanos = [](struct timeval tv) { - return tv.tv_sec * 1000000000 + tv.tv_usec * 1000; - }; - stats.userNanos = tvNanos(final.ru_utime) - tvNanos(start.ru_utime); - stats.systemNanos = tvNanos(final.ru_stime) - tvNanos(start.ru_stime); - } - stats.micros = micros; - stats.output = 
result.str(); - for (auto i = 0; i < parameters_.size(); ++i) { - std::string name; - gflags::GetCommandLineOption(parameters_[i].flag.c_str(), &name); - stats.flags[parameters_[i].flag] = name; - } - runStats_.push_back(std::move(stats)); - } else { - auto& flag = parameters_[level].flag; - for (auto& value : parameters_[level].values) { - std::string result = - gflags::SetCommandLineOption(flag.c_str(), value.c_str()); - if (result.empty()) { - LOG(ERROR) << "Failed to set " << flag << "=" << value; - } - std::cout << result << std::endl; - runCombinations(level + 1); - } - } - } - - void runAllCombinations() { - readCombinations(); - runCombinations(0); - std::sort( - runStats_.begin(), - runStats_.end(), - [](const RunStats& left, const RunStats& right) { - return left.micros < right.micros; - }); - for (auto& stats : runStats_) { - std::cout << stats.toString(false); - } - if (FLAGS_full_sorted_stats) { - std::cout << "Detail for stats:" << std::endl; - for (auto& stats : runStats_) { - std::cout << stats.toString(true); - } - } - } - - std::unique_ptr ioExecutor_; - std::unique_ptr cacheExecutor_; - std::shared_ptr allocator_; - std::shared_ptr cache_; - // Parameter combinations to try. Each element specifies a flag and possible - // values. All permutations are tried. - std::vector parameters_; - - std::vector runStats_; }; TpchBenchmark benchmark; diff --git a/velox/experimental/wave/common/Cuda.cu b/velox/experimental/wave/common/Cuda.cu index f1bf5c51a594..557b7a15754c 100644 --- a/velox/experimental/wave/common/Cuda.cu +++ b/velox/experimental/wave/common/Cuda.cu @@ -68,6 +68,9 @@ class CudaDeviceAllocator : public GpuAllocator { void free(void* ptr, size_t /*size*/) override { cudaFree(ptr); } + bool isDevice() const override { + return true; + } }; class CudaHostAllocator : public GpuAllocator { @@ -81,6 +84,10 @@ class CudaHostAllocator : public GpuAllocator { void free(void* ptr, size_t /*size*/) override { cudaFreeHost(ptr); }; + + bool isHost() const override { + return true; + } }; } // namespace @@ -127,6 +134,10 @@ void Stream::prefetch(Device* device, void* ptr, size_t size) { ptr, size, device ? device->deviceId : cudaCpuDeviceId, stream_->stream)); } +void Stream::memset(void* ptr, int32_t value, size_t size) { + CUDA_CHECK(cudaMemsetAsync(ptr, value, size, stream_->stream)); +} + void Stream::hostToDeviceAsync( void* deviceAddress, const void* hostAddress, diff --git a/velox/experimental/wave/common/Cuda.h b/velox/experimental/wave/common/Cuda.h index 5855645e5182..355c2d5f4b2a 100644 --- a/velox/experimental/wave/common/Cuda.h +++ b/velox/experimental/wave/common/Cuda.h @@ -48,6 +48,9 @@ class Stream { /// Waits until the stream is completed. void wait(); + /// Enqueus a memset on 'device'. + void memset(void* address, int32_t value, size_t size); + /// Enqueus a prefetch. Prefetches to host if 'device' is nullptr, otherwise /// to 'device'. void prefetch(Device* device, void* address, size_t size); @@ -128,6 +131,16 @@ class GpuAllocator { /// to allocate(). A Memory must be freed to the same allocator it came from. virtual void free(void* ptr, size_t bytes) = 0; + /// True if allocates host pinned memory. + virtual bool isHost() const { + return false; + } + + /// True if allocates device side memory. 
+ virtual bool isDevice() const { + return false; + } + class Deleter; template diff --git a/velox/experimental/wave/common/GpuArena.cpp b/velox/experimental/wave/common/GpuArena.cpp index 5605b6428617..71bc8bdd7a4e 100644 --- a/velox/experimental/wave/common/GpuArena.cpp +++ b/velox/experimental/wave/common/GpuArena.cpp @@ -32,7 +32,11 @@ uint64_t GpuSlab::roundBytes(uint64_t bytes) { if (FLAGS_wave_buffer_end_guard) { bytes += sizeof(int64_t); } - return bits::nextPowerOfTwo(std::max(16, bytes)); + if (bytes > 32 << 10) { + return bits::roundUp(bytes, 32 << 10); + } else { + return bits::nextPowerOfTwo(std::max(16, bytes)); + } } GpuSlab::GpuSlab(void* ptr, size_t capacityBytes, GpuAllocator* allocator) @@ -285,12 +289,18 @@ GpuArena::Buffers::Buffers() { } } -GpuArena::GpuArena(uint64_t singleArenaCapacity, GpuAllocator* allocator) - : singleArenaCapacity_(singleArenaCapacity), allocator_(allocator) { +GpuArena::GpuArena( + uint64_t singleArenaCapacity, + GpuAllocator* allocator, + uint64_t standbyCapacity) + : singleArenaCapacity_(singleArenaCapacity), + standbyCapacity_(standbyCapacity), + allocator_(allocator) { auto arena = std::make_shared( allocator_->allocate(singleArenaCapacity), singleArenaCapacity, allocator_); + capacity_ += arena->byteSize(); arenas_.emplace(reinterpret_cast(arena->address()), arena); currentArena_ = arena; } @@ -319,6 +329,8 @@ WaveBufferPtr GpuArena::getBuffer(void* ptr, size_t capacity, size_t size) { WaveBufferPtr GpuArena::allocateBytes(uint64_t bytes) { auto roundedBytes = GpuSlab::roundBytes(bytes); std::lock_guard l(mutex_); + totalAllocated_ += roundedBytes; + ++numAllocations_; auto* result = currentArena_->allocate(roundedBytes); if (result != nullptr) { return getBuffer(result, bytes, roundedBytes); @@ -342,6 +354,10 @@ WaveBufferPtr GpuArena::allocateBytes(uint64_t bytes) { auto newArena = std::make_shared( allocator_->allocate(arenaBytes), arenaBytes, allocator_); arenas_.emplace(reinterpret_cast(newArena->address()), newArena); + capacity_ += newArena->byteSize(); + if (capacity_ > maxCapacity_) { + maxCapacity_ = capacity_; + } currentArena_ = newArena; result = currentArena_->allocate(bytes); if (result) { @@ -365,7 +381,9 @@ void GpuArena::free(Buffer* buffer) { iter->first + singleArenaCapacity_, addressU64 + buffer->size_); } iter->second->free(buffer->ptr_, buffer->size_); - if (iter->second->empty() && iter->second != currentArena_) { + if (iter->second->empty() && iter->second != currentArena_ && + capacity_ - iter->second->byteSize() >= standbyCapacity_) { + capacity_ -= iter->second->byteSize(); arenas_.erase(iter); } buffer->ptr_ = firstFreeBuffer_; @@ -405,4 +423,12 @@ ArenaStatus GpuArena::checkBuffers() { return status; } +std::string GpuArena::toString() const { + std::stringstream out; + for (auto& pair : arenas_) { + out << pair.second->toString() << std::endl; + } + return out.str(); +} + } // namespace facebook::velox::wave diff --git a/velox/experimental/wave/common/GpuArena.h b/velox/experimental/wave/common/GpuArena.h index 1b03ce84a7ac..9a7256947fcd 100644 --- a/velox/experimental/wave/common/GpuArena.h +++ b/velox/experimental/wave/common/GpuArena.h @@ -130,7 +130,10 @@ struct ArenaStatus { /// fragmentation happens. 
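The change to GpuSlab::roundBytes above stops padding large allocations to the next power of two: anything over 32 KB is rounded to a 32 KB multiple instead. A small sketch of the same rule (the optional end-guard padding controlled by FLAGS_wave_buffer_end_guard is omitted, and the function name here is illustrative):

#include <cstdint>

// Mirrors the rounding policy in GpuSlab::roundBytes(): small requests grow to
// the next power of two (minimum 16 bytes); requests above 32 KB grow only to
// the next 32 KB boundary, which bounds padding waste on large buffers.
uint64_t roundAllocationBytes(uint64_t bytes) {
  constexpr uint64_t kChunk = 32 << 10; // 32 KB
  if (bytes > kChunk) {
    return (bytes + kChunk - 1) / kChunk * kChunk;
  }
  uint64_t size = 16;
  while (size < bytes) {
    size <<= 1;
  }
  return size;
}

// For example: roundAllocationBytes(10) == 16, roundAllocationBytes(5000) ==
// 8192, and roundAllocationBytes(70000) == 98304 (three 32 KB chunks), where
// the old power-of-two policy would have returned 131072.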
class GpuArena { public: - GpuArena(uint64_t singleArenaCapacity, GpuAllocator* allocator); + GpuArena( + uint64_t singleArenaCapacity, + GpuAllocator* allocator, + uint64_t standbyCapacity = 0); WaveBufferPtr allocateBytes(uint64_t bytes); @@ -152,10 +155,37 @@ class GpuArena { return arenas_; } + uint64_t maxCapacity() const { + return maxCapacity_; + } + + uint64_t totalAllocated() const { + return totalAllocated_; + } + + uint64_t numAllocations() const { + return numAllocations_; + } + + uint64_t retainedSize() const { + return capacity_; + } + + void setSizes(uint64_t arenaSize, uint64_t standbyCapacity) { + singleArenaCapacity_ = arenaSize; + standbyCapacity_ = standbyCapacity; + } + + bool isDevice() const { + return allocator_->isDevice(); + } + /// Checks magic numbers and returns the sum of allocated capacity. Actual /// sizes are padded to larger. ArenaStatus checkBuffers(); + std::string toString() const; + private: // A preallocated array of Buffer handles for memory of 'this'. struct Buffers { @@ -178,8 +208,14 @@ class GpuArena { // Head of Buffer free list. Buffer* firstFreeBuffer_{nullptr}; + // Total capacity in all arenas. + uint64_t capacity_{0}; + // Capacity in bytes for a single GpuSlab managed by this. - const uint64_t singleArenaCapacity_; + uint64_t singleArenaCapacity_; + + // Lower bound of capacity to keep around even if usage is below this. + uint64_t standbyCapacity_{0}; GpuAllocator* const allocator_; @@ -189,6 +225,10 @@ class GpuArena { // All allocations should come from this GpuSlab. When it is no longer able // to handle allocations it will be updated to a newly created GpuSlab. std::shared_ptr currentArena_; + + uint64_t numAllocations_{0}; + uint64_t totalAllocated_{0}; + uint64_t maxCapacity_{0}; }; } // namespace facebook::velox::wave diff --git a/velox/experimental/wave/common/ResultStaging.cpp b/velox/experimental/wave/common/ResultStaging.cpp index 77209c8f2c24..285e57fab435 100644 --- a/velox/experimental/wave/common/ResultStaging.cpp +++ b/velox/experimental/wave/common/ResultStaging.cpp @@ -32,7 +32,7 @@ void ResultStaging::registerPointerInternal( bool clear) { VELOX_CHECK_LT(id, offsets_.size()); VELOX_CHECK_NOT_NULL(pointer); -#ifndef NDEBUG +#if 0 // ndef NDEBUG for (auto& pair : patch_) { VELOX_CHECK( pair.second != pointer, "Must not register the same pointer twice"); @@ -81,4 +81,34 @@ void ResultStaging::setReturnBuffer(GpuArena& arena, ResultBuffer& result) { fill_ = 0; } +GpuArena& getSmallTransferArena() { + static std::unique_ptr arena = std::make_unique( + 10UL << 20, getHostAllocator(nullptr), 20UL << 20); + return *arena; +} + +std::pair LaunchParams::setup(size_t size) { + if (!arena.isDevice()) { + // Unified memory. + device = arena.allocate(size); + return {device->as(), device->as()}; + } else { + // Separate host and device side buffers. 
+ device = arena.allocate(size); + host = getSmallTransferArena().allocate(size); + return {host->as(), device->as()}; + } +} + +void LaunchParams::transfer(Stream& stream) { + if (device) { + if (arena.isDevice()) { + stream.hostToDeviceAsync( + device->as(), host->as(), host->size()); + } else { + stream.prefetch(getDevice(), device->as(), device->size()); + } + } +} + } // namespace facebook::velox::wave diff --git a/velox/experimental/wave/common/ResultStaging.h b/velox/experimental/wave/common/ResultStaging.h index 76388d2ed799..35077071aae1 100644 --- a/velox/experimental/wave/common/ResultStaging.h +++ b/velox/experimental/wave/common/ResultStaging.h @@ -90,4 +90,24 @@ class ResultStaging { std::vector buffers_; }; +/// Manages parameters of kernel launch. Provides pinned host buffer and a +/// device side copy destination. +struct LaunchParams { + LaunchParams(GpuArena& arena) : arena(arena) {} + + /// Returns a host, device address pair of 'size' bytes. + std::pair setup(size_t size); + + void transfer(Stream& stream); + + GpuArena& arena; + WaveBufferPtr host; + WaveBufferPtr device; +}; + +// Arena for decode and program launch param blocks. Lifetime of allocations is +// a wave in WaveStream, typical size under 1MB, e.g. 1K blocks with 1K of +// params. +GpuArena& getSmallTransferArena(); + } // namespace facebook::velox::wave diff --git a/velox/experimental/wave/common/tests/CudaTest.cpp b/velox/experimental/wave/common/tests/CudaTest.cpp index c9cafab29511..9f924d962f8e 100644 --- a/velox/experimental/wave/common/tests/CudaTest.cpp +++ b/velox/experimental/wave/common/tests/CudaTest.cpp @@ -1464,7 +1464,7 @@ TEST_F(CudaTest, reduceMatrix) { return; } - std::vector modes = {/*"unified", "device",*/ "devicecoalesced"}; + std::vector modes = {"unified", "device", "devicecoalesced"}; std::vector batchMBValues = {30, 100}; std::vector numThreadsValues = {1, 2, 3}; std::vector workPerThreadValues = {2, 4}; diff --git a/velox/experimental/wave/dwio/ColumnReader.h b/velox/experimental/wave/dwio/ColumnReader.h index 7848409cc3b2..76ccee0b8772 100644 --- a/velox/experimental/wave/dwio/ColumnReader.h +++ b/velox/experimental/wave/dwio/ColumnReader.h @@ -16,6 +16,7 @@ #pragma once +#include "velox/common/io/IoStatistics.h" #include "velox/dwio/common/TypeWithId.h" #include "velox/experimental/wave/dwio/FormatData.h" #include "velox/experimental/wave/exec/Wave.h" @@ -113,6 +114,8 @@ class ReadStream : public Executable { ReadStream( StructColumnReader* columnReader, WaveStream& waveStream, + velox::io::IoStatistics* ioStats, + FileInfo& fileInfo, const OperandSet* firstColumns = nullptr); void setNullable(const AbstractOperand& op, bool nullable) { @@ -151,8 +154,9 @@ class ReadStream : public Executable { // needed than is good per TB. void makeGrid(Stream* stream); - // Sets consistent blockStatus and temp across 'programs_' - void setBlockStatusAndTemp(); + // Sets consistent blockStatus and temp across 'programs_'. If 'stream is + // given, prefetches BlockStatus on device on 'stream'. + void setBlockStatusAndTemp(Stream* stream = nullptr); /// Makes column dependencies. void makeOps(); @@ -160,10 +164,20 @@ class ReadStream : public Executable { void prepareRead(); void makeControl(); + // Enqueues waits on stream for the transfers the 'currentStaging_' + // depends on. Fist batch on stripe enqueues transfer, next batches + // are enqueued on a different stream but still depend on the first + // transfers completing on the first stream. 
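The comment above describes cross-stream ordering: the first batch enqueues the staging transfer on one stream, and later batches on another stream must not run until that transfer has completed. A hedged sketch of the pattern with the Wave Stream/Event API, assuming Event is declared alongside Stream in wave/common/Cuda.h and is default-constructible as the patch's std::make_unique call suggests; the function and parameter names are illustrative:

#include "velox/experimental/wave/common/Cuda.h"

using namespace facebook::velox::wave;

void orderAcrossStreams(
    Stream& transferStream,
    Stream& decodeStream,
    void* deviceAddress,
    const void* hostAddress,
    size_t bytes) {
  // Enqueue the copy on the first stream and mark its completion.
  transferStream.hostToDeviceAsync(deviceAddress, hostAddress, bytes);
  Event transferDone;
  transferDone.record(transferStream);

  // The wait does not block the host; it only prevents work enqueued on
  // decodeStream after this point from starting before the transfer arrives.
  transferDone.wait(decodeStream);
  // ... enqueue decode kernels on decodeStream here ...
}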
+ void syncStaging(Stream& stream); + + io::IoStatistics* const ioStats_; + // Makes steps to align values from non-last filters to the selection of the // last filter. void makeCompact(bool isSerial); + void prefetchStatus(Stream* stream); + // True if non-filter columns will be done sequentially in the // filters kernel. This will never loose if there is an always read // single column. This may loose if it were better to take the @@ -198,7 +212,7 @@ class ReadStream : public Executable { // Intermediate data to stay on device, e.g. selected rows. ResultStaging deviceStaging_; // Owning references to decode programs. Must be live for duration of kernels. - std::vector commands_; + std::vector commands_; // Reusable control block for launching decode kernels. DecodePrograms programs_; // If no filters, the starting RowSet directly initializes the BlockStatus'es @@ -207,10 +221,15 @@ class ReadStream : public Executable { bool filtersDone_{false}; // Sequence number of kernel launch. int32_t nthWave_{0}; + + // Leading bytes in control_->deviceData used for BlockStatus. Cleared on + // device. The bytes after that are set on host and then prefetched to device. + int32_t statusBytes_{0}; LaunchControl* control_{nullptr}; // Set to true when after first griddize() and akeOps(). bool inited_{false}; + FileInfo fileInfo_; }; } // namespace facebook::velox::wave diff --git a/velox/experimental/wave/dwio/FormatData.cpp b/velox/experimental/wave/dwio/FormatData.cpp index aa588ed5b2d3..d252f4dc085c 100644 --- a/velox/experimental/wave/dwio/FormatData.cpp +++ b/velox/experimental/wave/dwio/FormatData.cpp @@ -16,9 +16,15 @@ #include "velox/experimental/wave/dwio/FormatData.h" #include "velox/experimental/wave/dwio/ColumnReader.h" +#include "velox/experimental/wave/exec/Wave.h" DECLARE_int32(wave_reader_rows_per_tb); +DEFINE_int32( + staging_bytes_per_thread, + 300000, + "Make a parallel memcpy shard per this many bytes"); + namespace facebook::velox::wave { BufferId SplitStaging::add(Staging& staging) { @@ -38,7 +44,7 @@ void SplitStaging::registerPointerInternal( if (clear) { *ptr = nullptr; } -#ifndef NDEBUG +#if 0 // ndef NDEBUG for (auto& pair : patch_) { VELOX_CHECK(pair.second != ptr, "Must not register the same pointer twice"); } @@ -46,29 +52,116 @@ void SplitStaging::registerPointerInternal( patch_.push_back(std::make_pair(id, ptr)); } +void SplitStaging::copyColumns( + int32_t begin, + int32_t end, + char* destination, + bool release) { + for (auto i = begin; i < end; ++i) { + memcpy(destination, staging_[i].hostData, staging_[i].size); + destination += staging_[i].size; + } + if (release) { + sem_.release(); + } +} + +// Shared pool of 1-2GB of pinned host memory for staging. May +// transiently exceed 2GB but settles to 2GB after the peak. +GpuArena& getTransferArena() { + static std::unique_ptr arena = std::make_unique( + 1UL << 30, getHostAllocator(nullptr), 2UL << 30); + return *arena; +} + // Starts the transfers registered with add(). 'stream' is set to a stream // where operations depending on the transfer may be queued. 
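transfer() below shards the concatenated column copies so that each helper thread covers roughly FLAGS_staging_bytes_per_thread bytes, signals a semaphore once per shard, and the calling thread copies the tail and waits for all shards before enqueueing the host-to-device copy. A simplified standalone sketch of that copy plan, using std::thread and std::counting_semaphore in place of WaveStream::copyExecutor() and the Velox Semaphore:

#include <cstddef>
#include <cstring>
#include <semaphore>
#include <thread>
#include <vector>

struct Chunk {
  const char* data;
  size_t size;
};

void parallelStagingCopy(
    const std::vector<Chunk>& chunks,
    char* destination,
    size_t bytesPerShard) {
  std::counting_semaphore<> done(0);
  std::vector<std::thread> helpers;
  size_t shardBytes = 0;
  size_t shardStart = 0;
  char* shardDest = destination;
  for (size_t i = 0; i < chunks.size(); ++i) {
    shardBytes += chunks[i].size;
    if (shardBytes >= bytesPerShard && i + 1 < chunks.size()) {
      // Hand chunks [shardStart, i] to a helper; it signals 'done' when copied.
      helpers.emplace_back([&, shardStart, end = i + 1, shardDest]() {
        char* out = shardDest;
        for (size_t j = shardStart; j < end; ++j) {
          std::memcpy(out, chunks[j].data, chunks[j].size);
          out += chunks[j].size;
        }
        done.release();
      });
      shardDest += shardBytes;
      shardBytes = 0;
      shardStart = i + 1;
    }
  }
  // The caller copies the last shard itself, then waits for every helper
  // before the host-to-device transfer may be enqueued.
  char* out = shardDest;
  for (size_t j = shardStart; j < chunks.size(); ++j) {
    std::memcpy(out, chunks[j].data, chunks[j].size);
    out += chunks[j].size;
  }
  for (size_t i = 0; i < helpers.size(); ++i) {
    done.acquire();
  }
  for (auto& t : helpers) {
    t.join();
  }
}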
void SplitStaging::transfer( WaveStream& waveStream, Stream& stream, - bool recordEvent) { + bool recordEvent, + std::function asyncTail) { if (fill_ == 0 || deviceBuffer_ != nullptr) { + if (recordEvent && !event_) { + event_ = std::make_unique(); + event_->record(stream); + } + if (asyncTail) { + asyncTail(waveStream, stream); + } return; } - deviceBuffer_ = waveStream.arena().allocate(fill_); - auto universal = deviceBuffer_->as(); - for (auto i = 0; i < offsets_.size(); ++i) { - memcpy(universal + offsets_[i], staging_[i].hostData, staging_[i].size); + WaveTime startTime = WaveTime::now(); + deviceBuffer_ = waveStream.deviceArena().allocate(fill_); + hostBuffer_ = getTransferArena().allocate(fill_); + auto transferBuffer = hostBuffer_->as(); + int firstToCopy = 0; + int32_t numCopies = staging_.size(); + int64_t copySize = 0; + auto targetCopySize = FLAGS_staging_bytes_per_thread; + int32_t numThreads = 0; + if (fill_ > 2000000) { + for (auto i = 0; i < staging_.size(); ++i) { + auto columnSize = staging_[i].size; + copySize += columnSize; + if (copySize >= targetCopySize && i < staging_.size() - 1) { + ++numThreads; + WaveStream::copyExecutor()->add( + [i, firstToCopy, transferBuffer, this]() { + copyColumns(firstToCopy, i + 1, transferBuffer, true); + }); + transferBuffer += copySize; + copySize = 0; + firstToCopy = i + 1; + } + } } - stream.prefetch( - getDevice(), deviceBuffer_->as(), deviceBuffer_->size()); + auto deviceData = deviceBuffer_->as(); for (auto& pair : patch_) { *reinterpret_cast(pair.second) += - reinterpret_cast(universal) + offsets_[pair.first]; + reinterpret_cast(deviceData) + offsets_[pair.first]; } - if (recordEvent) { - event_ = std::make_unique(); - event_->record(stream); + if (asyncTail) { + WaveStream::syncExecutor()->add([firstToCopy, + numThreads, + transferBuffer, + asyncTail, + &waveStream, + &stream, + recordEvent, + startTime, + this]() { + copyColumns(firstToCopy, staging_.size(), transferBuffer, false); + for (auto i = 0; i < numThreads; ++i) { + sem_.acquire(); + } + stream.hostToDeviceAsync( + deviceBuffer_->as(), hostBuffer_->as(), fill_); + waveStream.stats().stagingTime += WaveTime::now() - startTime; + if (recordEvent) { + event_ = std::make_unique(); + event_->record(stream); + } + fill_ = 0; + patch_.clear(); + offsets_.clear(); + asyncTail(waveStream, stream); + }); + } else { + copyColumns(firstToCopy, staging_.size(), transferBuffer, false); + for (auto i = 0; i < numThreads; ++i) { + sem_.acquire(); + } + stream.hostToDeviceAsync( + deviceBuffer_->as(), hostBuffer_->as(), fill_); + waveStream.stats().stagingTime += WaveTime::now() - startTime; + if (recordEvent) { + event_ = std::make_unique(); + event_->record(stream); + } + fill_ = 0; + patch_.clear(); + offsets_.clear(); } } diff --git a/velox/experimental/wave/dwio/FormatData.h b/velox/experimental/wave/dwio/FormatData.h index a30a479cea97..eb4e345cc410 100644 --- a/velox/experimental/wave/dwio/FormatData.h +++ b/velox/experimental/wave/dwio/FormatData.h @@ -16,24 +16,32 @@ #pragma once +#include +#include "velox/common/base/Semaphore.h" +#include "velox/common/caching/AsyncDataCache.h" +#include "velox/common/file/Region.h" #include "velox/dwio/common/ScanSpec.h" #include "velox/dwio/common/Statistics.h" #include "velox/dwio/common/TypeWithId.h" #include "velox/experimental/wave/dwio/decode/DecodeStep.h" +#include "velox/experimental/wave/exec/OperandSet.h" #include "velox/experimental/wave/vector/WaveVector.h" -#include - namespace facebook::velox::wave { class ReadStream; class 
WaveStream; +/// Use generic bit set to track depemdemce pon staging. +using StagingSet = OperandSet; + // Describes how a column is staged on GPU, for example, copy from host RAM, // direct read, already on device etc. struct Staging { - Staging(const void* hostData, int32_t size) - : hostData(hostData), size(size) {} + Staging(const void* hostData, int32_t size, const common::Region& region) + : hostData(hostData), + size(hostData ? size : region.length), + fileOffset(region.offset) {} // Pointer to data in pageable host memory, if applicable. const void* hostData{nullptr}; @@ -41,7 +49,15 @@ struct Staging { // Size in bytes. size_t size; - // Add members here to describe locations in storage for GPU direct transfer. + /// If 'hostData' is nullptr, this is the start offset for 'size' + /// bytes in 'fileInfo_' of containing SplitStaging. + int64_t fileOffset{0}; +}; + +struct FileInfo { + ReadFile* file{nullptr}; + StringIdLease* fileId{nullptr}; + cache::AsyncDataCache* cache{nullptr}; }; /// Describes how columns to be read together are staged on device. This is @@ -49,6 +65,11 @@ struct Staging { /// data already on device. class SplitStaging { public: + /// id indicating no dependence on other staging. + static constexpr int32_t kNoStaging = ~0; + + SplitStaging(FileInfo& fileInfo, int32_t id) : id_(id), fileInfo_(fileInfo) {} + /// Adds a transfer described by 'staging'. Returns an id of the /// device side buffer. The id will be mapped to an actual buffer /// when the transfers are queud. At this time, pointers that @@ -72,20 +93,44 @@ class SplitStaging { int64_t bytesToDevice() const { return fill_; } - // Starts the transfers registered with add( on 'stream'). Does nothing after - // first call or if no pointers are registered. If 'recordEvent' is true, - // records an event that is completed after the transfer arrives. Use event() - // to access the event. - void - transfer(WaveStream& waveStream, Stream& stream, bool recordEvent = false); + // Starts the transfers registered with add( on 'stream'). Does + // nothing after first call or if no pointers are registered. If + // 'recordEvent' is true, records an event that is completed after + // the transfer arrives. Use event() to access the event. If + // 'asyncTail' is non-nullptr, it is called after the data transfer + // is enqueued. The call is on an executor and transfer() returns as + // soon as the work is enqueud. If asyncTail is not given, + // transfer() returns after the transfer is enqueued on + // 'stream'. event() is not set until the transfer is enqueued. + void transfer( + WaveStream& waveStream, + Stream& stream, + bool recordEvent = false, + std::function asyncTail = nullptr); Event* event() const { return event_.get(); } + int32_t id() const { + return id_; + } + + void addDependency(int32_t id) { + dependsOn_.add(id); + } + + const StagingSet& dependsOn() { + return dependsOn_; + } + private: void registerPointerInternal(BufferId id, void** ptr, bool clear); + void copyColumns(int32_t begin, int32_t end, char* destination, bool release); + + const int32_t id_; + // Pinned host memory for transfer to device. May be nullptr if using unified // memory. WaveBufferPtr hostBuffer_; @@ -107,6 +152,16 @@ class SplitStaging { // Optional event recorded after end of transfer. Use to sync dependent // kernels on other streams. std::unique_ptr event_; + + // Synchronizes arrival of multithreaded memcpy + Semaphore sem_{0}; + + FileInfo& fileInfo_; + + // Set of other SplitStaging ids of which 'this' is a + // duplicate. 
These need to be complete efore dependents of 'this' + // can run. A staging can both have transfers and dependencies. + StagingSet dependsOn_; }; using RowSet = folly::Range; @@ -265,6 +320,11 @@ class FormatData { WaveTypeKind columnKind, int32_t blockIdx); + // Staging id for nulls. + int32_t nullsStagingId_{SplitStaging::kNoStaging}; + // id of last splitStaging 'this' depends on. + int32_t lastStagingId_{SplitStaging::kNoStaging}; + // First unaccessed row number relative to start of 'this'. int32_t currentRow_{0}; diff --git a/velox/experimental/wave/dwio/ReadStream.cpp b/velox/experimental/wave/dwio/ReadStream.cpp index e961f1929d6a..e979fbe90d88 100644 --- a/velox/experimental/wave/dwio/ReadStream.cpp +++ b/velox/experimental/wave/dwio/ReadStream.cpp @@ -14,6 +14,7 @@ * limitations under the License. */ +#include "velox/common/process/TraceContext.h" #include "velox/experimental/wave/dwio/ColumnReader.h" #include "velox/experimental/wave/dwio/StructColumnReader.h" @@ -49,18 +50,22 @@ void allOperands( ReadStream::ReadStream( StructColumnReader* columnReader, WaveStream& _waveStream, + io::IoStatistics* ioStats, + FileInfo& fileInfo, const OperandSet* firstColumns) - : Executable() { + : Executable(), ioStats_(ioStats), fileInfo_(fileInfo) { waveStream = &_waveStream; allOperands(columnReader, outputOperands, &abstractOperands_); output.resize(outputOperands.size()); reader_ = columnReader; - reader_->splitStaging().push_back(std::make_unique()); + reader_->splitStaging().push_back( + std::make_unique(fileInfo_, 0)); currentStaging_ = reader_->splitStaging().back().get(); } -void ReadStream::setBlockStatusAndTemp() { +void ReadStream::setBlockStatusAndTemp(Stream* stream) { auto* status = control_->deviceData->as(); + prefetchStatus(stream); auto maxRowsPerThread = FLAGS_wave_reader_rows_per_tb / kBlockSize; auto tempSize = programs_.programs[0][0]->tempSize(); auto size = programs_.programs.size() * tempSize; @@ -75,6 +80,15 @@ void ReadStream::setBlockStatusAndTemp() { } } +void ReadStream::prefetchStatus(Stream* stream) { + if (!stream) { + return; + } + char* data = control_->deviceData->as(); + auto size = control_->deviceData->size(); + stream->prefetch(getDevice(), data, size); +} + void ReadStream::makeGrid(Stream* stream) { programs_.clear(); auto total = reader_->formatData()->totalRows(); @@ -99,21 +113,29 @@ void ReadStream::makeGrid(Stream* stream) { } if (!programs_.programs.empty()) { WaveStats& stats = waveStream->stats(); - stats.bytesToDevice += currentStaging_->bytesToDevice(); + auto bytes = currentStaging_->bytesToDevice(); + ioStats_->incRawBytesRead(bytes); + stats.bytesToDevice += bytes; ++stats.numKernels; stats.numPrograms += programs_.programs.size(); stats.numThreads += programs_.programs.size() * std::min(rows_.size(), kBlockSize); setBlockStatusAndTemp(); deviceStaging_.makeDeviceBuffer(waveStream->arena()); - currentStaging_->transfer(*waveStream, *stream); + currentStaging_->transfer(*waveStream, *stream, true); + LaunchParams params(waveStream->deviceArena()); WaveBufferPtr extra; - launchDecode(programs_, &waveStream->arena(), extra, stream); + { + PrintTime l("grid"); + launchDecode(programs_, params, stream); + } reader_->recordGriddize(*stream); - if (extra) { - commands_.push_back(std::move(extra)); + if (params.device) { + commands_.push_back(std::move(params)); } - reader_->splitStaging().push_back(std::make_unique()); + auto nth = reader_->splitStaging().size(); + reader_->splitStaging().push_back( + std::make_unique(fileInfo_, nth)); 
currentStaging_ = reader_->splitStaging().back().get(); } } @@ -290,6 +312,19 @@ bool ReadStream::makePrograms(bool& needSync) { return allDone; } +void ReadStream::syncStaging(Stream& stream) { + auto& set = currentStaging_->dependsOn(); + if (set.empty()) { + return; + } + set.forEach([&](int32_t id) { + auto dep = reader_->splitStaging()[id].get(); + auto event = dep->event(); + VELOX_CHECK(event); + event->wait(stream); + }); +} + void ReadStream::launch( std::unique_ptr readStream, int32_t row, @@ -321,16 +356,18 @@ void ReadStream::launch( readStream->prepareRead(); for (;;) { bool done = readStream->makePrograms(needSync); - stats.bytesToDevice += readStream->currentStaging_->bytesToDevice(); + auto bytes = readStream->currentStaging_->bytesToDevice(); + readStream->ioStats_->incRawBytesRead(bytes); + stats.bytesToDevice += bytes; ++stats.numKernels; stats.numPrograms += readStream->programs_.programs.size(); stats.numThreads += readStream->programs_.programs.size() * std::min(readStream->rows_.size(), kBlockSize); - readStream->currentStaging_->transfer(*waveStream, *stream); + readStream->currentStaging_->transfer(*waveStream, *stream, true); if (done) { break; } - readStream->setBlockStatusAndTemp(); + readStream->setBlockStatusAndTemp(stream); readStream->deviceStaging_.makeDeviceBuffer(waveStream->arena()); WaveBufferPtr extra; if (!griddizedHere && firstLaunch) { @@ -342,13 +379,18 @@ void ReadStream::launch( } } firstLaunch = false; - launchDecode( - readStream->programs(), &waveStream->arena(), extra, stream); - if (extra) { - readStream->commands_.push_back(std::move(extra)); + readStream->syncStaging(*stream); + LaunchParams params(waveStream->deviceArena()); + { + PrintTime l("decode"); + launchDecode(readStream->programs(), params, stream); } + if (params.device) { + readStream->commands_.push_back(std::move(params)); + } + auto nth = readStream->reader_->splitStaging().size(); readStream->reader_->splitStaging().push_back( - std::make_unique()); + std::make_unique(readStream->fileInfo_, nth)); readStream->currentStaging_ = readStream->reader_->splitStaging().back().get(); if (needSync) { @@ -360,16 +402,17 @@ void ReadStream::launch( } } - readStream->setBlockStatusAndTemp(); + readStream->setBlockStatusAndTemp(stream); readStream->deviceStaging_.makeDeviceBuffer(waveStream->arena()); - WaveBufferPtr extra; - launchDecode( - readStream->programs(), - &readStream->waveStream->arena(), - extra, - stream); - if (extra) { - readStream->commands_.push_back(std::move(extra)); + LaunchParams params(readStream->waveStream->deviceArena()); + readStream->syncStaging(*stream); + { + // stream->wait(); + PrintTime l("decode-f"); + launchDecode(readStream->programs(), params, stream); + } + if (params.device) { + readStream->commands_.push_back(std::move(params)); } readStream->waveStream->setState(WaveStream::State::kParallel); readStream->waveStream->markLaunch(*stream, *readStream); @@ -382,12 +425,12 @@ void ReadStream::makeControl() { waveStream->setNumRows(numRows); WaveStream::ExeLaunchInfo info; waveStream->exeLaunchInfo(*this, numBlocks_, info); - auto statusBytes = bits::roundUp(sizeof(BlockStatus) * numBlocks_, 8); - auto deviceBytes = statusBytes + info.totalBytes; + statusBytes_ = bits::roundUp(sizeof(BlockStatus) * numBlocks_, 8); + auto deviceBytes = statusBytes_ + info.totalBytes; auto control = std::make_unique(0, numRows); control->deviceData = waveStream->arena().allocate(deviceBytes); - // Zero initialization is expected, for example for operands and arrays in - 
// Operand::indices. + // The operand section must be cleared before written on host. The statuses + // are cleared on device. memset(control->deviceData->as(), 0, deviceBytes); control->params.status = control->deviceData->as(); for (auto& reader : reader_->children()) { @@ -399,7 +442,7 @@ void ReadStream::makeControl() { } } operands = waveStream->fillOperands( - *this, control->deviceData->as() + statusBytes, info)[0]; + *this, control->deviceData->as() + statusBytes_, info)[0]; control_ = control.get(); waveStream->setLaunchControl(0, 0, std::move(control)); } diff --git a/velox/experimental/wave/dwio/decode/DecodeStep.h b/velox/experimental/wave/dwio/decode/DecodeStep.h index 590fd887529c..bae3f605c722 100644 --- a/velox/experimental/wave/dwio/decode/DecodeStep.h +++ b/velox/experimental/wave/dwio/decode/DecodeStep.h @@ -118,12 +118,6 @@ struct alignas(16) GpuDecode { NullMode nullMode; - // Ordinal number of TB in TBs working on the same column. Each TB does a - // multiple of TB width rows. The TBs for different ranges of rows are - // launched in the same grid but are independent. The ordinal for non-first - // TBs gets the base index for values. - uint8_t nthBlock{0}; - /// Number of chunks (e.g. Parquet pages). If > 1, different rows row ranges /// have different encodings. The first chunk's encoding is in 'data'. The /// next chunk's encoding is in the next GpuDecode's 'data'. Each chunk has @@ -131,6 +125,12 @@ struct alignas(16) GpuDecode { /// given by the first GpuDecode. uint8_t numChunks{1}; + // Ordinal number of TB in TBs working on the same column. Each TB does a + // multiple of TB width rows. The TBs for different ranges of rows are + // launched in the same grid but are independent. The ordinal for non-first + // TBs gets the base index for values. + uint16_t nthBlock{0}; + uint16_t numRowsPerThread{1}; /// Number of rows to decode. 
if kFilterHits, the previous GpuDecode gives @@ -400,8 +400,7 @@ struct DecodePrograms { void launchDecode( const DecodePrograms& programs, - GpuArena* arena, - WaveBufferPtr& extra, + LaunchParams& params, Stream* stream); } // namespace facebook::velox::wave diff --git a/velox/experimental/wave/dwio/decode/GpuDecoder-inl.cuh b/velox/experimental/wave/dwio/decode/GpuDecoder-inl.cuh index c830e026256e..5e30df09b78f 100644 --- a/velox/experimental/wave/dwio/decode/GpuDecoder-inl.cuh +++ b/velox/experimental/wave/dwio/decode/GpuDecoder-inl.cuh @@ -22,7 +22,6 @@ namespace facebook::velox::wave { namespace detail { - template __device__ void decodeTrivial(GpuDecode::Trivial& op) { auto address = reinterpret_cast(op.input); @@ -90,6 +89,7 @@ __device__ void storeResult( } } +#if 1 template __device__ void decodeDictionaryOnBitpack(GpuDecode::DictionaryOnBitpack& op) { int32_t i = op.begin + threadIdx.x; @@ -109,19 +109,111 @@ __device__ void decodeDictionaryOnBitpack(GpuDecode::DictionaryOnBitpack& op) { int32_t wordIndex = bitIndex >> 6; int32_t bit = bitIndex & 63; uint64_t word = words[wordIndex]; - uint64_t low = word >> bit; - if (bitWidth + bit <= 64) { - uint64_t index = low & mask; - storeResult(i, index, baseline, dict, scatter, result); - } else { + uint64_t index = word >> bit; + if (bitWidth + bit > 64) { + uint64_t nextWord = words[wordIndex + 1]; + index |= nextWord << (64 - bit); + } + index &= mask; + storeResult(i, index, baseline, dict, scatter, result); + } +} +#elif 0 + +template +__device__ void decodeDictionaryOnBitpack(GpuDecode::DictionaryOnBitpack& op) { + int32_t i = op.begin + threadIdx.x; + __shared__ int32_t end; + __shared__ uint64_t address; + __shared__ int32_t alignOffset; + __shared__ uint64_t* words; + __shared__ const T* dict; + __shared__ const int32_t* scatter; + __shared__ int64_t baseline; + __shared__ uint8_t bitWidth; + __shared__ uint64_t mask; + __shared__ T* result; + + if (threadIdx.x == 0) { + end = op.end; + address = reinterpret_cast(op.indices); + alignOffset = (address & 7) * 8; + address &= ~7UL; + words = reinterpret_cast(address); + dict = reinterpret_cast(op.alphabet); + scatter = op.scatter; + baseline = op.baseline; + bitWidth = op.bitWidth; + mask = (1LU << bitWidth) - 1; + result = reinterpret_cast(op.result); + } + __syncthreads(); + for (; i < end; i += blockDim.x) { + int32_t bitIndex = i * bitWidth + alignOffset; + int32_t wordIndex = bitIndex >> 6; + int32_t bit = bitIndex & 63; + uint64_t word = words[wordIndex]; + uint64_t index = word >> bit; + if (bitWidth + bit > 64) { + uint64_t nextWord = words[wordIndex + 1]; + index |= nextWord << (64 - bit); + } + index &= mask; + storeResult(i, index, baseline, dict, scatter, result); + } +} +#elif 0 + +template +__device__ void decodeDictionaryOnBitpack(GpuDecode::DictionaryOnBitpack& op) { + int32_t i = op.begin + threadIdx.x; + auto address = reinterpret_cast(op.indices); + int32_t alignOffset = (address & 7) * 8; + address &= ~7UL; + auto words = reinterpret_cast(address); + const T* dict = reinterpret_cast(op.alphabet); + auto scatter = op.scatter; + auto baseline = op.baseline; + auto bitWidth = op.bitWidth; + uint64_t mask = (1LU << bitWidth) - 1; + for (; i < op.end; i += blockDim.x) { + int32_t bitIndex = i * op.bitWidth + alignOffset; + int32_t wordIndex = bitIndex >> 6; + int32_t bit = bitIndex & 63; + uint64_t word = words[wordIndex]; + uint64_t index = word >> bit; + if (bitWidth + bit > 64) { uint64_t nextWord = words[wordIndex + 1]; - int32_t bitsFromNext = bitWidth - (64 
- bit); - uint64_t index = - low | ((nextWord & ((1LU << bitsFromNext) - 1)) << (64 - bit)); - storeResult(i, index, baseline, dict, scatter, result); + index |= nextWord << (64 - bit); } + index &= mask; + storeResult( + i, index, baseline, dict, scatter, reinterpret_cast(op.result)); } } +#elif 0 +template +__device__ void decodeDictionaryOnBitpack(GpuDecode::DictionaryOnBitpack& op) { + int32_t i = op.begin + threadIdx.x; + auto scatter = op.scatter; + auto baseline = op.baseline; + for (; i < op.end; i += blockDim.x) { + uint64_t index; + if (op.bitWidth <= 32) { + index = loadBits32(op.indices, i * op.bitWidth, op.bitWidth); + } else { + index = loadBits64(op.indices, i * op.bitWidth, op.bitWidth); + } + storeResult( + i, + index, + baseline, + reinterpret_cast(op.alphabet), + scatter, + reinterpret_cast(op.result)); + } +} +#endif template __device__ void decodeDictionaryOnBitpack(GpuDecode::DictionaryOnBitpack& op) { @@ -509,9 +601,9 @@ __device__ void makeScatterIndices(GpuDecode::MakeScatterIndices& op) { } } -template +template inline __device__ T randomAccessDecode(const GpuDecode* op, int32_t idx) { - switch (op->encoding) { + switch (kEncoding) { case DecodeStep::kDictionaryOnBitpack: { const auto& d = op->data.dictionaryOnBitpack; auto width = d.bitWidth; @@ -536,7 +628,12 @@ __device__ bool testFilter(const GpuDecode* op, T data) { } } -template +template < + typename T, + int32_t kBlockSize, + bool kHasFilter, + bool kHasResult, + bool kHasNulls> __device__ void makeResult( const GpuDecode* op, T data, @@ -546,7 +643,7 @@ __device__ void makeResult( uint8_t nullFlag, int32_t* temp) { auto base = nthLoop * kBlockSize; - if (op->filterKind != WaveFilterKind::kAlwaysTrue) { + if (kHasFilter) { int32_t resultIdx = exclusiveSum( static_cast(filterPass), nullptr, @@ -560,9 +657,9 @@ __device__ void makeResult( if (filterPass) { resultIdx += base; op->resultRows[resultIdx] = row; - if (op->result) { + if (kHasResult) { reinterpret_cast(op->result)[resultIdx] = data; - if (op->resultNulls) { + if (kHasNulls) { op->resultNulls[resultIdx] = nullFlag; } } @@ -575,31 +672,69 @@ __device__ void makeResult( } auto resultIdx = base + threadIdx.x; reinterpret_cast(op->result)[resultIdx] = data; - if (op->resultNulls) { + if (kHasNulls && op->resultNulls) { op->resultNulls[resultIdx] = nullFlag; } } } -template +template < + typename T, + int32_t kBlockSize, + DecodeStep kEncoding, + WaveFilterKind kFilterKind, + bool kHasResult> __device__ void decodeSelective(GpuDecode* op) { - int32_t dataIdx; - int32_t nthLoop = 0; switch (op->nullMode) { case NullMode::kDenseNonNull: { +#if 0 + // No-filter case with everything inlined. 
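+ // (Kept under #if 0 as a hand-inlined experiment: it unpacks each index and
+ // writes index + baseline straight to 'result', bypassing testFilter() and
+ // makeResult().)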
+auto base = op->baseRow; + auto i = threadIdx.x; + auto& d = op->data.dictionaryOnBitpack; + auto end = op->maxRow - op->baseRow; + auto address = reinterpret_cast(d.indices); + int32_t alignOffset = (address & 7) * 8; + address &= ~7UL; + auto words = reinterpret_cast(address); + auto baseline = d.baseline; + auto bitWidth = d.bitWidth; + uint64_t mask = (1LU << bitWidth) - 1; + auto* result = reinterpret_cast(op->result); + for (; i < end; i += blockDim.x) { + int32_t bitIndex = (i + base) * bitWidth + alignOffset; + int32_t wordIndex = bitIndex >> 6; + if (threadIdx.x < 3) { + asm volatile("prefetch.global.L1 [%0];" ::"l"(&words[wordIndex + 48 + threadIdx.x * 4])); + } + int32_t bit = bitIndex & 63; + uint64_t word = words[wordIndex]; + uint64_t index = word >> bit; + if (bitWidth + bit > 64) { + uint64_t nextWord = words[wordIndex + 1]; + index |= nextWord << (64 - bit); + } + index &= mask; + result[i] = index + baseline; + } +#else do { int32_t row = threadIdx.x + op->baseRow + nthLoop * kBlockSize; bool filterPass = false; T data{}; if (row < op->maxRow) { - data = randomAccessDecode(op, row); - filterPass = - testFilter(op, data); + data = randomAccessDecode(op, row); + filterPass = testFilter(op, data); } - makeResult( - op, data, row, filterPass, nthLoop, kNotNull, op->temp); + makeResult< + T, + kBlockSize, + kFilterKind != WaveFilterKind::kAlwaysTrue, + kHasResult, + false>(op, data, row, filterPass, nthLoop, kNotNull, op->temp); } while (++nthLoop < op->numRowsPerThread); +#endif break; } case NullMode::kSparseNonNull: @@ -610,12 +745,15 @@ __device__ void decodeSelective(GpuDecode* op) { int32_t row = 0; if (threadIdx.x < numRows) { row = op->rows[threadIdx.x + nthLoop * kBlockSize]; - data = randomAccessDecode(op, row); - filterPass = - testFilter(op, data); + data = randomAccessDecode(op, row); + filterPass = testFilter(op, data); } - makeResult( - op, data, row, filterPass, nthLoop, kNotNull, op->temp); + makeResult< + T, + kBlockSize, + kFilterKind != WaveFilterKind::kAlwaysTrue, + kHasResult, + false>(op, data, row, filterPass, nthLoop, kNotNull, op->temp); } while (++nthLoop < op->numRowsPerThread); break; case NullMode::kDenseNullable: { @@ -644,13 +782,17 @@ __device__ void decodeSelective(GpuDecode* op) { filterPass = false; } } else { - data = randomAccessDecode(op, dataIdx); - filterPass = - testFilter(op, data); + data = randomAccessDecode(op, dataIdx); + filterPass = testFilter(op, data); } } } - makeResult( + makeResult< + T, + kBlockSize, + kFilterKind != WaveFilterKind::kAlwaysTrue, + kHasResult, + true>( op, data, base + threadIdx.x, @@ -677,7 +819,7 @@ __device__ void decodeSelective(GpuDecode* op) { } else { bool filterPass = true; T data{}; - dataIdx = + int32_t dataIdx = nonNullIndex256Sparse(op->nulls, op->rows + base, numRows, state); filterPass = threadIdx.x < numRows; if (filterPass) { @@ -686,12 +828,17 @@ __device__ void decodeSelective(GpuDecode* op) { filterPass = false; } } else { - data = randomAccessDecode(op, dataIdx); + data = randomAccessDecode(op, dataIdx); filterPass = testFilter(op, data); } } - makeResult( + makeResult< + T, + kBlockSize, + kFilterKind != WaveFilterKind::kAlwaysTrue, + kHasResult, + true>( op, data, op->rows[base + threadIdx.x], @@ -859,14 +1006,47 @@ __device__ void countBits(GpuDecode& step) { } } +template +__device__ void selectiveFilter(GpuDecode* op) { + switch (op->filterKind) { + case WaveFilterKind::kAlwaysTrue: + decodeSelective< + T, + kBlockSize, + kEncoding, + WaveFilterKind::kAlwaysTrue, + true>(op); + 
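selectiveFilter() and selectiveSwitch() turn the runtime filterKind and encoding fields into template parameters of decodeSelective(), so each instantiation compiles its row loop without those per-row branches. A stand-alone sketch of the same runtime-to-template dispatch; Filter and countPassing are invented for the illustration:

#include <cstdint>

enum class Filter { kAlwaysTrue, kRange };

// The hot loop sees the filter kind as a compile-time constant, so the
// kAlwaysTrue instantiation carries no per-row comparison at all.
template <Filter kFilter>
int32_t countPassing(const int64_t* values, int32_t n, int64_t lo, int64_t hi) {
  int32_t hits = 0;
  for (int32_t i = 0; i < n; ++i) {
    if (kFilter == Filter::kAlwaysTrue || (values[i] >= lo && values[i] <= hi)) {
      ++hits;
    }
  }
  return hits;
}

// The runtime value is translated into a template instantiation once, outside
// the loop, mirroring what selectiveFilter()/selectiveSwitch() do per op.
int32_t countPassing(
    Filter filter,
    const int64_t* values,
    int32_t n,
    int64_t lo,
    int64_t hi) {
  switch (filter) {
    case Filter::kAlwaysTrue:
      return countPassing<Filter::kAlwaysTrue>(values, n, lo, hi);
    case Filter::kRange:
      return countPassing<Filter::kRange>(values, n, lo, hi);
  }
  return 0;
}

The cost is one instantiation per supported combination, which is why the switch enumerates only the handled filter kinds and encodings and asserts on anything else.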
break; + case WaveFilterKind::kBigintRange: + decodeSelective< + T, + kBlockSize, + kEncoding, + WaveFilterKind::kBigintRange, + true>(op); + break; + default: + assert(false); + } +} + +template +__device__ void selectiveSwitch(GpuDecode* op) { + if (op->encoding == DecodeStep::kDictionaryOnBitpack) { + selectiveFilter(op); + } else { + assert(false); + } +} + template __device__ void decodeSwitch(GpuDecode& op) { switch (op.step) { case DecodeStep::kSelective32: - detail::decodeSelective(&op); + selectiveSwitch(&op); break; case DecodeStep::kSelective64: - detail::decodeSelective(&op); + selectiveSwitch(&op); break; case DecodeStep::kCompact64: detail::compactValues(&op); @@ -903,14 +1083,17 @@ __device__ void decodeSwitch(GpuDecode& op) { break; default: if (threadIdx.x == 0) { - printf("ERROR: Unsupported DecodeStep (with shared memory)\n"); + printf( + "ERROR: Unsupported DecodeStep %d\n", + static_cast(op.step)); } } } template __global__ void decodeGlobal(GpuDecode* plan) { - decodeSwitch(plan[blockIdx.x]); + detail::decodeSwitch(plan[blockIdx.x]); + __syncthreads(); } template diff --git a/velox/experimental/wave/dwio/decode/GpuDecoder.cu b/velox/experimental/wave/dwio/decode/GpuDecoder.cu index b955d7d574e5..7d54fcf59cd1 100644 --- a/velox/experimental/wave/dwio/decode/GpuDecoder.cu +++ b/velox/experimental/wave/dwio/decode/GpuDecoder.cu @@ -54,13 +54,18 @@ struct alignas(16) GpuDecodeParams { void __global__ __launch_bounds__(1024) decodeKernel(GpuDecodeParams inlineParams) { - // asm volatile (".maxnregs 40;"); - GpuDecodeParams* params = - inlineParams.external ? inlineParams.external : &inlineParams; - int32_t programStart = blockIdx.x == 0 ? 0 : params->ends[blockIdx.x - 1]; - int32_t programEnd = params->ends[blockIdx.x]; - GpuDecode* ops = - reinterpret_cast(¶ms->ends[0] + roundUp(gridDim.x, 4)); + __shared__ GpuDecodeParams* params; + __shared__ int32_t programStart; + __shared__ int32_t programEnd; + __shared__ GpuDecode* ops; + if (threadIdx.x == 0) { + params = inlineParams.external ? inlineParams.external : &inlineParams; + programStart = blockIdx.x == 0 ? 
0 : params->ends[blockIdx.x - 1]; + programEnd = params->ends[blockIdx.x]; + ops = + reinterpret_cast(¶ms->ends[0] + roundUp(gridDim.x, 4)); + } + __syncthreads(); for (auto i = programStart; i < programEnd; ++i) { detail::decodeSwitch(ops[i]); } @@ -69,14 +74,19 @@ void __global__ __launch_bounds__(1024) void launchDecode( const DecodePrograms& programs, - GpuArena* arena, - WaveBufferPtr& extra, + LaunchParams& launchParams, Stream* stream) { int32_t numBlocks = programs.programs.size(); int32_t numOps = 0; + bool allSingle = true; int32_t shared = 0; for (auto& program : programs.programs) { - numOps += program.size(); + int numSteps = program.size(); + ; + if (numSteps != 1) { + allSingle = false; + } + numOps += numSteps; for (auto& step : program) { shared = std::max( shared, detail::sharedMemorySizeForDecode(step->step)); @@ -87,16 +97,20 @@ void launchDecode( } GpuDecodeParams localParams; GpuDecodeParams* params = &localParams; - if (numOps > GpuDecodeParams::kMaxInlineOps) { - extra = arena->allocate( + char* host = nullptr; + char* device = nullptr; + if (numOps > GpuDecodeParams::kMaxInlineOps || allSingle) { + auto pair = launchParams.setup( (numOps + 1) * (sizeof(GpuDecode) + sizeof(int32_t)) + 16); - uintptr_t aligned = - roundUp(reinterpret_cast(extra->as()), 16); + host = pair.first; + device = pair.second; + uintptr_t aligned = roundUp(reinterpret_cast(host), 16); params = reinterpret_cast(aligned); } int32_t end = programs.programs[0].size(); GpuDecode* decodes = reinterpret_cast(¶ms->ends[0] + roundUp(numBlocks, 4)); + uintptr_t decodeOffset = reinterpret_cast(decodes) - host; int32_t fill = 0; for (auto i = 0; i < programs.programs.size(); ++i) { params->ends[i] = @@ -105,9 +119,18 @@ void launchDecode( decodes[fill++] = *op; } } - if (extra) { - localParams.external = params; - stream->prefetch(getDevice(), extra->as(), extra->size()); + if (allSingle) { + launchParams.transfer(*stream); + detail::decodeGlobal + <<stream()->stream>>>( + reinterpret_cast(device + decodeOffset)); + CUDA_CHECK(cudaGetLastError()); + programs.result.transfer(*stream); + return; + } + if (launchParams.device) { + localParams.external = reinterpret_cast(device); + launchParams.transfer(*stream); } decodeKernel<<stream()->stream>>>( @@ -117,5 +140,10 @@ void launchDecode( } REGISTER_KERNEL("decode", decodeKernel); +namespace { +static bool decSingles_reg = registerKernel( + "decodeSingle", + reinterpret_cast(detail::decodeGlobal)); +} } // namespace facebook::velox::wave diff --git a/velox/experimental/wave/dwio/decode/tests/GpuDecoderTest.cu b/velox/experimental/wave/dwio/decode/tests/GpuDecoderTest.cu index 5bd5eb3f28d0..940f3f638d90 100644 --- a/velox/experimental/wave/dwio/decode/tests/GpuDecoderTest.cu +++ b/velox/experimental/wave/dwio/decode/tests/GpuDecoderTest.cu @@ -23,12 +23,18 @@ DEFINE_int32(device_id, 0, ""); DEFINE_bool(benchmark, false, ""); +DEFINE_bool(print_kernels, false, "Print register and smem usage"); +DEFINE_bool(use_selective, false, "Use selective path for test"); namespace facebook::velox::wave { namespace { using namespace facebook::velox; +// define to use the flexible call path wiht multiple ops per TB +#define USE_PROGRAM_API +#define USE_SEL_BITPACK true + // Returns the number of bytes the "values" will occupy after varint encoding. 
uint64_t bulkVarintSize(const uint64_t* values, int count) { constexpr uint8_t kLookupSizeTable64[64] = { @@ -84,6 +90,10 @@ inline const T* addBytes(const T* ptr, int bytes) { return reinterpret_cast(reinterpret_cast(ptr) + bytes); } +void prefetchToDevice(void* ptr, size_t size) { + CUDA_CHECK_FATAL(cudaMemPrefetchAsync(ptr, size, FLAGS_device_id, nullptr)); +} + template void makeBitpackDict( int32_t bitWidth, @@ -92,19 +102,25 @@ void makeBitpackDict( T*& dict, uint64_t*& bits, T*& result, - int32_t** scatter) { - int64_t dictBytes = sizeof(T) << bitWidth; + int32_t** scatter, + bool bitsOnly, + BlockStatus*& blockStatus, + int32_t numBlocks, + int32_t blockSize) { + int64_t dictBytes = bitsOnly ? 0 : sizeof(T) << bitWidth; int64_t bitBytes = (roundUp(numValues * bitWidth, 128) / 8) + 16; int64_t resultBytes = numValues * sizeof(T); int scatterBytes = scatter ? roundUp(numValues * sizeof(int32_t), sizeof(T)) : 0; + int32_t statusBytes = sizeof(BlockStatus) * numBlocks; if (scatterBytes) { resultBytes += resultBytes / 2; } - cudaPtr = allocate(dictBytes + bitBytes + scatterBytes + resultBytes); + cudaPtr = allocate( + dictBytes + bitBytes + scatterBytes + resultBytes + statusBytes); T* memory = (T*)cudaPtr.get(); - dict = memory; + dict = bitsOnly ? nullptr : memory; static int sequence = 1; ++sequence; @@ -126,6 +142,14 @@ void makeBitpackDict( } result = addBytes( reinterpret_cast(memory), dictBytes + bitBytes + scatterBytes); + blockStatus = + reinterpret_cast(addBytes(result, numValues * sizeof(T))); + for (auto i = 0; i < numBlocks; ++i) { + blockStatus[i].numRows = + i < numBlocks - 1 ? blockSize : numValues - (i * blockSize); + } + prefetchToDevice( + memory, dictBytes + bitBytes + scatterBytes + resultBytes + statusBytes); } class GpuDecoderTest : public ::testing::Test { @@ -220,12 +244,15 @@ class GpuDecoderTest : public ::testing::Test { int32_t bitWidth, int64_t numValues, int numBlocks, - bool useScatter) { + bool useScatter, + bool bitsOnly = false, + bool useSelective = false) { gpu::CudaPtr ptr; T* dict; uint64_t* bits; T* result; int32_t* scatter = nullptr; + BlockStatus* blockStatus; makeBitpackDict( bitWidth, numValues, @@ -233,14 +260,33 @@ class GpuDecoderTest : public ::testing::Test { dict, bits, result, - useScatter ? &scatter : nullptr); + useScatter ? &scatter : nullptr, + bitsOnly, + blockStatus, + roundUp(numValues, kBlockSize) / kBlockSize, + kBlockSize); result[numValues] = 0xdeadbeef; int valuesPerOp = roundUp(numValues / numBlocks, kBlockSize); int numOps = roundUp(numValues, valuesPerOp) / valuesPerOp; + auto valuesPerThread = valuesPerOp / kBlockSize; auto ops = allocate(numOps); for (auto i = 0; i < numOps; ++i) { int32_t begin = i * valuesPerOp; - ops[i].step = DecodeStep::kDictionaryOnBitpack; + ops[i].step = useSelective ? (sizeof(T) == 8 ? DecodeStep::kSelective64 + : DecodeStep::kSelective32) + : DecodeStep::kDictionaryOnBitpack; + ops[i].encoding = DecodeStep::kDictionaryOnBitpack; + ops[i].dataType = WaveTypeTrait::typeKind; + ops[i].nullMode = NullMode::kDenseNonNull; + ops[i].nthBlock = i; + ops[i].numRowsPerThread = i == numOps - 1 + ? 
roundUp(numValues - (valuesPerOp * i), kBlockSize) / kBlockSize + : valuesPerThread; + ops[i].baseRow = i * valuesPerOp; + ops[i].maxRow = std::min((i + 1) * valuesPerOp, numValues); + ops[i].result = reinterpret_cast(result) + i * valuesPerOp; + + ops[i].blockStatus = blockStatus + (i * valuesPerThread); auto& op = ops[i].data.dictionaryOnBitpack; op.begin = begin; op.end = std::min(numValues, (i + 1) * valuesPerOp); @@ -254,21 +300,29 @@ class GpuDecoderTest : public ::testing::Test { } testCase( fmt::format( - "bitpack dictplan {} numValues={} useScatter={}", + "bitpack dictplan {} -> {} numValues={} useScatter={}", + bitWidth, sizeof(T) * 8, numValues, useScatter), - [&] { decodeGlobal(ops.get(), numOps); }, + [&] { +#ifdef USE_PROGRAM_API + callViaPrograms(ops.get(), numOps); +#else + decodeGlobal(ops.get(), numOps); +#endif + }, numValues * sizeof(T), 10); if (!scatter) { EXPECT_EQ(0xdeadbeef, result[numValues]); } - auto mask = (1u << bitWidth) - 1; + auto mask = (1uL << bitWidth) - 1; for (auto i = 0; i < numValues; ++i) { int32_t bit = i * bitWidth; uint64_t word = *addBytes(bits, bit / 8); - T expected = dict[(word >> (bit & 7)) & mask]; + uint64_t index = (word >> (bit & 7)) & mask; + T expected = bitsOnly ? index : dict[index]; ASSERT_EQ(result[scatter ? scatter[i] : i], expected) << i; } } @@ -512,6 +566,20 @@ class GpuDecoderTest : public ::testing::Test { } } + void callViaPrograms(GpuDecode* ops, int32_t numOps) { + auto stream = std::make_unique(); + LaunchParams params(*arena_); + DecodePrograms programs; + for (int i = 0; i < numOps; ++i) { + programs.programs.emplace_back(); + programs.programs.back().push_back(std::make_unique()); + auto opPtr = programs.programs.back().front().get(); + *opPtr = ops[i]; + } + launchDecode(programs, params, stream.get()); + stream->wait(); + } + void testMakeScatterIndicesStream(int numValues, int numBlocks) { auto bits = allocate((numValues * numBlocks + 7) / 8); fillRandomBits(bits.get(), 0.5, numValues * numBlocks); @@ -532,8 +600,8 @@ class GpuDecoderTest : public ::testing::Test { op.indicesCount = indicesCounts.get() + i; } auto stream = std::make_unique(); - WaveBufferPtr extra; - launchDecode(programs, arena_.get(), extra, stream.get()); + LaunchParams params(*arena_); + launchDecode(programs, params, stream.get()); stream->wait(); for (int i = 0; i < numBlocks; ++i) { auto& op = programs.programs[i].front()->data.makeScatterIndices; @@ -566,8 +634,8 @@ class GpuDecoderTest : public ::testing::Test { op.resultStride = stride; opPtr->result = result.get(); auto stream = std::make_unique(); - WaveBufferPtr extra; - launchDecode(programs, arena_.get(), extra, stream.get()); + LaunchParams params(*arena_); + launchDecode(programs, params, stream.get()); stream->wait(); auto numResults = ((numWords * 64) - 1) / stride; auto* rawResult = result.get(); @@ -601,6 +669,16 @@ TEST_F(GpuDecoderTest, dictionaryOnBitpack) { dictTestPlan(11, 40'000'003, 1024, true); } +TEST_F(GpuDecoderTest, bitpack) { + bool useSelective = FLAGS_use_selective; + dictTestPlan(27, 4000001, 1024, false, true, useSelective); + dictTestPlan(28, 4'000'037, 1024, false, true, useSelective); + dictTestPlan(26, 40'000'003, 1024, false, true, useSelective); + dictTestPlan(30, 40'000'003, 1024, false, true, useSelective); + dictTestPlan(47, 40'000'003, 1024, false, true, useSelective); + dictTestPlan(22, 40'000'003, 1024, true, true, false); +} + TEST_F(GpuDecoderTest, sparseBool) { testSparseBool<256>(40013, 1024); } @@ -667,14 +745,19 @@ int main(int argc, char** 
argv) { CUDA_CHECK_FATAL(cudaGetDeviceProperties(&prop, FLAGS_device_id)); printf("Running on device: %s\n", prop.name); CUDA_CHECK_FATAL(cudaSetDevice(FLAGS_device_id)); - cudaFuncAttributes attrs; - CUDA_CHECK_FATAL(cudaFuncGetAttributes(&attrs, detail::decodeGlobal<128>)); - printFuncAttrs("decode blocksize 128", attrs); - CUDA_CHECK_FATAL(cudaFuncGetAttributes(&attrs, detail::decodeGlobal<256>)); - printFuncAttrs("decode blocksize 256", attrs); - CUDA_CHECK_FATAL(cudaFuncGetAttributes(&attrs, detail::decodeGlobal<512>)); - printFuncAttrs("decode blocksize 512", attrs); - CUDA_CHECK_FATAL(cudaFuncGetAttributes(&attrs, detail::decodeGlobal<1024>)); - printFuncAttrs("decode blocksize 1024", attrs); + if (FLAGS_print_kernels) { + cudaFuncAttributes attrs; + CUDA_CHECK_FATAL(cudaFuncGetAttributes(&attrs, detail::decodeGlobal<128>)); + printFuncAttrs("decode blocksize 128", attrs); + CUDA_CHECK_FATAL(cudaFuncGetAttributes(&attrs, detail::decodeGlobal<256>)); + printFuncAttrs("decode blocksize 256", attrs); + CUDA_CHECK_FATAL(cudaFuncGetAttributes(&attrs, detail::decodeGlobal<512>)); + printFuncAttrs("decode blocksize 512", attrs); + CUDA_CHECK_FATAL(cudaFuncGetAttributes(&attrs, detail::decodeGlobal<1024>)); + printFuncAttrs("decode blocksize 1024", attrs); + printFuncAttrs("decode2", attrs); + + printKernels(); + } return RUN_ALL_TESTS(); } diff --git a/velox/experimental/wave/exec/Aggregation.cpp b/velox/experimental/wave/exec/Aggregation.cpp index c04ea6d90584..de2839a1d05b 100644 --- a/velox/experimental/wave/exec/Aggregation.cpp +++ b/velox/experimental/wave/exec/Aggregation.cpp @@ -354,8 +354,4 @@ void Aggregation::schedule(WaveStream& waveStream, int32_t maxRows) { << stats_.ingestedRowCount / stats_.gpuTimeMs * 1000 << " rows/s"; } -vector_size_t Aggregation::outputSize(WaveStream&) const { - return container_->actualNumGroups; -} - } // namespace facebook::velox::wave diff --git a/velox/experimental/wave/exec/Aggregation.h b/velox/experimental/wave/exec/Aggregation.h index afb01709d910..e16b3c0b7ce3 100644 --- a/velox/experimental/wave/exec/Aggregation.h +++ b/velox/experimental/wave/exec/Aggregation.h @@ -51,8 +51,6 @@ class Aggregation : public WaveOperator { return finished_; } - vector_size_t outputSize(WaveStream&) const override; - std::string toString() const override { return "Aggregation"; } diff --git a/velox/experimental/wave/exec/Instruction.h b/velox/experimental/wave/exec/Instruction.h index a93e62c7ed76..0419d24ef662 100644 --- a/velox/experimental/wave/exec/Instruction.h +++ b/velox/experimental/wave/exec/Instruction.h @@ -96,13 +96,13 @@ struct AdvanceResult { return numRows == 0 && !isRetry; } - /// Max umber of result rows. + /// Max number of result rows. int32_t numRows{0}; /// The sequence number of kernel launch that needs continue. int32_t nthLaunch{0}; - /// The ordinal of the program i the launch. + /// The ordinal of the program in the launch. int32_t programIdx{0}; /// The instruction where to pick up. If not 0, must have 'isRetry' true. 
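The --print_kernels report in the decoder test prints per-kernel register and static shared-memory usage via cudaFuncGetAttributes()/printFuncAttrs(). The same query works for any kernel; a minimal sketch, where printKernelBudget is an invented helper rather than Velox code:

#include <cuda_runtime.h>

#include <cstdio>

// Prints the static resource footprint of 'kernel'. Registers per thread and
// static shared memory per block are what bound occupancy for these decoders.
template <typename Kernel>
void printKernelBudget(const char* name, Kernel kernel) {
  cudaFuncAttributes attrs;
  if (cudaFuncGetAttributes(&attrs, kernel) != cudaSuccess) {
    printf("%s: attribute query failed\n", name);
    return;
  }
  printf(
      "%s: %d regs/thread, %zu B static smem, %zu B local, max %d threads/block\n",
      name,
      attrs.numRegs,
      attrs.sharedSizeBytes,
      attrs.localSizeBytes,
      attrs.maxThreadsPerBlock);
}

// e.g. printKernelBudget("decode blocksize 256", detail::decodeGlobal<256>);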
diff --git a/velox/experimental/wave/exec/Project.cpp b/velox/experimental/wave/exec/Project.cpp index fb109fe5bf20..cbe4446fe00f 100644 --- a/velox/experimental/wave/exec/Project.cpp +++ b/velox/experimental/wave/exec/Project.cpp @@ -15,6 +15,7 @@ */ #include "velox/experimental/wave/exec/Project.h" +#include "velox/common/process/TraceContext.h" #include "velox/experimental/wave/exec/ToWave.h" #include "velox/experimental/wave/exec/Wave.h" #include "velox/experimental/wave/exec/WaveDriver.h" @@ -117,12 +118,19 @@ void Project::schedule(WaveStream& stream, int32_t maxRows) { blocksPerExe, inputControl, out); + out->prefetch( + getDevice(), + control->deviceData->as(), + control->deviceData->size()); stream.setState(WaveStream::State::kParallel); - reinterpret_cast(out)->call( - out, - exes.size() * blocksPerExe, - control->sharedMemorySize, - control->params); + { + PrintTime c("expr"); + reinterpret_cast(out)->call( + out, + exes.size() * blocksPerExe, + control->sharedMemorySize, + control->params); + } // A sink at the end has no output params but need to wait for host // return event before reusing the stream. if (exes.size() == 1 && exes[0]->programShared->isSink()) { @@ -146,10 +154,4 @@ void Project::finalize(CompileState& state) { } } -vector_size_t Project::outputSize(WaveStream& stream) const { - auto& control = stream.launchControls(id_); - VELOX_CHECK(!control.empty()); - return control[0]->inputRows; -} - } // namespace facebook::velox::wave diff --git a/velox/experimental/wave/exec/Project.h b/velox/experimental/wave/exec/Project.h index 798c20db4c52..60fb58af428c 100644 --- a/velox/experimental/wave/exec/Project.h +++ b/velox/experimental/wave/exec/Project.h @@ -60,8 +60,6 @@ class Project : public WaveOperator { void schedule(WaveStream& stream, int32_t maxRows = 0) override; - vector_size_t outputSize(WaveStream& stream) const override; - void finalize(CompileState& state) override; std::string toString() const override { diff --git a/velox/experimental/wave/exec/TableScan.cpp b/velox/experimental/wave/exec/TableScan.cpp index eecccacd78b1..f2c518949474 100644 --- a/velox/experimental/wave/exec/TableScan.cpp +++ b/velox/experimental/wave/exec/TableScan.cpp @@ -18,6 +18,7 @@ #include "velox/common/time/Timer.h" #include "velox/exec/Task.h" #include "velox/experimental/wave/exec/WaveDriver.h" +#include "velox/experimental/wave/exec/WaveSplitReader.h" #include "velox/expression/Expr.h" namespace facebook::velox::wave { @@ -53,10 +54,36 @@ void TableScan::schedule(WaveStream& stream, int32_t maxRows) { waveDataSource_->schedule(stream, maxRows); nextAvailableRows_ = waveDataSource_->canAdvance(stream); if (nextAvailableRows_ == 0) { + updateStats( + waveDataSource_->splitReader()->runtimeStats(), + waveDataSource_->splitReader().get()); needNewSplit_ = true; } } +void TableScan::updateStats( + std::unordered_map connectorStats, + WaveSplitReader* splitReader) { + auto lockedStats = stats().wlock(); + if (splitReader) { + lockedStats->rawInputPositions = splitReader->getCompletedRows(); + lockedStats->rawInputBytes = splitReader->getCompletedBytes(); + } + for (const auto& [name, counter] : connectorStats) { + if (name == "ioWaitNanos") { + ioWaitNanos_ += counter.value - lastIoWaitNanos_; + lastIoWaitNanos_ = counter.value; + } + if (UNLIKELY(lockedStats->runtimeStats.count(name) == 0)) { + lockedStats->runtimeStats.insert( + std::make_pair(name, RuntimeMetric(counter.unit))); + } else { + VELOX_CHECK_EQ(lockedStats->runtimeStats.at(name).unit, counter.unit); + } + 
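+ // Either way a metric named 'name' now exists with a matching unit, so the
+ // counter value can be accumulated below.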
lockedStats->runtimeStats.at(name).addValue(counter.value); + } +} + BlockingReason TableScan::nextSplit(ContinueFuture* future) { exec::Split split; blockingReason_ = driverCtx_->task->getSplitOrFuture( @@ -73,21 +100,7 @@ BlockingReason TableScan::nextSplit(ContinueFuture* future) { if (!split.hasConnectorSplit()) { noMoreSplits_ = true; if (dataSource_) { - auto connectorStats = dataSource_->runtimeStats(); - auto lockedStats = stats().wlock(); - for (const auto& [name, counter] : connectorStats) { - if (name == "ioWaitNanos") { - ioWaitNanos_ += counter.value - lastIoWaitNanos_; - lastIoWaitNanos_ = counter.value; - } - if (UNLIKELY(lockedStats->runtimeStats.count(name) == 0)) { - lockedStats->runtimeStats.insert( - std::make_pair(name, RuntimeMetric(counter.unit))); - } else { - VELOX_CHECK_EQ(lockedStats->runtimeStats.at(name).unit, counter.unit); - } - lockedStats->runtimeStats.at(name).addValue(counter.value); - } + updateStats(dataSource_->runtimeStats()); } return BlockingReason::kNotBlocked; } @@ -129,6 +142,8 @@ BlockingReason TableScan::nextSplit(ContinueFuture* future) { waveDataSource_->addSplit(connectorSplit); } } + ++stats().wlock()->numSplits; + for (const auto& entry : pendingDynamicFilters_) { waveDataSource_->addDynamicFilter(entry.first, entry.second); } diff --git a/velox/experimental/wave/exec/TableScan.h b/velox/experimental/wave/exec/TableScan.h index a569b03eb7d9..e4b3ff23a073 100644 --- a/velox/experimental/wave/exec/TableScan.h +++ b/velox/experimental/wave/exec/TableScan.h @@ -54,10 +54,6 @@ class TableScan : public WaveSourceOperator { void schedule(WaveStream& stream, int32_t maxRows = 0) override; - vector_size_t outputSize(WaveStream& stream) const { - return waveDataSource_->outputSize(stream); - } - bool isStreaming() const override { return true; } @@ -98,6 +94,15 @@ class TableScan : public WaveSourceOperator { // needed before prepare is done, it will be made when needed. void preload(std::shared_ptr split); + // Adds 'stats' to operator stats of the containing WaveDriver. Some + // stats come from DataSource, others from SplitReader. If + // 'splitReader' is given, the completed rows/bytes from + // 'splitReader' are added. These do not come in the runtimeStats() + // map. + void updateStats( + std::unordered_map stats, + WaveSplitReader* splitReader = nullptr); + // Process-wide IO wait time. 
static std::atomic ioWaitNanos_; diff --git a/velox/experimental/wave/exec/ToWave.cpp b/velox/experimental/wave/exec/ToWave.cpp index c66ce86cdc9e..aed2effcd41a 100644 --- a/velox/experimental/wave/exec/ToWave.cpp +++ b/velox/experimental/wave/exec/ToWave.cpp @@ -464,6 +464,7 @@ void CompileState::makeAggregateAccumulate(const core::AggregationNode* node) { folly::F14FastSet programs; std::vector allArgs; std::vector aggregates; + int numPrograms = allPrograms_.size(); for (auto& key : node->groupingKeys()) { auto arg = findCurrentValue(key); allArgs.push_back(arg); @@ -520,7 +521,6 @@ void CompileState::makeAggregateAccumulate(const core::AggregationNode* node) { sourceList.push_back(s); } } - int numPrograms = allPrograms_.size(); auto aggInstruction = instruction.get(); addInstruction(std::move(instruction), nullptr, sourceList); if (allPrograms_.size() > numPrograms) { diff --git a/velox/experimental/wave/exec/Values.h b/velox/experimental/wave/exec/Values.h index 820739a5fa5f..5702b022d0a5 100644 --- a/velox/experimental/wave/exec/Values.h +++ b/velox/experimental/wave/exec/Values.h @@ -36,12 +36,6 @@ class Values : public WaveSourceOperator { return roundsLeft_ == (current_ == values_.size()); } - vector_size_t outputSize(WaveStream& stream) const override { - // Must not be called before schedule(). - VELOX_CHECK_LT(0, current_); - return values_[current_ - 1]->size(); - } - std::string toString() const override; private: diff --git a/velox/experimental/wave/exec/Wave.cpp b/velox/experimental/wave/exec/Wave.cpp index a7f02f43ecc1..05715bc62d4b 100644 --- a/velox/experimental/wave/exec/Wave.cpp +++ b/velox/experimental/wave/exec/Wave.cpp @@ -15,10 +15,27 @@ */ #include "velox/experimental/wave/exec/Wave.h" +#include #include "velox/experimental/wave/exec/Vectors.h" +DEFINE_bool(wave_timing, true, "Enable Wave perf timers"); +DEFINE_bool( + wave_print_time, + false, + "Enables printing times inside PrinTime guard."); + namespace facebook::velox::wave { +PrintTime::PrintTime(const char* title) + : title_(title), + start_(FLAGS_wave_print_time ? 
getCurrentTimeMicro() : 0) {} + +PrintTime::~PrintTime() { + if (FLAGS_wave_print_time) { + std::cout << title_ << "=" << getCurrentTimeMicro() - start_ << std::endl; + } +} + std::string WaveTime::toString() const { if (micros < 20) { return fmt::format("{} ({} clocks)", succinctNanos(micros * 1000), clocks); @@ -38,6 +55,7 @@ void WaveStats::add(const WaveStats& other) { hostOnlyTime += other.hostOnlyTime; hostParallelTime += other.hostParallelTime; waitTime += other.waitTime; + stagingTime += other.stagingTime; } void WaveStats::clear() { @@ -178,6 +196,28 @@ std::mutex WaveStream::reserveMutex_; std::vector> WaveStream::streamsForReuse_; std::vector> WaveStream::eventsForReuse_; bool WaveStream::exitInited_{false}; +std::unique_ptr WaveStream::copyExecutor_; +std::unique_ptr WaveStream::syncExecutor_; + +folly::CPUThreadPoolExecutor* WaveStream::copyExecutor() { + return getExecutor(copyExecutor_); +} + +folly::CPUThreadPoolExecutor* WaveStream::syncExecutor() { + return getExecutor(syncExecutor_); +} + +folly::CPUThreadPoolExecutor* WaveStream::getExecutor( + std::unique_ptr& ptr) { + if (ptr) { + return ptr.get(); + } + std::lock_guard l(reserveMutex_); + if (!ptr) { + ptr = std::make_unique(32); + } + return ptr.get(); +} Stream* WaveStream::newStream() { auto stream = streamFromReserve(); @@ -707,6 +747,9 @@ LaunchControl* WaveStream::prepareProgramLaunch( int32_t operatorStateOffset = size; size += exes.size() * sizeof(void*) + operatorStateBytes; auto buffer = arena_.allocate(size); + if (stream) { + stream->prefetch(nullptr, buffer->as(), buffer->size()); + } // Zero initialization is expected, for example for operands and arrays in // Operand::indices. memset(buffer->as(), 0, size); @@ -893,6 +936,9 @@ AdvanceResult Program::canAdvance( if (stateId.has_value()) { state = stream.operatorState(stateId.value()); } + if (state == nullptr) { + return {}; + } return source->canAdvance(stream, control, state, programIdx); } diff --git a/velox/experimental/wave/exec/Wave.h b/velox/experimental/wave/exec/Wave.h index 19d2b197f170..2737f02f1f97 100644 --- a/velox/experimental/wave/exec/Wave.h +++ b/velox/experimental/wave/exec/Wave.h @@ -26,20 +26,42 @@ #include "velox/experimental/wave/exec/ExprKernel.h" #include "velox/experimental/wave/vector/WaveVector.h" +#include + +DECLARE_bool(wave_timing); + namespace facebook::velox::wave { +/// Scoped guard, prints the time spent inside if +class PrintTime { + public: + PrintTime(const char* title); + ~PrintTime(); + + private: + const char* title_; + uint64_t start_; +}; + /// A host side time point for measuring wait and launch prepare latency. Counts /// both wall microseconds and clocks. struct WaveTime { size_t micros{0}; uint64_t clocks{0}; + static uint64_t getMicro() { + return FLAGS_wave_timing ? getCurrentTimeMicro() : 0; + } + static WaveTime now() { + if (!FLAGS_wave_timing) { + return {0, 0}; + } return {getCurrentTimeMicro(), folly::hardware_timestamp()}; } WaveTime operator-(const WaveTime right) const { - return {right.micros - micros, right.clocks - clocks}; + return {micros - right.micros, clocks - right.clocks}; } WaveTime operator+(const WaveTime right) const { @@ -99,6 +121,12 @@ struct WaveStats { /// Time a host thread waits for device. WaveTime waitTime; + /// Time a host thread is synchronously staging data to device. This is either + /// the wall time of multithreaded memcpy to pinned host or the wall time of + /// multithreaded GPU Direct NVME read. This does not include the time of + /// hostToDeviceAsync. 
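The accounting for this counter is outside this hunk; presumably the synchronous part of a staging transfer is bracketed with WaveTime and the difference added here, along the lines of this hypothetical helper (chargeStagingTime is not a Velox function):

#include "velox/experimental/wave/exec/Wave.h"

namespace facebook::velox::wave {

// Hypothetical helper: runs 'syncCopy' (e.g. the multithreaded memcpy into
// pinned host memory) and charges its wall time to stats.stagingTime. The
// asynchronous hostToDeviceAsync enqueue stays outside the bracket.
template <typename SyncCopy>
void chargeStagingTime(WaveStats& stats, SyncCopy&& syncCopy) {
  auto start = WaveTime::now();
  syncCopy();
  stats.stagingTime += WaveTime::now() - start;
}

} // namespace facebook::velox::wave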
+ WaveTime stagingTime; + void clear(); void add(const WaveStats& other); }; @@ -557,11 +585,11 @@ class WaveStream { WaveStream( GpuArena& arena, - GpuArena& hostArena, + GpuArena& deviceArena, const std::vector>* operands, OperatorStateMap* stateMap) : arena_(arena), - hostArena_(hostArena), + deviceArena_(deviceArena), operands_(operands), taskStateMap_(stateMap) { operandNullable_.resize(operands_->size(), true); @@ -580,6 +608,10 @@ class WaveStream { return arena_; } + GpuArena& deviceArena() { + return deviceArena_; + } + /// Sets nullability of a source column. This is runtime, since may depend on /// the actual presence of nulls in the source, e.g. file. Nullability /// defaults to nullable. @@ -805,6 +837,9 @@ class WaveStream { hasError_ = true; } + static folly::CPUThreadPoolExecutor* copyExecutor(); + static folly::CPUThreadPoolExecutor* syncExecutor(); + std::string toString() const; private: @@ -821,11 +856,20 @@ class WaveStream { static std::vector> eventsForReuse_; static std::vector> streamsForReuse_; static bool exitInited_; + static std::unique_ptr copyExecutor_; + static std::unique_ptr syncExecutor_; static void clearReusable(); + static folly::CPUThreadPoolExecutor* getExecutor( + std::unique_ptr& ptr); + + // Unified memory. GpuArena& arena_; - GpuArena& hostArena_; + + // Device memory. + GpuArena& deviceArena_; + const std::vector>* const operands_; // True at '[i]' if in this stream 'operands_[i]' should have null flags. std::vector operandNullable_; diff --git a/velox/experimental/wave/exec/WaveDataSource.h b/velox/experimental/wave/exec/WaveDataSource.h index bad83e2fabcc..caf7c28f2504 100644 --- a/velox/experimental/wave/exec/WaveDataSource.h +++ b/velox/experimental/wave/exec/WaveDataSource.h @@ -48,8 +48,6 @@ class WaveDataSource : public std::enable_shared_from_this { virtual void schedule(WaveStream& stream, int32_t maxRows = 0) = 0; - virtual vector_size_t outputSize(WaveStream& stream) const = 0; - virtual bool isFinished() = 0; virtual std::shared_ptr splitReader() = 0; diff --git a/velox/experimental/wave/exec/WaveDriver.cpp b/velox/experimental/wave/exec/WaveDriver.cpp index 6a9a07c0006f..71d03fc290ba 100644 --- a/velox/experimental/wave/exec/WaveDriver.cpp +++ b/velox/experimental/wave/exec/WaveDriver.cpp @@ -15,6 +15,8 @@ */ #include "velox/experimental/wave/exec/WaveDriver.h" +#include +#include "velox/common/process/TraceContext.h" #include "velox/common/testutil/TestValue.h" #include "velox/exec/Task.h" #include "velox/experimental/wave/exec/Instruction.h" @@ -51,8 +53,8 @@ WaveDriver::WaveDriver( states_(std::move(states)) { VELOX_CHECK(!waveOperators.empty()); auto returnBatchSize = 10000 * outputType_->size() * 10; - hostArena_ = std::make_unique( - returnBatchSize * 10, getHostAllocator(getDevice())); + deviceArena_ = std::make_unique( + 100000000, getDeviceAllocator(getDevice()), 400000000); pipelines_.emplace_back(); for (auto& op : waveOperators) { op->setDriver(this); @@ -92,6 +94,7 @@ RowVectorPtr WaveDriver::getOutput() { auto status = advance(i); switch (status) { case Advance::kBlocked: + updateStats(); return nullptr; case Advance::kResult: if (i == last) { @@ -116,6 +119,7 @@ RowVectorPtr WaveDriver::getOutput() { } else { // Last finished. 
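+ // The last pipeline has produced its final output: fold the accumulated Wave
+ // stats into the operator's runtime stats before returning.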
finished_ = true; + updateStats(); return nullptr; } } @@ -123,10 +127,12 @@ RowVectorPtr WaveDriver::getOutput() { } } } catch (const std::exception& e) { + updateStats(); setError(); throw; } finished_ = true; + updateStats(); return nullptr; } @@ -206,12 +212,13 @@ void WaveDriver::waitForArrival(Pipeline& pipeline) { incStats((pipeline.running[i]->stats())); pipeline.running[i]->setState(WaveStream::State::kNotRunning); moveTo(pipeline.running, i, pipeline.arrived); - totalWaitLoops += waitLoops; } ++waitLoops; } } + totalWaitLoops += waitLoops; } + namespace { bool shouldStop(exec::StopReason taskStopReason) { return taskStopReason != exec::StopReason::kNone && @@ -222,6 +229,13 @@ bool shouldStop(exec::StopReason taskStopReason) { Advance WaveDriver::advance(int pipelineIdx) { auto& pipeline = pipelines_[pipelineIdx]; int64_t waitLoops = 0; + // Set to true when any stream is seen not ready, false when any stream is + // seen ready. + bool isWaiting = false; + // Time when a stream was first seen not ready. + int64_t waitingSince = 0; + // Total wait time. Incremented when isWaiting is set to false from true. + int64_t waitUs = 0; for (;;) { const exec::StopReason taskStopReason = operatorCtx()->driverCtx()->task->shouldStop(); @@ -232,6 +246,9 @@ Advance WaveDriver::advance(int pipelineIdx) { // A point for test code injection. common::testutil::TestValue::adjust( "facebook::velox::wave::WaveDriver::getOutput::yield", this); + totalWaitLoops += waitLoops; + waveStats_.waitTime.micros += waitUs; + return Advance::kBlocked; } @@ -254,15 +271,24 @@ Advance WaveDriver::advance(int pipelineIdx) { auto& op = *pipeline.operators.back(); auto& lastSet = op.syncSet(); for (auto i = 0; i < pipeline.running.size(); ++i) { - if (pipeline.running[i]->isArrived(lastSet)) { + bool isArrived; + int64_t start = WaveTime::getMicro(); + isArrived = pipeline.running[i]->isArrived(lastSet); + waveStats_.waitTime.micros += WaveTime::getMicro() - start; + if (isArrived) { auto arrived = pipeline.running[i].get(); arrived->setState(WaveStream::State::kNotRunning); incStats(arrived->stats()); + if (isWaiting) { + waitUs += WaveTime::getMicro() - waitingSince; + isWaiting = false; + } moveTo(pipeline.running, i, pipeline.arrived); if (pipeline.makesHostResult) { result_ = makeResult(*arrived, lastSet); if (result_ && result_->size() != 0) { totalWaitLoops += waitLoops; + waveStats_.waitTime.micros += waitUs; return Advance::kResult; } --i; @@ -270,16 +296,21 @@ Advance WaveDriver::advance(int pipelineIdx) { pipeline.sinkFull = true; waitForArrival(pipeline); totalWaitLoops += waitLoops; + waveStats_.waitTime.micros += waitUs; return Advance::kResult; } + } else if (!isWaiting) { + waitingSince = WaveTime::getMicro(); + isWaiting = true; + } else { + ++waitLoops; } - ++waitLoops; } if (pipeline.finished.empty() && pipeline.running.size() + pipeline.arrived.size() < FLAGS_max_streams_per_driver) { auto stream = std::make_unique( - *arena_, *hostArena_, &operands(), &stateMap_); + *arena_, *deviceArena_, &operands(), &stateMap_); ++stream->stats().numWaves; stream->setState(WaveStream::State::kHost); pipeline.arrived.push_back(std::move(stream)); @@ -368,18 +399,22 @@ void WaveDriver::updateStats() { "wave.bytesToHost", RuntimeCounter(waveStats_.bytesToHost, RuntimeCounter::Unit::kBytes)); lockedStats->addRuntimeStat( - "wave.hostOnlyTime", + "wave.hostOnlyNanos", RuntimeCounter( waveStats_.hostOnlyTime.micros * 1000, RuntimeCounter::Unit::kNanos)); lockedStats->addRuntimeStat( - "wave.hostParallelTime", + 
"wave.hostParallelNanos", RuntimeCounter( waveStats_.hostParallelTime.micros * 1000, RuntimeCounter::Unit::kNanos)); lockedStats->addRuntimeStat( - "wave.waitTime", + "wave.waitNanos", RuntimeCounter( waveStats_.waitTime.micros * 1000, RuntimeCounter::Unit::kNanos)); + lockedStats->addRuntimeStat( + "wave.stagingNanos", + RuntimeCounter( + waveStats_.stagingTime.micros * 1000, RuntimeCounter::Unit::kNanos)); } } // namespace facebook::velox::wave diff --git a/velox/experimental/wave/exec/WaveDriver.h b/velox/experimental/wave/exec/WaveDriver.h index dfcfad7f277f..2b593520dec8 100644 --- a/velox/experimental/wave/exec/WaveDriver.h +++ b/velox/experimental/wave/exec/WaveDriver.h @@ -59,10 +59,6 @@ class WaveDriver : public exec::SourceOperator { return *arena_; } - GpuArena& hostArena() const { - return *hostArena_; - } - const std::vector>& operands() { return operands_; } @@ -160,7 +156,6 @@ class WaveDriver : public exec::SourceOperator { std::unique_ptr arena_; std::unique_ptr deviceArena_; - std::unique_ptr hostArena_; ContinueFuture blockingFuture_{ContinueFuture::makeEmpty()}; exec::BlockingReason blockingReason_; diff --git a/velox/experimental/wave/exec/WaveHiveDataSource.cpp b/velox/experimental/wave/exec/WaveHiveDataSource.cpp index 016aa5c011ce..7540518718a1 100644 --- a/velox/experimental/wave/exec/WaveHiveDataSource.cpp +++ b/velox/experimental/wave/exec/WaveHiveDataSource.cpp @@ -112,10 +112,6 @@ void WaveHiveDataSource::schedule(WaveStream& stream, int32_t maxRows) { stream.setSplitReader(splitReader_); } -vector_size_t WaveHiveDataSource::outputSize(WaveStream& stream) const { - return splitReader_->outputSize(stream); -} - bool WaveHiveDataSource::isFinished() { if (!splitReader_) { return false; diff --git a/velox/experimental/wave/exec/WaveHiveDataSource.h b/velox/experimental/wave/exec/WaveHiveDataSource.h index 8b8e7d8ff680..7e1336ab4f57 100644 --- a/velox/experimental/wave/exec/WaveHiveDataSource.h +++ b/velox/experimental/wave/exec/WaveHiveDataSource.h @@ -50,8 +50,6 @@ class WaveHiveDataSource : public WaveDataSource { void schedule(WaveStream& stream, int32_t maxRows) override; - vector_size_t outputSize(WaveStream& stream) const override; - bool isFinished() override; std::shared_ptr splitReader() override { diff --git a/velox/experimental/wave/exec/WaveOperator.h b/velox/experimental/wave/exec/WaveOperator.h index 89360dd4b520..abf380fd8ebf 100644 --- a/velox/experimental/wave/exec/WaveOperator.h +++ b/velox/experimental/wave/exec/WaveOperator.h @@ -127,13 +127,6 @@ class WaveOperator { driver_ = driver; } - // Returns the number of non-filtered out result rows in the invocation inside - // 'stream'. 'this' must have had schedule() called with the same stream and - // the stream must have arrived. The actual result rows may be non-contiguous - // in the result vectors and may need indirection to access, as seen in output - // operands of the corresponding executables. - virtual vector_size_t outputSize(WaveStream& stream) const = 0; - const OperandSet& outputIds() const { return outputIds_; } @@ -171,7 +164,7 @@ class WaveOperator { protected: folly::Synchronized& stats(); - // Sequence number in WaveOperator sequence inside WaveDriver. IUsed to label + // Sequence number in WaveOperator sequence inside WaveDriver. Used to label // states of different oprators in WaveStream. 
int32_t id_; diff --git a/velox/experimental/wave/exec/WaveSplitReader.h b/velox/experimental/wave/exec/WaveSplitReader.h index 9ad19bca821c..3bd4e6a7cc54 100644 --- a/velox/experimental/wave/exec/WaveSplitReader.h +++ b/velox/experimental/wave/exec/WaveSplitReader.h @@ -60,8 +60,6 @@ class WaveSplitReader { virtual void schedule(WaveStream& stream, int32_t maxRows) = 0; - virtual vector_size_t outputSize(WaveStream& stream) const = 0; - virtual bool isFinished() const = 0; virtual uint64_t getCompletedBytes() = 0; diff --git a/velox/experimental/wave/exec/tests/CMakeLists.txt b/velox/experimental/wave/exec/tests/CMakeLists.txt index d6b0c71be853..e10cce40ba42 100644 --- a/velox/experimental/wave/exec/tests/CMakeLists.txt +++ b/velox/experimental/wave/exec/tests/CMakeLists.txt @@ -17,10 +17,53 @@ add_subdirectory(utils) add_executable(velox_wave_exec_test FilterProjectTest.cpp TableScanTest.cpp AggregationTest.cpp Main.cpp) +target_link_libraries( + velox_wave_exec_test + velox_wave_exec + velox_wave_mock_reader + velox_aggregates + velox_dwio_common + velox_dwio_common_exception + velox_dwio_common_test_utils + velox_dwio_parquet_reader + velox_dwio_parquet_writer + velox_exec + velox_exec_test_lib + velox_functions_json + velox_functions_lib + velox_functions_prestosql + velox_functions_test_lib + velox_hive_connector + velox_memory + velox_serialization + velox_test_util + velox_type + velox_vector + velox_vector_fuzzer + Boost::atomic + Boost::context + Boost::date_time + Boost::filesystem + Boost::program_options + Boost::regex + Boost::thread + Boost::system + gtest + gtest_main + gmock + Folly::folly + gflags::gflags + glog::glog + fmt::fmt + ${FILESYSTEM}) + add_test(velox_wave_exec_test velox_wave_exec_test) +add_executable(velox_wave_benchmark WaveBenchmark.cpp) + target_link_libraries( - velox_wave_exec_test + velox_wave_benchmark + velox_query_benchmark velox_wave_exec velox_wave_mock_reader velox_aggregates @@ -50,9 +93,10 @@ target_link_libraries( Boost::regex Boost::thread Boost::system - GTest::gtest - GTest::gtest_main - GTest::gmock + gtest + gtest_main + gmock + ${FOLLY_BENCHMARK} Folly::folly gflags::gflags glog::glog diff --git a/velox/experimental/wave/exec/tests/WaveBenchmark.cpp b/velox/experimental/wave/exec/tests/WaveBenchmark.cpp new file mode 100644 index 000000000000..1f4f433c62d4 --- /dev/null +++ b/velox/experimental/wave/exec/tests/WaveBenchmark.cpp @@ -0,0 +1,372 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "velox/benchmarks/QueryBenchmarkBase.h" +#include "velox/common/process/TraceContext.h" +#include "velox/dwio/dwrf/writer/Writer.h" +#include "velox/dwio/dwrf/writer/WriterContext.h" +#include "velox/experimental/wave/exec/ToWave.h" +#include "velox/experimental/wave/exec/WaveHiveDataSource.h" +#include "velox/experimental/wave/exec/tests/utils/FileFormat.h" +#include "velox/experimental/wave/exec/tests/utils/WaveTestSplitReader.h" +#include "velox/vector/fuzzer/VectorFuzzer.h" + +using namespace facebook::velox; +using namespace facebook::velox::exec; +using namespace facebook::velox::exec::test; +using namespace facebook::velox::dwio::common; + +DEFINE_string( + data_path, + "", + "Root path of test data. Data layout must follow Hive-style partitioning. " + "If the content are directories, they contain the data files for " + "each table. If they are files, they contain a file system path for each " + "data file, one per line. This allows running against cloud storage or " + "HDFS"); + +DEFINE_bool( + generate, + true, + "Generate input data. If false, data_path must " + "contain a directory with a subdirectory per table."); + +DEFINE_bool(preload, false, "Preload Wave data into RAM before starting query"); + +DEFINE_bool(wave, true, "Run benchmark with Wave"); + +DEFINE_int32(num_columns, 10, "Number of columns in test table"); + +DEFINE_int64(filter_pass_pct, 100, "Passing % for one filter"); + +DEFINE_int32(null_pct, 0, "Pct of null values in columns"); + +DEFINE_int32(num_column_filters, 0, "Number of columns wit a filter"); + +DEFINE_int32(num_expr_filters, 0, "Number of columns with a filter expr"); + +DEFINE_int32( + num_arithmetic, + 0, + "Number of arithmetic ops per column after filters"); + +DEFINE_int32(rows_per_stripe, 200000, "Rows in a stripe"); + +DEFINE_int64(num_rows, 1000000000, "Rows in test table"); + +DEFINE_int32( + run_query_verbose, + -1, + "Run a given query and print execution statistics"); + +class WaveBenchmark : public QueryBenchmarkBase { + public: + ~WaveBenchmark() { + wave::test::Table::dropAll(); + } + + void initialize() override { + QueryBenchmarkBase::initialize(); + if (FLAGS_wave) { + wave::registerWave(); + wave::WaveHiveDataSource::registerConnector(); + wave::test::WaveTestSplitReader::registerTestSplitReader(); + } + rootPool_ = memory::memoryManager()->addRootPool("WaveBenchmark"); + leafPool_ = rootPool_->addLeafChild("WaveBenchmark"); + } + + void makeData( + const RowTypePtr& type, + int32_t numVectors, + int32_t vectorSize, + float nullPct = 0) { + auto vectors = makeVectors(type, numVectors, vectorSize, nullPct / 100); + int32_t cnt = 0; + for (auto& vector : vectors) { + makeRange(vector, 1000000000, nullPct == 0); + auto rn = vector->childAt(type->size() - 1)->as>(); + for (auto i = 0; i < rn->size(); ++i) { + rn->set(i, cnt++); + } + } + if (FLAGS_wave) { + makeTable(FLAGS_data_path + "/test.wave", vectors); + if (FLAGS_generate) { + auto table = + wave::test::Table::getTable(FLAGS_data_path + "/test.wave"); + table->toFile(FLAGS_data_path + "/test.wave"); + } + } else { + std::string temp = FLAGS_data_path + "/data.dwrf"; + auto config = std::make_shared(); + config->set(dwrf::Config::COMPRESSION, common::CompressionKind_NONE); + config->set( + dwrf::Config::STRIPE_SIZE, + static_cast(FLAGS_rows_per_stripe * FLAGS_num_columns * 4)); + writeToFile(temp, vectors, config, vectors.front()->type()); + } + } + + std::vector makeVectors( + const RowTypePtr& rowType, + int32_t numVectors, + int32_t rowsPerVector, + float 
nullRatio = 0) { + std::vector vectors; + options_.vectorSize = rowsPerVector; + options_.nullRatio = nullRatio; + fuzzer_ = std::make_unique(options_, leafPool_.get()); + for (int32_t i = 0; i < numVectors; ++i) { + auto vector = fuzzer_->fuzzInputFlatRow(rowType); + vectors.push_back(vector); + } + return vectors; + } + + void makeRange( + RowVectorPtr row, + int64_t mod = std::numeric_limits::max(), + bool notNull = true) { + for (auto i = 0; i < row->type()->size(); ++i) { + auto child = row->childAt(i); + if (auto ints = child->as>()) { + for (auto i = 0; i < child->size(); ++i) { + if (!notNull && ints->isNullAt(i)) { + continue; + } + ints->set(i, ints->valueAt(i) % mod); + } + } + if (notNull) { + child->clearNulls(0, row->size()); + } + } + } + + wave::test::SplitVector makeTable( + const std::string& name, + std::vector& rows) { + wave::test::Table::dropTable(name); + return wave::test::Table::defineTable(name, rows)->splits(); + } + + void writeToFile( + const std::string& filePath, + const std::vector& vectors, + std::shared_ptr config, + const TypePtr& schema) { + dwrf::WriterOptions options; + options.config = config; + options.schema = schema; + auto localWriteFile = + std::make_unique(filePath, true, false); + auto sink = std::make_unique( + std::move(localWriteFile), filePath); + auto childPool = + rootPool_->addAggregateChild("HiveConnectorTestBase.Writer"); + options.memoryPool = childPool.get(); + facebook::velox::dwrf::Writer writer{std::move(sink), options}; + for (size_t i = 0; i < vectors.size(); ++i) { + writer.write(vectors[i]); + } + writer.close(); + } + + exec::test::TpchPlan getQueryPlan(int32_t query) { + switch (query) { + case 1: { + if (!type_) { + type_ = makeType(); + } + + exec::test::TpchPlan plan; + if (FLAGS_wave) { + plan.dataFiles["0"] = {FLAGS_data_path + "/test.wave"}; + } else { + plan.dataFiles["0"] = {FLAGS_data_path + "/data.dwrf"}; + } + int64_t bound = (1'000'000'000LL * FLAGS_filter_pass_pct) / 100; + std::vector scanFilters; + for (auto i = 0; i < FLAGS_num_column_filters; ++i) { + scanFilters.push_back(fmt::format("c{} < {}", i, bound)); + } + auto builder = + PlanBuilder(leafPool_.get()).tableScan(type_, scanFilters); + + for (auto i = 0; i < FLAGS_num_expr_filters; ++i) { + builder = builder.filter( + fmt::format("c{} < {}", FLAGS_num_column_filters + i, bound)); + } + + std::vector aggInputs; + if (FLAGS_num_arithmetic > 0) { + std::vector projects; + for (auto c = 0; c < type_->size(); ++c) { + std::string expr = fmt::format("c{} ", c); + for (auto i = 0; i < FLAGS_num_arithmetic; ++i) { + expr += fmt::format(" + c{}", c); + } + expr += fmt::format(" as f{}", c); + projects.push_back(std::move(expr)); + aggInputs.push_back(fmt::format("f{}", c)); + } + builder = builder.project(std::move(projects)); + } else { + for (auto i = 0; i < type_->size(); ++i) { + aggInputs.push_back(fmt::format("c{}", i)); + } + } + std::vector aggs; + for (auto i = 0; i < aggInputs.size(); ++i) { + aggs.push_back(fmt::format("sum({})", aggInputs[i])); + } + + plan.plan = builder.singleAggregation({}, aggs).planNode(); + + plan.dataFileFormat = + FLAGS_wave ? 
FileFormat::UNKNOWN : FileFormat::DWRF; + return plan; + } + default: + VELOX_FAIL("Bad query number"); + } + } + + void prepareQuery(int32_t query) { + switch (query) { + case 1: { + type_ = makeType(); + auto numVectors = + std::max(1, FLAGS_num_rows / FLAGS_rows_per_stripe); + if (FLAGS_generate) { + makeData( + type_, numVectors, FLAGS_num_rows / numVectors, FLAGS_null_pct); + } else { + loadData(); + } + break; + } + default: + VELOX_FAIL("Bad query number"); + } + } + + void loadData() { + if (FLAGS_wave) { + auto table = + wave::test::Table::getTable(FLAGS_data_path + "/test.wave", true); + table->fromFile(FLAGS_data_path + "/test.wave"); + if (FLAGS_preload) { + table->loadData(leafPool_); + } + } + } + + std::vector> listSplits( + const std::string& path, + int32_t numSplitsPerFile, + const TpchPlan& plan) override { + if (plan.dataFileFormat == FileFormat::UNKNOWN) { + auto table = wave::test::Table::getTable(path); + return table->splits(); + } + return QueryBenchmarkBase::listSplits(path, numSplitsPerFile, plan); + } + + void runMain(std::ostream& out, RunStats& runStats) override { + if (FLAGS_run_query_verbose == -1) { + folly::runBenchmarks(); + } else { + const auto queryPlan = getQueryPlan(FLAGS_run_query_verbose); + auto [cursor, actualResults] = run(queryPlan); + if (!cursor) { + LOG(ERROR) << "Query terminated with error. Exiting"; + exit(1); + } + auto task = cursor->task(); + ensureTaskCompletion(task.get()); + if (FLAGS_include_results) { + printResults(actualResults, out); + out << std::endl; + } + const auto stats = task->taskStats(); + int64_t rawInputBytes = 0; + for (auto& pipeline : stats.pipelineStats) { + auto& first = pipeline.operatorStats[0]; + if (first.operatorType == "TableScan" || first.operatorType == "Wave") { + rawInputBytes += first.rawInputBytes; + } + } + runStats.rawInputBytes = rawInputBytes; + out << fmt::format( + "Execution time: {}", + succinctMillis( + stats.executionEndTimeMs - stats.executionStartTimeMs)) + << std::endl; + out << fmt::format( + "Splits total: {}, finished: {}", + stats.numTotalSplits, + stats.numFinishedSplits) + << std::endl; + out << printPlanWithStats( + *queryPlan.plan, stats, FLAGS_include_custom_stats) + << std::endl; + } + } + + RowTypePtr makeType() { + std::vector names; + std::vector types; + for (auto i = 0; i < FLAGS_num_columns; ++i) { + names.push_back(fmt::format("c{}", i)); + types.push_back(BIGINT()); + } + return ROW(std::move(names), std::move(types)); + } + + std::shared_ptr rootPool_; + std::shared_ptr leafPool_; + RowTypePtr type_; + VectorFuzzer::Options options_; + std::unique_ptr fuzzer_; +}; + +void waveBenchmarkMain() { + auto benchmark = std::make_unique(); + benchmark->initialize(); + if (FLAGS_run_query_verbose != -1) { + benchmark->prepareQuery(FLAGS_run_query_verbose); + } + if (FLAGS_test_flags_file.empty()) { + RunStats stats; + benchmark->runOne(std::cout, stats); + std::cout << stats.toString(false); + } else { + benchmark->runAllCombinations(); + } + benchmark->shutdown(); +} + +int main(int argc, char** argv) { + std::string kUsage( + "This program benchmarks Wave. 
Run 'velox_wave_benchmark -helpon=WaveBenchmark' for available options.\n"); + gflags::SetUsageMessage(kUsage); + folly::Init init{&argc, &argv, false}; + facebook::velox::wave::printKernels(); + waveBenchmarkMain(); + return 0; +} diff --git a/velox/experimental/wave/exec/tests/utils/FileFormat.cpp b/velox/experimental/wave/exec/tests/utils/FileFormat.cpp index d6800e201cb4..cd7c2a929c90 100644 --- a/velox/experimental/wave/exec/tests/utils/FileFormat.cpp +++ b/velox/experimental/wave/exec/tests/utils/FileFormat.cpp @@ -16,6 +16,9 @@ #include "velox/experimental/wave/exec/tests/utils/FileFormat.h" #include +#include "velox/common/config/Config.h" +#include "velox/common/file/FileSystems.h" +#include "velox/vector/VectorSaver.h" namespace facebook::velox::wave::test { @@ -364,8 +367,9 @@ void Writer::finishStripe() { for (auto& encoder : encoders_) { columns.push_back(encoder->toColumn()); } + int32_t numRows = columns[0]->numValues; stripes_.push_back(std::make_unique( - std::move(columns), dwio::common::TypeWithId::create(type_))); + std::move(columns), dwio::common::TypeWithId::create(type_), numRows)); encoders_.clear(); rowsInStripe_ = 0; } @@ -377,6 +381,234 @@ Table* Writer::finalize(std::string tableName) { return table; } +template +void writeNumber(std::ostream& stream, const T& n) { + stream.write(reinterpret_cast(&n), sizeof(T)); +} + +void Column::load( + std::unique_ptr& file, + const std::string& path, + memory::MemoryPool* pool) { + VELOX_CHECK(region.length > 0 || nulls || !children.empty()); + if (nulls) { + nulls->load(file, path, pool); + } + if (alphabet) { + alphabet->load(file, path, pool); + } + if (region.length > 0) { + values = AlignedBuffer::allocate(region.length, pool); + if (!file) { + auto fileSystem = filesystems::getFileSystem(path, nullptr); + file = fileSystem->openFileForRead(path); + } + file->pread(region.offset, region.length, values->asMutable()); + } + for (auto& child : children) { + child->load(file, path, pool); + } +} + +template +void readNumber(std::istream& stream, T& n) { + stream.read(reinterpret_cast(&n), sizeof(n)); +} + +template +void appendNumber(WriteFile& file, T n) { + file.append(std::string_view(reinterpret_cast(&n), sizeof(n))); +} + +void writeColumn(Column& column, WriteFile& file, std::stringstream& footer) { + writeNumber(footer, column.encoding); + writeNumber(footer, column.kind); + if (column.nulls) { + writeColumn(*column.nulls, file, footer); + } else { + writeNumber(footer, Encoding::kNone); + } + writeNumber(footer, column.numValues); + if (Encoding::kStruct == column.encoding) { + VELOX_NYI(); + return; + } + if (Encoding::kDict == column.encoding) { + writeColumn(*column.alphabet, file, footer); + } + writeNumber(footer, column.bitWidth); + writeNumber(footer, column.baseline); + int64_t offset = file.size(); + writeNumber(footer, offset); + int64_t size = column.values->size(); + writeNumber(footer, size); + file.append(std::string_view(column.values->as(), size)); +} + +void writeColumns( + std::vector>& columns, + Column* nulls, + WriteFile& file, + std::stringstream& footer) { + writeNumber(footer, Encoding::kStruct); + writeNumber(footer, TypeKind::ROW); + if (nulls) { + writeColumn(*nulls, file, footer); + } else { + writeNumber(footer, Encoding::kNone); + } + writeNumber(footer, columns[0]->numValues); + int32_t numColumns = columns.size(); + writeNumber(footer, numColumns); + std::vector columnFooters; + int32_t columnStart = 0; + for (auto columnIdx = 0; columnIdx < columns.size(); ++columnIdx) { + 
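+      // writeColumn() appends the column's raw values to 'file' and serializes
+      // its metadata into 'columnFooter'; only the running offset of that
+      // per-column footer (relative to the concatenated footers written after
+      // this loop) is recorded in the stripe footer, so a reader could seek
+      // straight to a single column's metadata.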
std::stringstream columnFooter; + writeColumn(*columns[columnIdx], file, columnFooter); + auto footerString = columnFooter.str(); + writeNumber(footer, columnStart); + columnStart += footerString.size(); + columnFooters.push_back(std::move(footerString)); + } + for (auto columnFooter : columnFooters) { + footer.write(columnFooter.data(), columnFooter.size()); + } +} + +std::vector> readColumns(std::stringstream& in); + +std::unique_ptr readColumn(std::stringstream& in) { + Encoding encoding; + readNumber(in, encoding); + if (encoding == Encoding::kNone) { + return nullptr; + } + auto column = std::make_unique(); + column->encoding = encoding; + readNumber(in, column->kind); + column->nulls = readColumn(in); + readNumber(in, column->numValues); + if (encoding == Encoding::kStruct) { + column->children = readColumns(in); + return column; + } + if (encoding == Encoding::kDict) { + column->alphabet = readColumn(in); + } + readNumber(in, column->bitWidth); + readNumber(in, column->baseline); + readNumber(in, column->region.offset); + readNumber(in, column->region.length); + return column; +} + +std::vector> readColumns(std::stringstream& in) { + int32_t numColumns; + readNumber(in, numColumns); + std::vector starts(numColumns); + in.read(reinterpret_cast(starts.data()), numColumns * sizeof(int32_t)); + std::vector> children(numColumns); + for (auto i = 0; i < numColumns; ++i) { + children[i] = readColumn(in); + } + return children; +} + +void Table::toFile(const std::string& path) { + std::vector stripeFooters; + std::vector stripeStart; + auto type = stripes_.front()->typeWithId->type(); + auto fileSystem = filesystems::getFileSystem(path, nullptr); + try { + fileSystem->remove(path); + } catch (const std::exception& e) { + } + auto file = fileSystem->openFileForWrite(path); + std::vector footers; + for (auto stripeIdx = 0; stripeIdx < stripes_.size(); ++stripeIdx) { + auto stripe = stripes_[stripeIdx].get(); + stripeStart.push_back(file->size()); + std::stringstream footer; + writeColumns(stripe->columns, nullptr, *file, footer); + footers.push_back(footer.str()); + } + int64_t footerStart = file->size(); + std::vector footerStarts; + for (auto& footer : footers) { + footerStarts.push_back(file->size()); + file->append(std::string_view(footer.data(), footer.size())); + } + int64_t typeStart = file->size(); + std::stringstream typeStream; + saveType(type, typeStream); + auto typeString = typeStream.str(); + file->append(std::string_view(typeString.data(), typeString.size())); + int64_t offsetStart = file->size(); + for (auto i = 0; i < stripeStart.size(); ++i) { + appendNumber(*file, stripeStart[i]); + appendNumber(*file, footerStarts[i]); + } + int32_t numStripes = stripes_.size(); + appendNumber(*file, numStripes); + appendNumber(*file, offsetStart); + appendNumber(*file, typeStart); + appendNumber(*file, footerStart); + file->close(); +} + +void Table::fromFile( + const std::string& path, + int64_t splitStart, + int64_t splitSize) { + auto fileSystem = filesystems::getFileSystem(path, nullptr); + auto file = fileSystem->openFileForRead(path); + int64_t size = file->size(); + std::string tail; + tail.resize(std::min(size, 100000)); + file->pread(size - tail.size(), tail.size(), tail.data()); + char* end = tail.data() + tail.size(); + auto numStripes = *reinterpret_cast(end - 28); + auto offsetStart = size - *reinterpret_cast(end - 24); + auto typeStart = size - *reinterpret_cast(end - 16); + auto footerStart = size - *reinterpret_cast(end - 8); + int64_t tailSize = footerStart; + if 
(tailSize > tail.size()) { + std::string moreTail; + moreTail.resize(tailSize - tail.size()); + file->pread(size - footerStart, moreTail.size(), moreTail.data()); + moreTail += tail; + tail = std::move(moreTail); + } + // Read the type. + std::stringstream typeStream(tail); + typeStream.seekg(tail.size() - typeStart); + auto type = restoreType(typeStream); + auto typeWithIdUnique = dwio::common::TypeWithId::create( + std::static_pointer_cast(type)); + std::shared_ptr typeWithId( + typeWithIdUnique.release()); + // Loop over offsets and make stripes for the ones in range. + auto offset = tail.size() - offsetStart; + std::vector> stripes; + for (auto i = 0; i < numStripes; ++i) { + auto dataStart = *reinterpret_cast(tail.data() + offset + i * 16); + if (dataStart >= splitStart && dataStart < splitStart + splitSize) { + auto footerStart = + size - *reinterpret_cast(tail.data() + offset + 8 + i * 16); + auto footerOff = tail.size() - footerStart; + std::stringstream footerStream(tail); + // skip 3 first,i.e. kStruct, TypeKind::ROW, kNone. + footerStream.seekg(footerOff + 3); + int32_t numRows; + readNumber(footerStream, numRows); + auto columns = readColumns(footerStream); + stripes.push_back(std::make_unique( + std::move(columns), typeWithId, numRows, path)); + } + } + addStripes(std::move(stripes), nullptr); +} + void Table::addStripes( std::vector>&& stripes, std::shared_ptr pool) { @@ -386,6 +618,19 @@ void Table::addStripes( allStripes_[s->name] = s.get(); stripes_.push_back(std::move(s)); } + if (pool) { + pools_.push_back(pool); + } +} + +void Table::loadData(std::shared_ptr pool) { + std::unique_ptr file; + VELOX_CHECK(pools_.empty()); + for (auto& stripe : stripes_) { + for (auto& column : stripe->columns) { + column->load(file, stripe->path, pool.get()); + } + } pools_.push_back(pool); } diff --git a/velox/experimental/wave/exec/tests/utils/FileFormat.h b/velox/experimental/wave/exec/tests/utils/FileFormat.h index 2a0140e709a4..58e837d6beec 100644 --- a/velox/experimental/wave/exec/tests/utils/FileFormat.h +++ b/velox/experimental/wave/exec/tests/utils/FileFormat.h @@ -15,6 +15,7 @@ */ #pragma once +#include "velox/common/file/Region.h" #include "velox/connectors/Connector.h" #include "velox/connectors/hive/HiveConnectorSplit.h" #include "velox/dwio/common/TypeWithId.h" @@ -26,12 +27,17 @@ namespace facebook::velox::wave::test { class Table; -enum Encoding { kFlat, kDict }; +enum Encoding : uint8_t { kFlat, kDict, kStruct, kNone }; struct Column { - TypeKind kind; + void load( + std::unique_ptr& file, + const std::string& path, + memory::MemoryPool* pool); + Encoding encoding; - // Number of encoded values. + TypeKind kind; + // Number of encoded values including nulls. int32_t numValues{0}; // Distinct values in kDict. @@ -52,23 +58,48 @@ struct Column { /// Encoded column with 'numValues' null bits, nullptr if no nulls. If set, /// 'values' has an entry for each non-null. std::unique_ptr nulls; + + /// Location of raw data in the backing file, or 0,0 if no backing file. 
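+  /// Populated from the stripe footer when a Table is restored with
+  /// Table::fromFile(); Column::load() preads this range from the backing
+  /// file into 'values'.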
+ common::Region region; + + std::vector> children; }; struct Stripe { Stripe( std::vector>&& in, - const std::shared_ptr& type) - : typeWithId(type), columns(std::move(in)) {} + const std::shared_ptr& type, + int32_t numRows, + std::string path = "") + : typeWithId(type), + numRows(numRows), + columns(std::move(in)), + path(std::move(path)) {} const Column* findColumn(const dwio::common::TypeWithId& child) const; + bool isLoaded() const { + for (auto i = 0; i < columns.size(); ++i) { + if (!columns[i]->values) { + return false; + } + } + return true; + } + // Unique name assigned when associating with a Table. std::string name; std::shared_ptr typeWithId; + /// Number of rows. Needed for counting if no columns. + int32_t numRows{0}; + // Top level columns. std::vector> columns; + + /// Path of file referenced by Region in Columns. + std::string path; }; class StringSet { @@ -212,6 +243,19 @@ class Table { return it->second.get(); } + void toFile(const std::string& path); + + /// Initializes from 'path' for stripes whose start falls between 'start' and + /// 'start + size'. + void fromFile( + const std::string& path, + int64_t start = 0, + int64_t size = std::numeric_limits::max()); + + /// Reads the encoded data for all columns in column->buffer, allocating from + /// 'pool'. + void loadData(std::shared_ptr pool); + static void dropTable(const std::string& name); static Stripe* getStripe(const std::string& path) { diff --git a/velox/experimental/wave/exec/tests/utils/TestFormatReader.cpp b/velox/experimental/wave/exec/tests/utils/TestFormatReader.cpp index c8b9e80c4d1e..8dc591b973a1 100644 --- a/velox/experimental/wave/exec/tests/utils/TestFormatReader.cpp +++ b/velox/experimental/wave/exec/tests/utils/TestFormatReader.cpp @@ -35,18 +35,23 @@ std::unique_ptr TestFormatParams::toFormatData( int TestFormatData::stageNulls( ResultStaging& deviceStaging, SplitStaging& splitStaging) { + if (!column_->nulls) { + nullsStaged_ = true; + return kNotRegistered; + } + if (nullsStaged_) { + splitStaging.addDependency(nullsStagingId_); return kNotRegistered; } nullsStaged_ = true; auto* nulls = column_->nulls.get(); - if (!nulls) { - return kNotRegistered; - } Staging staging( nulls->values->as(), - bits::nwords(column_->numValues) * sizeof(uint64_t)); + bits::nwords(column_->numValues) * sizeof(uint64_t), + column_->region); auto id = splitStaging.add(staging); + nullsStagingId_ = splitStaging.id(); splitStaging.registerPointer(id, &grid_.nulls, true); return id; } @@ -90,8 +95,12 @@ void TestFormatData::startOp( stageNulls(deviceStaging, splitStaging); if (!staged_) { staged_ = true; - Staging staging(column_->values->as(), column_->values->size()); + Staging staging( + column_->values->as(), column_->values->size(), column_->region); id = splitStaging.add(staging); + lastStagingId_ = splitStaging.id(); + } else { + splitStaging.addDependency(lastStagingId_); } auto rowsPerBlock = FLAGS_wave_reader_rows_per_tb; int32_t numBlocks = diff --git a/velox/experimental/wave/exec/tests/utils/TestFormatReader.h b/velox/experimental/wave/exec/tests/utils/TestFormatReader.h index 409d5283dd18..55332c02b4c0 100644 --- a/velox/experimental/wave/exec/tests/utils/TestFormatReader.h +++ b/velox/experimental/wave/exec/tests/utils/TestFormatReader.h @@ -16,6 +16,7 @@ #pragma once +#include "velox/connectors/hive/FileHandle.h" #include "velox/experimental/wave/dwio/ColumnReader.h" #include "velox/experimental/wave/exec/tests/utils/FileFormat.h" #include "velox/type/Subfield.h" @@ -69,6 +70,7 @@ class TestFormatData : 
public wave::FormatData { int32_t stageNulls(ResultStaging& deviceStaging, SplitStaging& splitStaging); const OperandId operand_; + int32_t totalRows_{0}; const test::Column* column_; diff --git a/velox/experimental/wave/exec/tests/utils/WaveTestSplitReader.cpp b/velox/experimental/wave/exec/tests/utils/WaveTestSplitReader.cpp index 15c1139855b9..bdbaab479d75 100644 --- a/velox/experimental/wave/exec/tests/utils/WaveTestSplitReader.cpp +++ b/velox/experimental/wave/exec/tests/utils/WaveTestSplitReader.cpp @@ -27,10 +27,29 @@ WaveTestSplitReader::WaveTestSplitReader( const std::shared_ptr& split, const SplitReaderParams& params, const DefinesMap* defines) { + params_ = params; auto hiveSplit = dynamic_cast(split.get()); VELOX_CHECK_NOT_NULL(hiveSplit); + stripe_ = test::Table::getStripe(hiveSplit->filePath); + if (!stripe_->isLoaded()) { + try { + fileHandle_ = params_.fileHandleFactory->generate( + stripe_->path, + hiveSplit->properties.has_value() ? &*hiveSplit->properties + : nullptr); + VELOX_CHECK_NOT_NULL(fileHandle_.get()); + } catch (const VeloxRuntimeError& e) { + if (e.errorCode() == error_code::kFileNotFound && + params_.hiveConfig->ignoreMissingFiles( + params_.connectorQueryCtx->sessionProperties())) { + emptySplit_ = true; + return; + } + throw; + } + } VELOX_CHECK_NOT_NULL(stripe_); TestFormatParams formatParams( *params.connectorQueryCtx->memoryPool(), readerStats_, stripe_); @@ -43,6 +62,11 @@ WaveTestSplitReader::WaveTestSplitReader( empty, *defines, true); + if (fileHandle_.get()) { + fileInfo_.file = fileHandle_->file.get(); + fileInfo_.fileId = &fileHandle_->uuid; + fileInfo_.cache = params_.connectorQueryCtx->cache(); + } } int32_t WaveTestSplitReader::canAdvance(WaveStream& stream) { @@ -68,7 +92,10 @@ void WaveTestSplitReader::schedule(WaveStream& waveStream, int32_t maxRows) { } if (!exe) { exe = std::make_unique( - reinterpret_cast(columnReader_.get()), waveStream); + reinterpret_cast(columnReader_.get()), + waveStream, + params_.ioStats.get(), + fileInfo_); } ReadStream::launch(std::move(exe), nextRow_, rowSet); nextRow_ += scheduledRows_; diff --git a/velox/experimental/wave/exec/tests/utils/WaveTestSplitReader.h b/velox/experimental/wave/exec/tests/utils/WaveTestSplitReader.h index 2881f6de03f1..320c8a08784e 100644 --- a/velox/experimental/wave/exec/tests/utils/WaveTestSplitReader.h +++ b/velox/experimental/wave/exec/tests/utils/WaveTestSplitReader.h @@ -31,7 +31,7 @@ class WaveTestSplitReader : public WaveSplitReader { const DefinesMap* defines); bool emptySplit() override { - return !stripe_ || stripe_->columns[0]->numValues == 0; + return !stripe_ || stripe_->columns[0]->numValues == 0 || emptySplit_; } int32_t canAdvance(WaveStream& stream) override; @@ -43,11 +43,11 @@ class WaveTestSplitReader : public WaveSplitReader { bool isFinished() const override; uint64_t getCompletedBytes() override { - return 0; + return params_.ioStats->rawBytesRead(); } uint64_t getCompletedRows() override { - return 0; + return nextRow_; } std::unordered_map runtimeStats() override { @@ -63,6 +63,8 @@ class WaveTestSplitReader : public WaveSplitReader { std::shared_ptr split_; SplitReaderParams params_; + FileHandleCachedPtr fileHandleCachePtr; + cache::AsyncDataCache* cache_{nullptr}; test::Stripe* stripe_{nullptr}; std::unique_ptr columnReader_; // First unscheduled row. 
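
The members added in the hunk below (fileHandle_, fileInfo_, emptySplit_) back the constructor and schedule() changes above: when a stripe's columns are not already loaded in memory, the split reader obtains a cached handle to the backing .wave file and hands it, along with the connector IO stats, to the GPU read path. A condensed, comment-annotated sketch of that wiring follows; the ReadStream and StructColumnReader template/cast arguments are assumptions, since the hunks above do not spell them out.

// Sketch of the FileInfo and IO-stats wiring in WaveTestSplitReader; comments
// are editorial and the cast target is assumed.
if (fileHandle_.get()) {
  // Expose the backing file so column regions can be read (and cached) on
  // demand instead of requiring Table::loadData() up front.
  fileInfo_.file = fileHandle_->file.get();
  fileInfo_.fileId = &fileHandle_->uuid;                  // Key for the data cache.
  fileInfo_.cache = params_.connectorQueryCtx->cache();
}

// Later, in schedule(): passing ioStats and fileInfo to the read executable is
// what lets bytes read by the GPU path show up in getCompletedBytes().
exe = std::make_unique<ReadStream>(
    reinterpret_cast<StructColumnReader*>(columnReader_.get()),
    waveStream,
    params_.ioStats.get(),
    fileInfo_);
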
@@ -70,6 +72,9 @@ class WaveTestSplitReader : public WaveSplitReader {
   int32_t scheduledRows_{0};
   dwio::common::ColumnReaderStatistics readerStats_;
   raw_vector<int32_t> rows_;
+  FileHandleCachedPtr fileHandle_;
+  FileInfo fileInfo_;
+  bool emptySplit_{false};
 };
 
 } // namespace facebook::velox::wave::test
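
Taken together, the FileFormat additions give the Wave mock format a simple persistence path: a Table built from RowVectors is written to a single file by Table::toFile() and restored later by Table::fromFile(), with encoded column data either preloaded through loadData() or read from the backing file as splits are scheduled. Below is a minimal usage sketch following the calls WaveBenchmark::makeData() and loadData() make above; the function names, path handling, and the meaning of the boolean passed to getTable() are illustrative assumptions, not part of the patch.

#include "velox/experimental/wave/exec/tests/utils/FileFormat.h"

#include <memory>
#include <string>
#include <vector>

using namespace facebook::velox;

// Persist a Wave test table, mirroring WaveBenchmark::makeData() above.
void writeWaveTable(const std::string& path, std::vector<RowVectorPtr>& rows) {
  wave::test::Table::dropTable(path);
  wave::test::Table::defineTable(path, rows);       // Builds encoded stripes in memory.
  wave::test::Table::getTable(path)->toFile(path);  // Appends data, footers and trailer.
}

// Restore a Wave test table, mirroring WaveBenchmark::loadData() above.
wave::test::SplitVector readWaveTable(
    const std::string& path,
    std::shared_ptr<memory::MemoryPool> leafPool,
    bool preload) {
  // 'true' is the same flag WaveBenchmark::loadData() passes; it is assumed to
  // create the table entry when it is not registered yet.
  auto table = wave::test::Table::getTable(path, true);
  table->fromFile(path);  // Default start/size cover all stripes.
  if (preload) {
    // Optional: pull encoded columns into memory now; otherwise column regions
    // are pread from the file when splits are scheduled.
    table->loadData(leafPool);
  }
  return table->splits();
}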